Assignment 02: GPU Architecture and cuTile

The file assignments/02_assignment/src/__main__.py contains the main function that runs all the tasks for this assignment. Each task is implemented in a separate file in the same directory. The results of each task are printed to the console when the main function is executed.
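
A minimal sketch of what this dispatcher could look like (hypothetical: the module names task1 and task2 are assumed here, by analogy with the task3 and task4 imports used in the benchmark scripts below):

# Hypothetical sketch of src/__main__.py
import task1, task2, task3, task4

def main():
    for name, task in [("Task 1", task1), ("Task 2", task2),
                       ("Task 3", task3), ("Task 4", task4)]:
        print(f"--- {name} ---")
        task.main()

if __name__ == "__main__":
    main()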

Task 1: GPU Device Properties

import cupy as cp

def main():
    print("CUDA Device Attributes:" )
    for key, value in cp.cuda.Device().attributes.items():
        if key in ["L2CacheSize", "MaxSharedMemoryPerMultiprocessor", "ClockRate"]:
            print(f"\t{key}: {value}")

if __name__ == "__main__":
    main()

Output:

CUDA Device Attributes:
        ClockRate: 2418000
        L2CacheSize: 25165824
        MaxSharedMemoryPerMultiprocessor: 102400
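
For readability, the raw values can be converted into conventional units (per the CUDA device-attribute conventions, ClockRate is reported in kHz and both sizes in bytes):

# Convert the raw attribute values above into conventional units
print(f"Clock rate: {2418000 / 1e6:.3f} GHz")              # 2.418 GHz
print(f"L2 cache: {25165824 / 2**20:.0f} MiB")             # 24 MiB
print(f"Shared memory per SM: {102400 / 2**10:.0f} KiB")   # 100 KiB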

Task 2: Matrix Reduction Kernel

a) cuTile kernel that reduces a 2D input matrix of arbitrary shape (M, K) along its last dimension (K)

import torch
import cuda.tile as ct
import math

def main():
    M = 65
    K = 33
    
    # Calculate the next power of two for K
    TILE_K = int(2**math.ceil(math.log2(K))) 
    
    A = torch.randn(M, K, device='cuda')
    A_reduced = torch.empty(M, device='cuda')

    grid = (M,)

    # Launch kernel, passing TILE_K as the constant
    ct.launch(torch.cuda.current_stream().cuda_stream,
        grid, 
        matrix_reduce,
        (A, A_reduced, TILE_K))

    expected = torch.sum(A, dim=1)
    assert torch.allclose(A_reduced, expected), "Task 2 failed: A_reduced does not match expected result!"

@ct.kernel
def matrix_reduce(A, A_reduced, TILE_K: ct.Constant[int]):
    pid = ct.bid(0)

    # Use the power-of-two TILE_K for the shape. 
    # The padding mode handles the difference between K (33) and TILE_K (64).
    A_row = ct.load(
            A, 
            index=(pid, 0), 
            shape=(1, TILE_K), 
            padding_mode=ct.PaddingMode.ZERO
        )
    
    result = ct.sum(A_row, axis=1)

    ct.store(A_reduced, index=(pid, ), tile=result)

if __name__ == "__main__":
    main()
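
A side note on the rounding: int(2**math.ceil(math.log2(K))) relies on floating-point log2, which can be fragile for very large K. An exact integer alternative (a minimal sketch) uses bit_length:

def next_pow2(k: int) -> int:
    # Smallest power of two >= k, computed without floating point
    return 1 << max(0, k - 1).bit_length()

assert next_pow2(33) == 64 and next_pow2(64) == 64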

b) As M increases, parallelism increases because more Streaming Multiprocessors (SMs) become active; the theoretical performance maximum is reached once all SMs are busy. The workload per kernel program grows with K, because every program must load more data from memory and perform more operations. This growth is non-linear due to the requirement that tile sizes be powers of two: any K that is not a power of two is zero-padded up to the next power of two, which introduces computational overhead, as the sketch below quantifies.
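
A small sketch of the padding overhead as a function of K, using the same rounding rule as the kernel above:

import math

for K in [33, 48, 64, 65, 100, 128]:
    TILE_K = int(2**math.ceil(math.log2(K)))
    print(f"K={K:4d} -> TILE_K={TILE_K:4d}, padded lanes: {(TILE_K - K) / TILE_K:.1%}")
# K=65 is the worst case in this range: it pads to 128, so almost half of every
# tile consists of zeros; K=64 and K=128 incur no padding at all.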

Task 3: 4D Tensor Elementwise Addition

a) cuTile kernel that adds two 4D tensors A and B element-wise and stores the result in C. All tensors have identical shape and dimensions (M, N, K, L).

import torch
import cuda.tile as ct

def main():
    M, N, K, L = 16, 128, 16, 128
    size = (M, N, K, L)
    A = torch.randn(size, device='cuda')
    B = torch.randn(size, device='cuda')
    C_kl = torch.empty(size, device='cuda')
    C_mn = torch.empty(size, device='cuda')
    grid_kl = (M, N)
    grid_mn = (K, L)
    ct.launch(torch.cuda.current_stream().cuda_stream,
        grid_kl,
        tensor_add_KL,
        (A, B, C_kl, K, L))
    ct.launch(torch.cuda.current_stream().cuda_stream,
        grid_mn,
        tensor_add_MN,
        (A, B, C_mn, M, N))
    
    expected = A + B
    assert torch.allclose(C_kl, expected), "Task 3 KL failed: C_kl does not match expected result!"
    assert torch.allclose(C_mn, expected), "Task 3 MN failed: C_mn does not match expected result!"

def tensor_add(A, B, C, index, shape):
    A_block = ct.load(
            A, 
            index=index,
            shape=shape,
        )
    B_block = ct.load(
            B, 
            index=index,
            shape=shape,
        )

    result = A_block + B_block

    ct.store(C, index=index, tile=result)

@ct.kernel
def tensor_add_KL(A, B, C, K: ct.Constant[int], L: ct.Constant[int]):
    bid_m = ct.bid(0)
    bid_n = ct.bid(1)
    # Each block processes the full (K, L) sub-tensor at a specific (m, n)
    index = (bid_m, bid_n, 0, 0)
    shape = (1, 1, K, L)
    tensor_add(A, B, C, index, shape)

@ct.kernel
def tensor_add_MN(A, B, C, M: ct.Constant[int], N: ct.Constant[int]):
    bid_k = ct.bid(0)
    bid_l = ct.bid(1)
    
    # Each block processes the full (M, N) sub-tensor at a specific (k, l)
    index = (0, 0, bid_k, bid_l)
    shape = (M, N, 1, 1)
    
    tensor_add(A, B, C, index, shape)    

if __name__ == "__main__":
    main()

b) Benchmark

import triton
import torch
import cuda.tile as ct
from task3 import tensor_add_KL, tensor_add_MN

M, N, K, L = 16, 128, 16, 128
size = (M, N, K, L)

def main():
    global A, B, C
    A = torch.randn(size, device='cuda')
    B = torch.randn(size, device='cuda')
    C = torch.empty(size, device='cuda')

    t = triton.testing.do_bench(tensor_add_KL_benchmark)
    print(f"tensor_add_KL benchmark: {t:.2f} ms")
    C = torch.empty(size, device='cuda')
    t = triton.testing.do_bench(tensor_add_MN_benchmark)
    print(f"tensor_add_MN benchmark: {t:.2f} ms")
    
def tensor_add_KL_benchmark():
    grid_kl = (M, N)
    ct.launch(torch.cuda.current_stream().cuda_stream,
        grid_kl,
        tensor_add_KL,
        (A, B, C, K, L))

def tensor_add_MN_benchmark():
    grid_mn = (K, L)
    ct.launch(torch.cuda.current_stream().cuda_stream,
        grid_mn,
        tensor_add_MN,
        (A, B, C, M, N))


if __name__ == "__main__":
    main()

Output:

tensor_add_KL benchmark: 0.39 ms
tensor_add_MN benchmark: 0.67 ms

The tensor_add_KL kernel is faster than the tensor_add_MN kernel. Both variants launch 16 * 128 = 2048 programs that each process 2048 elements; the difference is the access pattern. Each tensor_add_KL program reads a (K, L) slice at a fixed (m, n), which is contiguous in memory for a row-major tensor, whereas each tensor_add_MN program gathers an (M, N) slice at a fixed (k, l) whose elements are widely strided.
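
The tensor strides make this concrete: for a contiguous tensor of shape (16, 128, 16, 128), PyTorch reports element strides (262144, 2048, 128, 1), so a (K, L) slice covers one contiguous range of 16 * 128 elements, while an (M, N) slice touches elements that are 2048 and 262144 positions apart:

import torch

A = torch.randn(16, 128, 16, 128, device='cuda')
print(A.stride())                     # (262144, 2048, 128, 1)
print(A[0, 0].is_contiguous())        # True: one contiguous block
print(A[:, :, 0, 0].is_contiguous())  # False: a widely strided gather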


Task 4: Benchmarking Bandwidth

a) cuTile kernel that copies a 2D matrix of shape (M,N)

import torch
import cuda.tile as ct

# Keyword-only parameters: *args swallows any stray positional arguments
def main(*args, M = 2048, N = 16, tile_M = 64, tile_N = None, dtype=torch.float32):
    if tile_N is None:
        tile_N = N

    A = torch.randn((M, N), device='cuda', dtype=dtype)
    B = torch.empty_like(A, device='cuda', dtype=dtype)
    
    assert M % tile_M == 0, "M must be divisible by tile_M"
    assert N % tile_N == 0, "N must be divisible by tile_N"
    grid = (M // tile_M, N // tile_N)

    ct.launch(torch.cuda.current_stream().cuda_stream,
        grid, 
        matrix_copy,
        (A, B, tile_M, tile_N))
    
    assert torch.allclose(B, A), "Task 4 failed: B does not match A!"

@ct.kernel
def matrix_copy(A, B, tile_M: ct.Constant[int], tile_N: ct.Constant[int]):
    bid_m = ct.bid(0)
    bid_n = ct.bid(1)
    index = (bid_m, bid_n)

    A_block = ct.load(
            A, 
            index=index,
            shape=(tile_M, tile_N),
        )
    
    ct.store(B, index=index, tile=A_block)

if __name__ == "__main__":
    main()
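
For example (hypothetical parameters), the copy runs with a non-default tiling as long as both divisibility asserts hold:

main(M=2048, N=256, tile_M=128, tile_N=64, dtype=torch.float16)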

b) Benchmarking

import triton
import matplotlib.pyplot as plt
from pathlib import Path
import torch

from task4 import main as main_task4
def main():
    N = [16, 32, 64, 128, 256, 512, 1024, 2048]
    M = 2048
    dtype = torch.float32
    bandwidths = []
    for n in N:
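        # Note: main_task4 is timed end-to-end, so the measurement also includes
        # tensor allocation and the torch.allclose verification, not just the copy.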
        t = triton.testing.do_bench(lambda: main_task4(M=M, N=n, tile_M=64, tile_N=n, dtype=dtype))
        print(f"matrix_copy benchmark for N={n}: {t:.2f} ms")
        element_size = torch.tensor([], dtype=dtype).element_size()
        bandwidth = 2 * M * n * element_size / (t * 1e6)  # GB/s
        bandwidths.append(bandwidth)

    # plot results
    plt.plot(N, bandwidths, marker='o')
    plt.xlabel('N (number of columns)')
    plt.ylabel('Bandwidth (GB/s)')
    plt.title('matrix_copy Benchmark: Bandwidth vs N')
    plt.xscale('log', base=2)
    plt.xticks(N)
    plt.grid(True)

    file_dir = Path(__file__).parent
    plt.savefig(file_dir / 'task4_benchmark.png')

if __name__ == "__main__":
    main()

Output:

matrix_copy benchmark for N=16: 0.08 ms
matrix_copy benchmark for N=32: 0.09 ms
matrix_copy benchmark for N=64: 0.10 ms
matrix_copy benchmark for N=128: 0.12 ms
matrix_copy benchmark for N=256: 0.16 ms
matrix_copy benchmark for N=512: 0.28 ms
matrix_copy benchmark for N=1024: 0.62 ms
matrix_copy benchmark for N=2048: 1.32 ms

Figure: task4_benchmark.png (matrix_copy bandwidth vs. N)

Because the bandwidth was still increasing at N=128, we included a few more points up to N=2048 to show the trend more clearly. Bandwidth grows with N, as expected: larger matrices allow better utilization of the GPU's memory bandwidth. However, it peaks at around N=512 and then starts to decrease, which is probably the point at which we run into the hardware limits of the GPU.
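
As a cross-check, plugging two of the measured times back into the bandwidth formula from the script (read plus write, float32):

for n, t_ms in [(512, 0.28), (2048, 1.32)]:
    print(f"N={n}: {2 * 2048 * n * 4 / (t_ms * 1e6):.1f} GB/s")
# N=512:  30.0 GB/s (the peak)
# N=2048: 25.4 GB/s

Note that main_task4 is timed end-to-end, including tensor allocation and verification, so these figures understate the raw copy bandwidth of the kernel alone.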