feat(week1): add docs and task for week 1.

Author: Yi Pan
Date: 2024-07-09 11:45:35 +08:00
Parent: f0a44579c4
Commit: 66430108a4
5 changed files with 256 additions and 87 deletions

Makefile (new file, 32 lines)

@@ -0,0 +1,32 @@
CC = nvcc
CFLAGS = -I/usr/local/lib/python3.10/dist-packages/torch/include \
         -I/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include \
         -lcublas

HELLO = build/hello
HELLO_SRC = csrc/hello_world.cu
BASIC = build/basic
BASIC_SRC = csrc/basic.cu
GEMM = build/gemm
GEMM_SRC = csrc/gemm.cu

all: $(HELLO) $(BASIC) $(GEMM)

$(HELLO): $(HELLO_SRC)
	mkdir -p build
	$(CC) $(CFLAGS) $^ -o $@

$(BASIC): $(BASIC_SRC)
	mkdir -p build
	$(CC) $(CFLAGS) $^ -o $@

$(GEMM): $(GEMM_SRC)
	mkdir -p build
	$(CC) $(CFLAGS) $^ -o $@

clean:
	rm -rf build

.PHONY: all clean
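
For completeness, the intended workflow is plain make usage against the targets above:

```bash
make        # builds every binary listed in `all` into build/
make clean  # removes the build directory
```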

README.md (new file, 42 lines)

@@ -0,0 +1,42 @@
# CUDA Playground
This is the CUDA playground for the PPCA AI project. In this optional part, you will learn how to use CUDA to accelerate your code.
## Setup
We highly recommend using the Docker image provided by NVIDIA. You can use the following command to attach to the Docker container on the lab server.
```bash
ssh root@IP -p 2233
```
The IP address and password are the same as the ones you use to connect to the server. Note that all students share the same server, so please be considerate and do not break the environment.
After you successfully connect to the server, you can clone the repository and run the example code.
```bash
git clone https://github.com/Conless/cuda-playground
cd cuda-playground
make
./build/hello
```
If you see the following output, then you have successfully set up the environment.
```
Hello from thread 0
Hello from thread 1
Hello from thread 2
Hello from thread 3
Hello from thread 4
Hello from thread 5
Hello from thread 6
Hello from thread 7
```
## Task
### Week 1
Learn the basic concepts of CUDA and implement the following files; a tip on building each target follows the list.
- `csrc/basic.cu`
- `csrc/gemm.cu`
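
Each task has its own Makefile target, so you can build and run one file at a time; this is just the standard per-target invocation of the rules in the Makefile:

```bash
make build/basic && ./build/basic   # vector add task
make build/gemm && ./build/gemm     # matrix multiplication task
```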

csrc/basic.cu (modified)

@@ -7,116 +7,68 @@
 // You may increase this value to test larger matrices
 // But it will be slow on CPU
-constexpr int MAXN = 2048;
+constexpr int MAXN = 1 << 28;
-/**
- * @brief A naive implementation of matrix multiplication on CPU.
- * Perform C = A * B, where A is M x K, B is K x N, and C is M x N.
- */
-void naiveSgemm(float *a, float *b, float *c, const int M, const int N,
-                const int K) {
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      float sum = 0.0;
-      for (int k = 0; k < K; ++k) {
-        sum += a[m * K + k] * b[k * N + n];
-      }
-      c[m * N + n] = sum;
-    }
+void vectorAddCPU(float *a, float *b, float *c, const int N) {
+  for (int i = 0; i < N; ++i) {
+    c[i] = a[i] + b[i];
   }
 }
-/**
- * @brief A naive implementation of matrix multiplication on GPU.
- * Perform C = A * B, where A is M x K, B is K x N, and C is M x N.
- */
-__global__ void naiveSgemm2D(float *a, float *b, float *c, const int M,
-                             const int N, const int K) {
-  int m = blockIdx.x * blockDim.x + threadIdx.x; // Row index
-  int n = blockIdx.y * blockDim.y + threadIdx.y; // Column index
-  if (m < M && n < N) {
-    float sum = 0.0;
-    for (int k = 0; k < K; ++k) {
-      sum += a[m * K + k] * b[k * N + n];
-    }
-    c[m * N + n] = sum;
-  }
-}
-/**
- * @brief Launch naiveSgemm2D kernel.
- */
-void launchSgemm2D(float *a, float *b, float *c, const int M, const int N,
-                   const int K) {
-  dim3 block(16, 16); // 256 threads per block (16 * 16 = 256)
-  dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y);
-  naiveSgemm2D<<<grid, block>>>(a, b, c, M, N, K);
-}
-void initialize(float *a, float *b, float *c, const int M, const int N,
-                const int K) {
+void initialize(float *a, float *b, const int N) {
   auto gen = std::mt19937(2024);
   auto dis = std::uniform_real_distribution<float>(-1.0, 1.0);
-  for (int i = 0; i < M * K; ++i) {
+  for (int i = 0; i < N; ++i) {
     a[i] = dis(gen);
   }
-  for (int i = 0; i < K * N; ++i) {
+  for (int i = 0; i < N; ++i) {
     b[i] = dis(gen);
   }
-  for (int i = 0; i < M * N; ++i) {
-    c[i] = 0.0;
-  }
 }
-/**
- * @brief Launch sgemm using cuBLAS
- */
-void launchCublasSgemm(float *a, float *b, float *c, const int M, const int N,
-                       const int K) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  float alpha = 1.0;
-  float beta = 0.0;
-  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, b, N, a, K,
-              &beta, c, N);
+bool compare(float *a, float *b, const int N) {
+  for (int i = 0; i < N; ++i) {
+    if (std::abs(a[i] - b[i]) > 1e-3) {
+      printf("Mismatch at index %d: %f vs %f\n", i, a[i], b[i]);
+      return false;
+    }
+  }
+  printf("Results match\n");
+  return true;
 }
+__global__ void vectorAddGPU(float *a, float *b, float *c, const int N) {
+  // Implement your vector add kernel here
+}
 int main() {
   float *a, *b, *c;
-  a = new float[MAXN * MAXN];
-  b = new float[MAXN * MAXN];
-  c = new float[MAXN * MAXN];
-  initialize(a, b, c, MAXN, MAXN, MAXN);
+  a = new float[MAXN];
+  b = new float[MAXN];
+  c = new float[MAXN];
+  initialize(a, b, MAXN);
-  // ********** CPU **********
+  // CPU computation
   auto start = std::chrono::high_resolution_clock::now();
-  naiveSgemm(a, b, c, MAXN, MAXN, MAXN);
+  vectorAddCPU(a, b, c, MAXN);
   auto end = std::chrono::high_resolution_clock::now();
   std::chrono::duration<double> elapsed = end - start;
   printf("CPU time: %.3fs\n", elapsed.count());
-  float *d_a, *d_b, *d_c;
-  cudaMalloc(&d_a, MAXN * MAXN * sizeof(float));
-  cudaMalloc(&d_b, MAXN * MAXN * sizeof(float));
-  cudaMalloc(&d_c, MAXN * MAXN * sizeof(float));
-  cudaMemcpy(d_a, a, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_b, b, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_c, c, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
+  // ************** START GPU MEMORY ALLOCATION **************
+  // Implement your code here
-  // ********** GPU **********
+  // ************** START GPU COMPUTATION **************
   start = std::chrono::high_resolution_clock::now();
-  launchSgemm2D(d_a, d_b, d_c, MAXN, MAXN, MAXN);
-  cudaDeviceSynchronize();
+  // Implement your code here
   end = std::chrono::high_resolution_clock::now();
-  elapsed = end - start;
-  printf("GPU time: %.3fs\n", elapsed.count());
-  // ********** cuBLAS **********
-  start = std::chrono::high_resolution_clock::now();
-  launchCublasSgemm(d_a, d_b, d_c, MAXN, MAXN, MAXN);
-  cudaDeviceSynchronize();
-  end = std::chrono::high_resolution_clock::now();
-  elapsed = end - start;
-  printf("cuBLAS time: %.3fs\n", elapsed.count());
+  float *result = new float[MAXN];
+  // Copy the result from GPU to CPU
+  if (compare(c, result, MAXN)) {
+    std::chrono::duration<double> new_elapsed = end - start;
+    printf("GPU time: %.3fs\n", new_elapsed.count());
+    printf("Speedup: %.2fx\n", elapsed.count() / new_elapsed.count());
+  }
 }
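
The `// Implement your code here` gaps are the week-1 exercise itself. For orientation only, here is a minimal, self-contained sketch of the usual shape of a CUDA vector add; the launch parameters and variable names are illustrative assumptions, not the repository's reference solution:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__global__ void vectorAddGPU(const float *a, const float *b, float *c,
                             const int N) {
  int i = blockIdx.x * blockDim.x + threadIdx.x; // global thread index
  if (i < N) {                                   // guard the final block
    c[i] = a[i] + b[i];
  }
}

int main() {
  const int N = 1 << 20;
  float *a = new float[N], *b = new float[N], *result = new float[N];
  for (int i = 0; i < N; ++i) { a[i] = 1.0f; b[i] = 2.0f; }

  // Allocate device buffers and copy the inputs over.
  float *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, N * sizeof(float));
  cudaMalloc(&d_b, N * sizeof(float));
  cudaMalloc(&d_c, N * sizeof(float));
  cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);

  // Ceiling division so the grid covers all N elements.
  int block = 256;
  int grid = (N + block - 1) / block;
  vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, N);
  cudaDeviceSynchronize(); // kernel launches are asynchronous

  cudaMemcpy(result, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
  printf("result[0] = %f\n", result[0]); // expect 3.000000

  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
  delete[] a; delete[] b; delete[] result;
  return 0;
}
```

Note that `cudaDeviceSynchronize()` must run before the timer stops; otherwise you measure only the asynchronous launch, not the kernel.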

csrc/gemm.cu (new file, 122 lines)

@@ -0,0 +1,122 @@
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <random>

// You may increase this value to test larger matrices
// But it will be slow on CPU
constexpr int MAXN = 2048;

/**
 * @brief A naive implementation of matrix multiplication on CPU.
 * Perform C = A * B, where A is M x K, B is K x N, and C is M x N.
 */
void naiveSgemm(float *a, float *b, float *c, const int M, const int N,
                const int K) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float sum = 0.0;
      for (int k = 0; k < K; ++k) {
        sum += a[m * K + k] * b[k * N + n];
      }
      c[m * N + n] = sum;
    }
  }
}

/**
 * @brief A naive implementation of matrix multiplication on GPU.
 * Perform C = A * B, where A is M x K, B is K x N, and C is M x N.
 */
__global__ void naiveSgemm2D(float *a, float *b, float *c, const int M,
                             const int N, const int K) {
  int m = blockIdx.x * blockDim.x + threadIdx.x; // Row index
  int n = blockIdx.y * blockDim.y + threadIdx.y; // Column index
  if (m < M && n < N) {
    float sum = 0.0;
    for (int k = 0; k < K; ++k) {
      sum += a[m * K + k] * b[k * N + n];
    }
    c[m * N + n] = sum;
  }
}

/**
 * @brief Launch naiveSgemm2D kernel.
 */
void launchSgemm2D(float *a, float *b, float *c, const int M, const int N,
                   const int K) {
  dim3 block(16, 16); // 256 threads per block (16 * 16 = 256)
  dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y);
  naiveSgemm2D<<<grid, block>>>(a, b, c, M, N, K);
}

void initialize(float *a, float *b, float *c, const int M, const int N,
                const int K) {
  auto gen = std::mt19937(2024);
  auto dis = std::uniform_real_distribution<float>(-1.0, 1.0);
  for (int i = 0; i < M * K; ++i) {
    a[i] = dis(gen);
  }
  for (int i = 0; i < K * N; ++i) {
    b[i] = dis(gen);
  }
  for (int i = 0; i < M * N; ++i) {
    c[i] = 0.0;
  }
}

/**
 * @brief Launch sgemm using cuBLAS
 */
void launchCublasSgemm(float *a, float *b, float *c, const int M, const int N,
                       const int K) {
  cublasHandle_t handle;
  cublasCreate(&handle);
  float alpha = 1.0;
  float beta = 0.0;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, b, N, a, K,
              &beta, c, N);
}

int main() {
  float *a, *b, *c;
  a = new float[MAXN * MAXN];
  b = new float[MAXN * MAXN];
  c = new float[MAXN * MAXN];
  initialize(a, b, c, MAXN, MAXN, MAXN);

  // ********** CPU **********
  auto start = std::chrono::high_resolution_clock::now();
  naiveSgemm(a, b, c, MAXN, MAXN, MAXN);
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> elapsed = end - start;
  printf("CPU time: %.3fs\n", elapsed.count());

  float *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, MAXN * MAXN * sizeof(float));
  cudaMalloc(&d_b, MAXN * MAXN * sizeof(float));
  cudaMalloc(&d_c, MAXN * MAXN * sizeof(float));
  cudaMemcpy(d_a, a, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_c, c, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);

  // ********** GPU **********
  start = std::chrono::high_resolution_clock::now();
  launchSgemm2D(d_a, d_b, d_c, MAXN, MAXN, MAXN);
  cudaDeviceSynchronize();
  end = std::chrono::high_resolution_clock::now();
  elapsed = end - start;
  printf("GPU time: %.3fs\n", elapsed.count());

  // ********** cuBLAS **********
  start = std::chrono::high_resolution_clock::now();
  launchCublasSgemm(d_a, d_b, d_c, MAXN, MAXN, MAXN);
  cudaDeviceSynchronize();
  end = std::chrono::high_resolution_clock::now();
  elapsed = end - start;
  printf("cuBLAS time: %.3fs\n", elapsed.count());
}
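
One subtlety worth noting in `launchCublasSgemm`: cuBLAS assumes column-major storage while this code keeps matrices row-major, and reading a row-major matrix as column-major implicitly transposes it. Passing `b` before `a` and swapping `M` and `N` therefore makes cuBLAS compute

```latex
C^\top = B^\top A^\top = (AB)^\top
```

so the buffer `c`, read back in row-major order, holds exactly C = A * B without any explicit transposes.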

csrc/hello_world.cu (new file, 21 lines)

@@ -0,0 +1,21 @@
#include <cstdio>
#include <cuda_runtime.h>

__global__ void hello() {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // Take turns: one thread prints per iteration, and __syncthreads()
  // keeps the block in lockstep so the messages appear in tid order.
  for (int i = 0; i < 8; i++) {
    if (tid == i) {
      printf("Hello from thread %d\n", tid);
    }
    __syncthreads();
  }
}

int main() {
  int nthreads = 8;
  int nblocks = 1;
  hello<<<nblocks, nthreads>>>();
  cudaDeviceSynchronize(); // flush device-side printf before exiting
  return 0;
}
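
To build just this example without the Makefile, a minimal direct invocation should suffice (the Makefile additionally passes Torch include paths and `-lcublas`, which this file does not need):

```bash
mkdir -p build
nvcc csrc/hello_world.cu -o build/hello
./build/hello  # prints "Hello from thread 0" through "Hello from thread 7"
```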