feat(week1): add docs and task for week 1.

2024-07-09 11:45:35 +08:00
parent f0a44579c4
commit 66430108a4
5 changed files with 256 additions and 87 deletions
--- a/32
+++ b/32
@ -0,0 +1,32 @@
+CC = nvcc
+CFLAGS = -I/usr/local/lib/python3.10/dist-packages/torch/include \
+				 -I/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include \
+				 -lcublas
+
+HELLO = build/hello
+HELLO_SRC = csrc/hello_world.cu
+
+BASIC = build/basic
+BASIC_SRC = csrc/basic.cu
+
+GEMM = build/gemm
+GEMM_SRC = csrc/gemm.cu
+
+all: $(HELLO) $(BASIC)
+
+$(HELLO): $(HELLO_SRC)
+	mkdir -p build
+	$(CC) $(CFLAGS) $^ -o $@
+
+$(BASIC): $(BASIC_SRC)
+	mkdir -p build
+	$(CC) $(CFLAGS) $^ -o $@ 
+
+$(GEMM): $(GEMM_SRC)
+	mkdir -p build
+	$(CC) $(CFLAGS) $^ -o $@
+
+clean:
+	rm -rf build
+
+.PHONY: all clean
--- a/README.md
+++ b/README.md
@ -0,0 +1,42 @@
+# CUDA Playground
+
+This is the CUDA playground for PPCA AI project. In this optional part, you will learn how to use CUDA to accelerate your code.
+
+## Setup
+
+We highly recommend to use the docker image provided by NVIDIA. You can use the following command to attach to the docker image in the lab server.
+
+```bash
+ssh root@IP -p 2233
+```
+
+The IP address and password is the same as the one you use to connect to the server. Note that all the students share the same server, so please be nice and do not destroy the environment.
+
+After you successfully connect to the server, you can clone the repository and run the example code.
+
+```bash
+git clone https://github.com/Conless/cuda-playground
+cd cuda-playground
+make
+./build/basic
+```
+
+If you see the following output, then you have successfully set up the environment.
+
+```
+Hello from thread 0
+Hello from thread 1
+Hello from thread 2
+Hello from thread 3
+Hello from thread 4
+Hello from thread 5
+Hello from thread 6
+Hello from thread 7
+```
+
+## Task
+### Week 1
+
+Learn the basic concept of CUDA and implement the following files.
+- `src/basic.cu`
+- `src/gemm.cu`
--- a/csrc/basic.cu
+++ b/csrc/basic.cu
@ -7,116 +7,68 @@

 // You may increase this value to test larger matrices
 // But it will be slow on CPU
-constexpr int MAXN = 2048;
+constexpr int MAXN = 1 << 28;

-/**
- * @brief A naive implementation of matrix multiplication on CPU.
- * Perform C = A * B, where A is M x K, B is K x N, and C is M x N.
- */
-void naiveSgemm(float *a, float *b, float *c, const int M, const int N,
-                const int K) {
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      float sum = 0.0;
-      for (int k = 0; k < K; ++k) {
-        sum += a[m * K + k] * b[k * N + n];
-      }
-      c[m * N + n] = sum;
-    }
+void vectorAddCPU(float *a, float *b, float *c, const int N) {
+  for (int i = 0; i < N; ++i) {
+    c[i] = a[i] + b[i];
  }
 }

-/**
- * @brief A naive implementation of matrix multiplication on GPU.
- * Perform C = A * B, where A is M x K, B is K x N, and C is M x N.
- */
-__global__ void naiveSgemm2D(float *a, float *b, float *c, const int M,
-                             const int N, const int K) {
-  int m = blockIdx.x * blockDim.x + threadIdx.x; // Row index
-  int n = blockIdx.y * blockDim.y + threadIdx.y; // Column index
-  if (m < M && n < N) {
-    float sum = 0.0;
-    for (int k = 0; k < K; ++k) {
-      sum += a[m * K + k] * b[k * N + n];
-    }
-    c[m * N + n] = sum;
-  }
-}
-
-/**
- * @brief Launch naiveSgemm2D kernel.
- */
-void launchSgemm2D(float *a, float *b, float *c, const int M, const int N,
-                   const int K) {
-  dim3 block(16, 16); // 256 threads per block (16 * 16 = 256)
-  dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y);
-  naiveSgemm2D<<<grid, block>>>(a, b, c, M, N, K);
-}
-
-void initialize(float *a, float *b, float *c, const int M, const int N,
-                const int K) {
+void initialize(float *a, float *b, const int N) {
  auto gen = std::mt19937(2024);
  auto dis = std::uniform_real_distribution<float>(-1.0, 1.0);
-  for (int i = 0; i < M * K; ++i) {
+  for (int i = 0; i < N; ++i) {
    a[i] = dis(gen);
  }
-  for (int i = 0; i < K * N; ++i) {
+  for (int i = 0; i < N; ++i) {
    b[i] = dis(gen);
  }
-  for (int i = 0; i < M * N; ++i) {
-    c[i] = 0.0;
+}
+
+bool compare(float *a, float *b, const int N) {
+  for (int i = 0; i < N; ++i) {
+    if (std::abs(a[i] - b[i]) > 1e-3) {
+      printf("Mismatch at index %d: %f vs %f\n", i, a[i], b[i]);
+      return false;
+    }
  }
+  printf("Results match\n");
+  return true;
 }

-/** 
- * @brief Launch sgemm using cuBLAS
- */
-void launchCublasSgemm(float *a, float *b, float *c, const int M, const int N,
-                       const int K) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  float alpha = 1.0;
-  float beta = 0.0;
-  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, b, N, a, K,
-              &beta, c, N);
+__global__ void vectorAddGPU(float *a, float *b, float *c, const int N) {
+  // Implement your vector add kernel here
+  
 }

-
 int main() {
  float *a, *b, *c;
-  a = new float[MAXN * MAXN];
-  b = new float[MAXN * MAXN];
-  c = new float[MAXN * MAXN];
-  initialize(a, b, c, MAXN, MAXN, MAXN);
+  a = new float[MAXN];
+  b = new float[MAXN];
+  c = new float[MAXN];
+  initialize(a, b, MAXN);

-  // ********** CPU **********
+  // CPU computation
  auto start = std::chrono::high_resolution_clock::now();
-  naiveSgemm(a, b, c, MAXN, MAXN, MAXN);
+  vectorAddCPU(a, b, c, MAXN);
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> elapsed = end - start;
  printf("CPU time: %.3fs\n", elapsed.count());

-  float *d_a, *d_b, *d_c;
-  cudaMalloc(&d_a, MAXN * MAXN * sizeof(float));
-  cudaMalloc(&d_b, MAXN * MAXN * sizeof(float));
-  cudaMalloc(&d_c, MAXN * MAXN * sizeof(float));
-  cudaMemcpy(d_a, a, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_b, b, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_c, c, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
-
-  // ********** GPU **********
+  // ************** START GPU MEMORY ALLOCATION **************
+  // Implement your code here
+  
+  // ************** START GPU COMPUTATION **************
  start = std::chrono::high_resolution_clock::now();
-  launchSgemm2D(d_a, d_b, d_c, MAXN, MAXN, MAXN);
-  cudaDeviceSynchronize();
+  // Implement your code here
  end = std::chrono::high_resolution_clock::now();
-  elapsed = end - start;
-  printf("GPU time: %.3fs\n", elapsed.count());

-  // ********** cuBLAS **********
-  start = std::chrono::high_resolution_clock::now();
-  launchCublasSgemm(d_a, d_b, d_c, MAXN, MAXN, MAXN);
-  cudaDeviceSynchronize();
-  end = std::chrono::high_resolution_clock::now();
-  elapsed = end - start;
-  printf("cuBLAS time: %.3fs\n", elapsed.count());
-}
+  float *result = new float[MAXN];
+  // Copy the result from GPU to CPU
+  if (compare(c, result, MAXN)) {
+    std::chrono::duration<double> new_elapsed = end - start;
+    printf("GPU time: %.3fs\n", new_elapsed.count());
+    printf("Speedup: %.2fx\n", elapsed.count() / new_elapsed.count());
+  }
+}
--- a/csrc/gemm.cu
+++ b/csrc/gemm.cu
@ -0,0 +1,122 @@
+#include <cassert>
+#include <chrono>
+#include <cstdio>
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <random>
+
+// You may increase this value to test larger matrices
+// But it will be slow on CPU
+constexpr int MAXN = 2048;
+
+/**
+ * @brief A naive implementation of matrix multiplication on CPU.
+ * Perform C = A * B, where A is M x K, B is K x N, and C is M x N.
+ */
+void naiveSgemm(float *a, float *b, float *c, const int M, const int N,
+                const int K) {
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      float sum = 0.0;
+      for (int k = 0; k < K; ++k) {
+        sum += a[m * K + k] * b[k * N + n];
+      }
+      c[m * N + n] = sum;
+    }
+  }
+}
+
+/**
+ * @brief A naive implementation of matrix multiplication on GPU.
+ * Perform C = A * B, where A is M x K, B is K x N, and C is M x N.
+ */
+__global__ void naiveSgemm2D(float *a, float *b, float *c, const int M,
+                             const int N, const int K) {
+  int m = blockIdx.x * blockDim.x + threadIdx.x; // Row index
+  int n = blockIdx.y * blockDim.y + threadIdx.y; // Column index
+  if (m < M && n < N) {
+    float sum = 0.0;
+    for (int k = 0; k < K; ++k) {
+      sum += a[m * K + k] * b[k * N + n];
+    }
+    c[m * N + n] = sum;
+  }
+}
+
+/**
+ * @brief Launch naiveSgemm2D kernel.
+ */
+void launchSgemm2D(float *a, float *b, float *c, const int M, const int N,
+                   const int K) {
+  dim3 block(16, 16); // 256 threads per block (16 * 16 = 256)
+  dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y);
+  naiveSgemm2D<<<grid, block>>>(a, b, c, M, N, K);
+}
+
+void initialize(float *a, float *b, float *c, const int M, const int N,
+                const int K) {
+  auto gen = std::mt19937(2024);
+  auto dis = std::uniform_real_distribution<float>(-1.0, 1.0);
+  for (int i = 0; i < M * K; ++i) {
+    a[i] = dis(gen);
+  }
+  for (int i = 0; i < K * N; ++i) {
+    b[i] = dis(gen);
+  }
+  for (int i = 0; i < M * N; ++i) {
+    c[i] = 0.0;
+  }
+}
+
+/** 
+ * @brief Launch sgemm using cuBLAS
+ */
+void launchCublasSgemm(float *a, float *b, float *c, const int M, const int N,
+                       const int K) {
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+  float alpha = 1.0;
+  float beta = 0.0;
+  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, b, N, a, K,
+              &beta, c, N);
+}
+
+
+int main() {
+  float *a, *b, *c;
+  a = new float[MAXN * MAXN];
+  b = new float[MAXN * MAXN];
+  c = new float[MAXN * MAXN];
+  initialize(a, b, c, MAXN, MAXN, MAXN);
+
+  // ********** CPU **********
+  auto start = std::chrono::high_resolution_clock::now();
+  naiveSgemm(a, b, c, MAXN, MAXN, MAXN);
+  auto end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> elapsed = end - start;
+  printf("CPU time: %.3fs\n", elapsed.count());
+
+  float *d_a, *d_b, *d_c;
+  cudaMalloc(&d_a, MAXN * MAXN * sizeof(float));
+  cudaMalloc(&d_b, MAXN * MAXN * sizeof(float));
+  cudaMalloc(&d_c, MAXN * MAXN * sizeof(float));
+  cudaMemcpy(d_a, a, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_b, b, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_c, c, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
+
+  // ********** GPU **********
+  start = std::chrono::high_resolution_clock::now();
+  launchSgemm2D(d_a, d_b, d_c, MAXN, MAXN, MAXN);
+  cudaDeviceSynchronize();
+  end = std::chrono::high_resolution_clock::now();
+  elapsed = end - start;
+  printf("GPU time: %.3fs\n", elapsed.count());
+
+  // ********** cuBLAS **********
+  start = std::chrono::high_resolution_clock::now();
+  launchCublasSgemm(d_a, d_b, d_c, MAXN, MAXN, MAXN);
+  cudaDeviceSynchronize();
+  end = std::chrono::high_resolution_clock::now();
+  elapsed = end - start;
+  printf("cuBLAS time: %.3fs\n", elapsed.count());
+}
--- a/csrc/hello_world.cu
+++ b/csrc/hello_world.cu
@ -0,0 +1,21 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+
+__global__ void hello() {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (int i = 0; i < 8; i++) {
+    if (tid == i) {
+      printf("Hello from thread %d\n", tid);
+    }
+    __syncthreads();
+  }
+}
+
+int main() {
+  int nthreads = 8;
+  int nblocks = 1;
+  hello<<<nblocks, nthreads>>>();
+  cudaDeviceSynchronize();
+  return 0;
+}