From f0a44579c4ef8290de62e5f1d30a2f941266e217 Mon Sep 17 00:00:00 2001 From: Yi Pan Date: Tue, 9 Jul 2024 10:34:19 +0800 Subject: [PATCH] feat: initial commit. --- .clangd | 65 +++++++++++++++++++++++++++ .gitignore | 5 +++ csrc/basic.cu | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 192 insertions(+) create mode 100644 .clangd create mode 100644 .gitignore create mode 100644 csrc/basic.cu diff --git a/.clangd b/.clangd new file mode 100644 index 0000000..dab5436 --- /dev/null +++ b/.clangd @@ -0,0 +1,65 @@ +# https://clangd.llvm.org/config + +# Apply a config conditionally to all C files +If: + PathMatch: .*\.(c|h)$ + +--- + +# Apply a config conditionally to all C++ files +If: + PathMatch: .*\.(c|h)pp + +--- + +# Apply a config conditionally to all CUDA files +If: + PathMatch: .*\.cuh? +CompileFlags: + Add: + # Allow variadic CUDA functions + - "-Xclang=-fcuda-allow-variadic-functions" + +--- + +# Tweak the clangd parse settings for all files +CompileFlags: + CompilationDatabase: . + Add: + - -x + - cuda + # report all errors + - "-std=c++17" + - "-ferror-limit=0" + - "-ftemplate-backtrace-limit=0" + - "-I/usr/local/lib/python3.10/dist-packages/torch/include" + - "-I/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include" + - "-DGLOG_USE_GLOG_EXPORT" + Remove: + - -stdpar + # strip CUDA fatbin args + - "-Xfatbin*" + - "-Xcompiler*" + - "-Xcudafe*" + - "-rdc=*" + - "-gpu=*" + - "--diag_suppress*" + # strip CUDA arch flags + - "-gencode*" + - "--generate-code*" + # strip gcc's -fcoroutines + - -fcoroutines + # strip CUDA flags unknown to clang + - "-ccbin*" + - "--compiler-options*" + - "--expt-extended-lambda" + - "--expt-relaxed-constexpr" + - "-forward-unknown-to-host-compiler" + - "-Werror=cross-execution-space-call" +Diagnostics: + Suppress: + - "variadic_device_fn" + - "attributes_not_allowed" + # The NVHPC version of _NVCXX_EXPAND_PACK macro triggers this clang error. + # Temporarily suppressing it, but should probably fix + - "template_param_shadow" \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..df205bb --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.cache/ +build/ + +**/report* +*.out \ No newline at end of file diff --git a/csrc/basic.cu b/csrc/basic.cu new file mode 100644 index 0000000..3656185 --- /dev/null +++ b/csrc/basic.cu @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include + +// You may increase this value to test larger matrices +// But it will be slow on CPU +constexpr int MAXN = 2048; + +/** + * @brief A naive implementation of matrix multiplication on CPU. + * Perform C = A * B, where A is M x K, B is K x N, and C is M x N. + */ +void naiveSgemm(float *a, float *b, float *c, const int M, const int N, + const int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float sum = 0.0; + for (int k = 0; k < K; ++k) { + sum += a[m * K + k] * b[k * N + n]; + } + c[m * N + n] = sum; + } + } +} + +/** + * @brief A naive implementation of matrix multiplication on GPU. + * Perform C = A * B, where A is M x K, B is K x N, and C is M x N. + */ +__global__ void naiveSgemm2D(float *a, float *b, float *c, const int M, + const int N, const int K) { + int m = blockIdx.x * blockDim.x + threadIdx.x; // Row index + int n = blockIdx.y * blockDim.y + threadIdx.y; // Column index + if (m < M && n < N) { + float sum = 0.0; + for (int k = 0; k < K; ++k) { + sum += a[m * K + k] * b[k * N + n]; + } + c[m * N + n] = sum; + } +} + +/** + * @brief Launch naiveSgemm2D kernel. + */ +void launchSgemm2D(float *a, float *b, float *c, const int M, const int N, + const int K) { + dim3 block(16, 16); // 256 threads per block (16 * 16 = 256) + dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y); + naiveSgemm2D<<>>(a, b, c, M, N, K); +} + +void initialize(float *a, float *b, float *c, const int M, const int N, + const int K) { + auto gen = std::mt19937(2024); + auto dis = std::uniform_real_distribution(-1.0, 1.0); + for (int i = 0; i < M * K; ++i) { + a[i] = dis(gen); + } + for (int i = 0; i < K * N; ++i) { + b[i] = dis(gen); + } + for (int i = 0; i < M * N; ++i) { + c[i] = 0.0; + } +} + +/** + * @brief Launch sgemm using cuBLAS + */ +void launchCublasSgemm(float *a, float *b, float *c, const int M, const int N, + const int K) { + cublasHandle_t handle; + cublasCreate(&handle); + float alpha = 1.0; + float beta = 0.0; + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, b, N, a, K, + &beta, c, N); +} + + +int main() { + float *a, *b, *c; + a = new float[MAXN * MAXN]; + b = new float[MAXN * MAXN]; + c = new float[MAXN * MAXN]; + initialize(a, b, c, MAXN, MAXN, MAXN); + + // ********** CPU ********** + auto start = std::chrono::high_resolution_clock::now(); + naiveSgemm(a, b, c, MAXN, MAXN, MAXN); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed = end - start; + printf("CPU time: %.3fs\n", elapsed.count()); + + float *d_a, *d_b, *d_c; + cudaMalloc(&d_a, MAXN * MAXN * sizeof(float)); + cudaMalloc(&d_b, MAXN * MAXN * sizeof(float)); + cudaMalloc(&d_c, MAXN * MAXN * sizeof(float)); + cudaMemcpy(d_a, a, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_b, b, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_c, c, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice); + + // ********** GPU ********** + start = std::chrono::high_resolution_clock::now(); + launchSgemm2D(d_a, d_b, d_c, MAXN, MAXN, MAXN); + cudaDeviceSynchronize(); + end = std::chrono::high_resolution_clock::now(); + elapsed = end - start; + printf("GPU time: %.3fs\n", elapsed.count()); + + // ********** cuBLAS ********** + start = std::chrono::high_resolution_clock::now(); + launchCublasSgemm(d_a, d_b, d_c, MAXN, MAXN, MAXN); + cudaDeviceSynchronize(); + end = std::chrono::high_resolution_clock::now(); + elapsed = end - start; + printf("cuBLAS time: %.3fs\n", elapsed.count()); +}