feat: initial commit.

commit f0a44579c4
Author: Yi Pan
Date:   2024-07-09 10:34:19 +08:00

3 changed files with 192 additions and 0 deletions

.clangd (new file)
@@ -0,0 +1,65 @@
# https://clangd.llvm.org/config

# Apply a config conditionally to all C files
If:
  PathMatch: .*\.(c|h)$

---

# Apply a config conditionally to all C++ files
If:
  PathMatch: .*\.(c|h)pp

---

# Apply a config conditionally to all CUDA files
If:
  PathMatch: .*\.cuh?
CompileFlags:
  Add:
    # Allow variadic CUDA functions
    - "-Xclang=-fcuda-allow-variadic-functions"

---

# Tweak the clangd parse settings for all files
CompileFlags:
  CompilationDatabase: .
  Add:
    - -x
    - cuda
    - "-std=c++17"
    # report all errors
    - "-ferror-limit=0"
    - "-ftemplate-backtrace-limit=0"
    - "-I/usr/local/lib/python3.10/dist-packages/torch/include"
    - "-I/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include"
    - "-DGLOG_USE_GLOG_EXPORT"
  Remove:
    - -stdpar
    # strip CUDA fatbin args
    - "-Xfatbin*"
    - "-Xcompiler*"
    - "-Xcudafe*"
    - "-rdc=*"
    - "-gpu=*"
    - "--diag_suppress*"
    # strip CUDA arch flags
    - "-gencode*"
    - "--generate-code*"
    # strip gcc's -fcoroutines
    - -fcoroutines
    # strip CUDA flags unknown to clang
    - "-ccbin*"
    - "--compiler-options*"
    - "--expt-extended-lambda"
    - "--expt-relaxed-constexpr"
    - "-forward-unknown-to-host-compiler"
    - "-Werror=cross-execution-space-call"
Diagnostics:
  Suppress:
    - "variadic_device_fn"
    - "attributes_not_allowed"
    # The NVHPC version of the _NVCXX_EXPAND_PACK macro triggers this clang error.
    # Temporarily suppressed; it should probably be fixed properly.
    - "template_param_shadow"

.gitignore (new file)
@@ -0,0 +1,5 @@
.cache/
build/
**/report*
*.out

csrc/basic.cu (new file)
@@ -0,0 +1,122 @@
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <random>

// You may increase this value to test larger matrices,
// but the CPU reference will become slow.
constexpr int MAXN = 2048;

/**
 * @brief A naive implementation of matrix multiplication on CPU.
 * Performs C = A * B, where A is M x K, B is K x N, and C is M x N.
 */
void naiveSgemm(float *a, float *b, float *c, const int M, const int N,
                const int K) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float sum = 0.0f;
      for (int k = 0; k < K; ++k) {
        sum += a[m * K + k] * b[k * N + n];
      }
      c[m * N + n] = sum;
    }
  }
}

/**
 * @brief A naive implementation of matrix multiplication on GPU.
 * Performs C = A * B, where A is M x K, B is K x N, and C is M x N.
 * Each thread computes one element of C.
 */
__global__ void naiveSgemm2D(float *a, float *b, float *c, const int M,
                             const int N, const int K) {
  int m = blockIdx.x * blockDim.x + threadIdx.x; // Row index
  int n = blockIdx.y * blockDim.y + threadIdx.y; // Column index
  if (m < M && n < N) {
    float sum = 0.0f;
    for (int k = 0; k < K; ++k) {
      sum += a[m * K + k] * b[k * N + n];
    }
    c[m * N + n] = sum;
  }
}

/**
 * @brief Launch the naiveSgemm2D kernel.
 */
void launchSgemm2D(float *a, float *b, float *c, const int M, const int N,
                   const int K) {
  dim3 block(16, 16); // 256 threads per block (16 * 16 = 256)
  dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y);
  naiveSgemm2D<<<grid, block>>>(a, b, c, M, N, K);
}

void initialize(float *a, float *b, float *c, const int M, const int N,
                const int K) {
  auto gen = std::mt19937(2024);
  auto dis = std::uniform_real_distribution<float>(-1.0f, 1.0f);
  for (int i = 0; i < M * K; ++i) {
    a[i] = dis(gen);
  }
  for (int i = 0; i < K * N; ++i) {
    b[i] = dis(gen);
  }
  for (int i = 0; i < M * N; ++i) {
    c[i] = 0.0f;
  }
}

/**
 * @brief Launch sgemm using cuBLAS.
 * cuBLAS expects column-major storage, so we compute C^T = B^T * A^T by
 * passing the operands in (b, a) order with swapped dimensions; the
 * row-major buffer c then holds A * B.
 */
void launchCublasSgemm(float *a, float *b, float *c, const int M, const int N,
                       const int K) {
  cublasHandle_t handle;
  cublasCreate(&handle);
  float alpha = 1.0f;
  float beta = 0.0f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, b, N, a, K,
              &beta, c, N);
  cublasDestroy(handle);
}

int main() {
  float *a, *b, *c;
  a = new float[MAXN * MAXN];
  b = new float[MAXN * MAXN];
  c = new float[MAXN * MAXN];
  initialize(a, b, c, MAXN, MAXN, MAXN);

  // ********** CPU **********
  auto start = std::chrono::high_resolution_clock::now();
  naiveSgemm(a, b, c, MAXN, MAXN, MAXN);
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> elapsed = end - start;
  printf("CPU time: %.3fs\n", elapsed.count());

  float *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, MAXN * MAXN * sizeof(float));
  cudaMalloc(&d_b, MAXN * MAXN * sizeof(float));
  cudaMalloc(&d_c, MAXN * MAXN * sizeof(float));
  cudaMemcpy(d_a, a, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_c, c, MAXN * MAXN * sizeof(float), cudaMemcpyHostToDevice);

  // ********** GPU **********
  start = std::chrono::high_resolution_clock::now();
  launchSgemm2D(d_a, d_b, d_c, MAXN, MAXN, MAXN);
  cudaDeviceSynchronize();
  end = std::chrono::high_resolution_clock::now();
  elapsed = end - start;
  printf("GPU time: %.3fs\n", elapsed.count());

  // ********** cuBLAS **********
  // Note: cuBLAS handle creation/destruction inside launchCublasSgemm is
  // included in the timed region.
  start = std::chrono::high_resolution_clock::now();
  launchCublasSgemm(d_a, d_b, d_c, MAXN, MAXN, MAXN);
  cudaDeviceSynchronize();
  end = std::chrono::high_resolution_clock::now();
  elapsed = end - start;
  printf("cuBLAS time: %.3fs\n", elapsed.count());

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  delete[] a;
  delete[] b;
  delete[] c;
  return 0;
}
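
A possible follow-up (not part of this commit), sketched here only to show how the device results could be validated: after a GPU run, copy d_c back to the host and compare it element-wise against the CPU reference still sitting in c. It assumes <algorithm> and <cmath> are added to the includes, and the tolerance is an assumption, since cuBLAS accumulates in a different order than the naive loops.

// Sketch: assumes this runs inside main() right after launchSgemm2D (or
// launchCublasSgemm) plus cudaDeviceSynchronize(), while the host buffer c
// still holds the CPU result.
float *c_gpu = new float[MAXN * MAXN];
cudaMemcpy(c_gpu, d_c, MAXN * MAXN * sizeof(float), cudaMemcpyDeviceToHost);

float max_diff = 0.0f;
for (int i = 0; i < MAXN * MAXN; ++i) {
  max_diff = std::max(max_diff, std::fabs(c_gpu[i] - c[i]));
}
printf("max |CPU - GPU| difference: %g\n", max_diff);
// 1e-2 is a loose, assumed tolerance for K = 2048 and inputs in [-1, 1].
assert(max_diff < 1e-2f);
delete[] c_gpu;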