From c440358d9e8d7fb027a98da3a5dc958e4365624a Mon Sep 17 00:00:00 2001
From: ZhuangYumin <zhuangyumin@sjtu.edu.cn>
Date: Wed, 10 Jul 2024 07:46:27 +0800
Subject: [PATCH] basic: implement vectorAddGPU kernel and host-side GPU code

---
 csrc/basic.cu | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)
diff --git a/csrc/basic.cu b/csrc/basic.cu
index d91c2ce..f708859 100644
--- a/csrc/basic.cu
+++ b/csrc/basic.cu
@@ -39,7 +39,10 @@ bool compare(float *a, float *b, const int N) {
 
 __global__ void vectorAddGPU(float *a, float *b, float *c, const int N) {
   // Implement your vector add kernel here
-  
+  // One thread per element; threads indexed at or beyond N exit untouched.
+  const int tid = threadIdx.x + blockDim.x * blockIdx.x;
+  if (tid >= N) return;
+  c[tid] = a[tid] + b[tid];
 }
 
 int main() {
@@ -58,17 +61,63 @@ int main() {
 
   // ************** START GPU MEMORY ALLOCATION **************
   // Implement your code here
-  
+  // Report a failed CUDA runtime call on stderr; returns true on success.
+  auto cudaOk = [](cudaError_t err, const char *what) {
+    if (err != cudaSuccess) {
+      fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
+    }
+    return err == cudaSuccess;
+  };
+
+  // Device buffers: d_a and d_b receive the inputs, d_c receives the sum.
+  const size_t deviceBytes = MAXN * sizeof(float);
+  float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
+  if (!cudaOk(cudaMalloc(&d_a, deviceBytes), "cudaMalloc(d_a)") ||
+      !cudaOk(cudaMalloc(&d_b, deviceBytes), "cudaMalloc(d_b)") ||
+      !cudaOk(cudaMalloc(&d_c, deviceBytes), "cudaMalloc(d_c)") ||
+      !cudaOk(cudaMemcpy(d_a, a, deviceBytes, cudaMemcpyHostToDevice),
+              "cudaMemcpy(a -> d_a)") ||
+      !cudaOk(cudaMemcpy(d_b, b, deviceBytes, cudaMemcpyHostToDevice),
+              "cudaMemcpy(b -> d_b)")) {
+    return 1;  // no valid device inputs, so there is nothing to time or compare
+  }
   // ************** START GPU COMPUTATION **************
   start = std::chrono::high_resolution_clock::now();
-  // Implement your code here
+  // One thread per element; round the block count up so the tail is covered.
+  const int threadsPerBlock = 512;
+  const int blocksPerGrid = (MAXN + threadsPerBlock - 1) / threadsPerBlock;
+  vectorAddGPU<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, MAXN);
+  // A kernel launch returns no status: launch errors are read back with
+  // cudaGetLastError(), and execution faults surface at the synchronizing
+  // call. The statuses are only stored here and are reported after `end` is
+  // taken, so error printing never lands inside the timed region.
+  const cudaError_t launchErr = cudaGetLastError();
+  const cudaError_t syncErr = cudaDeviceSynchronize();
   end = std::chrono::high_resolution_clock::now();
 
   float *result = new float[MAXN];
   // Copy the result from GPU to CPU
+  if (!cudaOk(launchErr, "vectorAddGPU launch") ||
+      !cudaOk(syncErr, "cudaDeviceSynchronize") ||
+      !cudaOk(cudaMemcpy(result, d_c, deviceBytes, cudaMemcpyDeviceToHost),
+              "cudaMemcpy(d_c -> result)")) {
+    return 1;  // result does not hold the GPU output, so skip the comparison
+  }
   if (compare(c, result, MAXN)) {
     std::chrono::duration<double> new_elapsed = end - start;
     printf("GPU time: %.3fs\n", new_elapsed.count());
     printf("Speedup: %.2fx\n", elapsed.count() / new_elapsed.count());
   }
+  // Free GPU memory (failures are reported but do not change the exit code)
+  cudaOk(cudaFree(d_a), "cudaFree(d_a)");
+  cudaOk(cudaFree(d_b), "cudaFree(d_b)");
+  cudaOk(cudaFree(d_c), "cudaFree(d_c)");
+
+  // Free CPU memory
+  delete[] a;
+  delete[] b;
+  delete[] c;
+  delete[] result;
+
+  return 0;
 }
\ No newline at end of file