basic

2024-07-10 07:46:27 +08:00
parent 66430108a4
commit c440358d9e
1 changed file with 28 additions and 3 deletions
--- a/csrc/basic.cu
+++ b/csrc/basic.cu
@@ -39,7 +39,10 @@ bool compare(float *a, float *b, const int N) {
 // Element-wise vector addition kernel: writes c[i] = a[i] + b[i] for every
 // index i in [0, N). Only the x dimension of the launch configuration is used,
 // and each thread processes at most one element, so the launch must supply at
 // least N threads in total for the whole range to be covered.
 __global__ void vectorAddGPU(float *a, float *b, float *c, const int N) {
  // Implement your vector add kernel here
-  
  // Flat index of this thread across the whole grid (block offset + lane in block).
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Bounds guard: threads whose index is at or beyond N perform no memory access.
  if (idx < N) {
    c[idx] = a[idx] + b[idx];
  }
 }
 // Program entry point. The visible part allocates device buffers, runs the
 // vectorAddGPU kernel under a wall-clock timer, validates the result with
 // compare(), and releases all memory. The beginning of the function is elided
 // by the diff hunk below; a, b, c, start, end, elapsed and MAXN come from there.
 int main() {
@@ -58,17 +61,39 @@ int main() {
  // ************** START GPU MEMORY ALLOCATION **************
  // Implement your code here
  // Three device buffers of MAXN floats each: two inputs and one output.
  // NOTE(review): none of the CUDA runtime return codes in this function are
  // checked, so an allocation or copy failure would go unnoticed here.
  float *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, MAXN * sizeof(float));
  cudaMalloc(&d_b, MAXN * sizeof(float));
  cudaMalloc(&d_c, MAXN * sizeof(float));
  // Upload both inputs to the device (presumably a and b are host arrays of
  // MAXN floats set up in the elided part of main -- confirm).
  cudaMemcpy(d_a, a, MAXN * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, MAXN * sizeof(float), cudaMemcpyHostToDevice);
  // ************** START GPU COMPUTATION **************
  // The timed region covers only the kernel launch and the wait for it to
  // finish; allocation and host<->device copies are outside the timer.
  start = std::chrono::high_resolution_clock::now();
-  // Implement your code here
+  int threadsPerBlock = 512;
  // Ceiling division so the grid supplies at least MAXN threads even when MAXN
  // is not a multiple of the block size.
  int blocksPerGrid = (MAXN + threadsPerBlock - 1) / threadsPerBlock;
  // NOTE(review): no cudaGetLastError() follows the launch, so an invalid
  // launch configuration would not be reported.
  vectorAddGPU<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, MAXN);
  // Block the host until the kernel has completed so `end` reflects execution
  // time rather than just the launch.
  cudaDeviceSynchronize();
  end = std::chrono::high_resolution_clock::now();
  float *result = new float[MAXN];
  // Copy the result from GPU to CPU
  cudaMemcpy(result, d_c, MAXN * sizeof(float), cudaMemcpyDeviceToHost);
  // Timing and speedup are printed only when compare() accepts the GPU result;
  // on a mismatch nothing is printed by this block. Presumably c holds the CPU
  // reference result and elapsed the CPU timing from the elided code -- confirm.
  if (compare(c, result, MAXN)) {
    std::chrono::duration<double> new_elapsed = end - start;
    printf("GPU time: %.3fs\n", new_elapsed.count());
    printf("Speedup: %.2fx\n", elapsed.count() / new_elapsed.count());
  }
  // Free GPU memory
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  // Free CPU memory
  // a, b and c are released with delete[]; their allocation is not visible in
  // this hunk (presumably new[] earlier in main -- confirm).
  delete[] a;
  delete[] b;
  delete[] c;
  delete[] result;
  return 0;
 }