From c440358d9e8d7fb027a98da3a5dc958e4365624a Mon Sep 17 00:00:00 2001
From: ZhuangYumin
Date: Wed, 10 Jul 2024 07:46:27 +0800
Subject: [PATCH] basic

---
 csrc/basic.cu | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/csrc/basic.cu b/csrc/basic.cu
index d91c2ce..f708859 100644
--- a/csrc/basic.cu
+++ b/csrc/basic.cu
@@ -39,7 +39,10 @@ bool compare(float *a, float *b, const int N) {
 
 __global__ void vectorAddGPU(float *a, float *b, float *c, const int N) {
   // Implement your vector add kernel here
-
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < N) {
+    c[idx] = a[idx] + b[idx];
+  }
 }
 
 int main() {
@@ -58,17 +61,39 @@ int main() {
 
   // ************** START GPU MEMORY ALLOCATION **************
   // Implement your code here
-
+  float *d_a, *d_b, *d_c;
+  cudaMalloc(&d_a, MAXN * sizeof(float));
+  cudaMalloc(&d_b, MAXN * sizeof(float));
+  cudaMalloc(&d_c, MAXN * sizeof(float));
+
+  cudaMemcpy(d_a, a, MAXN * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_b, b, MAXN * sizeof(float), cudaMemcpyHostToDevice);
 
   // ************** START GPU COMPUTATION **************
   start = std::chrono::high_resolution_clock::now();
-  // Implement your code here
+  int threadsPerBlock = 512;
+  int blocksPerGrid = (MAXN + threadsPerBlock - 1) / threadsPerBlock;
+  vectorAddGPU<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, MAXN);
+  cudaDeviceSynchronize();
   end = std::chrono::high_resolution_clock::now();
   float *result = new float[MAXN];
   // Copy the result from GPU to CPU
+  cudaMemcpy(result, d_c, MAXN * sizeof(float), cudaMemcpyDeviceToHost);
   if (compare(c, result, MAXN)) {
     std::chrono::duration<double> new_elapsed = end - start;
     printf("GPU time: %.3fs\n", new_elapsed.count());
     printf("Speedup: %.2fx\n", elapsed.count() / new_elapsed.count());
   }
+  // Free GPU memory
+  cudaFree(d_a);
+  cudaFree(d_b);
+  cudaFree(d_c);
+
+  // Free CPU memory
+  delete[] a;
+  delete[] b;
+  delete[] c;
+  delete[] result;
+
+  return 0;
 }
\ No newline at end of file