This commit is contained in:
2024-07-10 07:46:27 +08:00
parent 66430108a4
commit c440358d9e

View File

@ -39,7 +39,10 @@ bool compare(float *a, float *b, const int N) {
// Element-wise vector addition on the GPU: c[i] = a[i] + b[i] for i in [0, N).
//
// Launch layout: 1D grid of 1D blocks (only the .x components are read).
// Any grid size is valid: each thread starts at its global index and then
// advances by the total number of launched threads (grid-stride loop), so a
// grid smaller than N still covers every element and surplus threads simply
// do nothing. No shared memory is used.
//
// a, b : input arrays, dereferenced on the device for indices [0, N)
// c    : output array, written on the device for indices [0, N)
// N    : number of elements; values <= 0 make the kernel a no-op
__global__ void vectorAddGPU(float *a, float *b, float *c, const int N) {
    if (N <= 0) {
        return;
    }

    // Index math is done in size_t so blockIdx.x * blockDim.x cannot wrap
    // before it is compared against the element count.
    const size_t count = static_cast<size_t>(N);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count; i += stride) {
        c[i] = a[i] + b[i];
    }
}
int main() { int main() {
@ -58,17 +61,39 @@ int main() {
// ************** START GPU MEMORY ALLOCATION **************
// Reports a failed CUDA runtime call on stderr and tells the caller whether
// the call succeeded, so later GPU steps can be skipped after a failure.
auto checkCuda = [](cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        return false;
    }
    return true;
};

// Size in bytes of each of the three vectors.
const size_t bytes = static_cast<size_t>(MAXN) * sizeof(float);

// Device buffers start as nullptr so the cudaFree calls below are safe even
// if an allocation fails part-way through.
float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

// && short-circuits: after the first failure no further GPU call is made.
bool gpuOk =
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)") &&
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)") &&
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)") &&
    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(a -> d_a)") &&
    checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(b -> d_b)");

// ************** START GPU COMPUTATION **************
start = std::chrono::high_resolution_clock::now();
if (gpuOk) {
    const int threadsPerBlock = 512;
    // Ceiling division so a partial last block still covers the tail.
    const int blocksPerGrid = (MAXN + threadsPerBlock - 1) / threadsPerBlock;
    vectorAddGPU<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, MAXN);
    // A launch returns no status itself: cudaGetLastError reports a bad
    // launch configuration, and the synchronize both waits for the kernel
    // (so the timer covers its execution) and reports in-kernel faults.
    gpuOk = checkCuda(cudaGetLastError(), "vectorAddGPU launch") &&
            checkCuda(cudaDeviceSynchronize(), "vectorAddGPU execution");
}
end = std::chrono::high_resolution_clock::now();

float *result = new float[MAXN];
// Copy the result from GPU to CPU
gpuOk = gpuOk && checkCuda(cudaMemcpy(result, d_c, bytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(d_c -> result)");

// Only report timings when the GPU path ran cleanly and matches the CPU result.
if (gpuOk && compare(c, result, MAXN)) {
    std::chrono::duration<double> new_elapsed = end - start;
    printf("GPU time: %.3fs\n", new_elapsed.count());
    printf("Speedup: %.2fx\n", elapsed.count() / new_elapsed.count());
}

// Free GPU memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

// Free CPU memory
delete[] a;
delete[] b;
delete[] c;
delete[] result;

// Non-zero exit status signals that a CUDA call failed.
return gpuOk ? 0 : 1;
} }