basic
This commit is contained in:
@ -39,7 +39,10 @@ bool compare(float *a, float *b, const int N) {
|
||||
|
||||
/**
 * Element-wise vector addition on the GPU: c[i] = a[i] + b[i] for i in [0, N).
 *
 * Launch layout: only the x dimension of the grid and block is used. The
 * kernel is a grid-stride loop, so any launch with at least one thread
 * produces the complete result; the launch does not have to supply N threads.
 *
 * Indices are computed in 64-bit signed arithmetic. blockIdx.x * blockDim.x is
 * otherwise evaluated as a 32-bit unsigned product, and storing it in an int
 * can yield a negative index (and an out-of-bounds access) for the tail
 * threads of a rounded-up grid when N is close to INT_MAX.
 *
 * @param a  first input vector; must be dereferenceable on the device and
 *           hold at least N floats
 * @param b  second input vector; same requirements as a
 * @param c  output vector; same requirements as a; elements [0, N) are
 *           overwritten
 * @param N  number of elements to process; N <= 0 performs no work
 */
__global__ void vectorAddGPU(float *a, float *b, float *c, const int N) {
    // Total number of threads along x; each thread advances by this amount.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;

    // Widen before multiplying so the product cannot wrap in 32 bits.
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < N;
         i += stride) {
        c[i] = a[i] + b[i];
    }
}
|
||||
|
||||
int main() {
|
||||
@ -58,17 +61,39 @@ int main() {
|
||||
|
||||
// ************** START GPU MEMORY ALLOCATION **************
|
||||
// Implement your code here
|
||||
|
||||
float *d_a, *d_b, *d_c;
|
||||
cudaMalloc(&d_a, MAXN * sizeof(float));
|
||||
cudaMalloc(&d_b, MAXN * sizeof(float));
|
||||
cudaMalloc(&d_c, MAXN * sizeof(float));
|
||||
|
||||
cudaMemcpy(d_a, a, MAXN * sizeof(float), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_b, b, MAXN * sizeof(float), cudaMemcpyHostToDevice);
|
||||
// ************** START GPU COMPUTATION **************
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
// Implement your code here
|
||||
int threadsPerBlock = 512;
|
||||
int blocksPerGrid = (MAXN + threadsPerBlock - 1) / threadsPerBlock;
|
||||
vectorAddGPU<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, MAXN);
|
||||
cudaDeviceSynchronize();
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
float *result = new float[MAXN];
|
||||
// Copy the result from GPU to CPU
|
||||
cudaMemcpy(result, d_c, MAXN * sizeof(float), cudaMemcpyDeviceToHost);
|
||||
if (compare(c, result, MAXN)) {
|
||||
std::chrono::duration<double> new_elapsed = end - start;
|
||||
printf("GPU time: %.3fs\n", new_elapsed.count());
|
||||
printf("Speedup: %.2fx\n", elapsed.count() / new_elapsed.count());
|
||||
}
|
||||
// Free GPU memory
|
||||
cudaFree(d_a);
|
||||
cudaFree(d_b);
|
||||
cudaFree(d_c);
|
||||
|
||||
// Free CPU memory
|
||||
delete[] a;
|
||||
delete[] b;
|
||||
delete[] c;
|
||||
delete[] result;
|
||||
|
||||
return 0;
|
||||
}
|
Reference in New Issue
Block a user