Question:
Hello! Not long ago I got acquainted with CUDA, I am writing a small program to compare the speed on the GPU and CPU. The kernel must sum all elements, but it only sums the last element. What is the problem? Maybe sync is needed?
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <ctime>
#include <iostream>
using namespace std;
__global__ void add(int *matr, int N, int *sum)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
//if (idx<N) sum+= a[idx];
if (idx<N) *sum += matr[idx];
}
int main(void)
{
srand(time(NULL));
int *matr_d;
int *matr_h;
const int N = 10;
int *sum_d = 0;
int *sum_h = 0;
matr_h = (int*)malloc(N * sizeof(int));
cudaMalloc((void**)&matr_d, N * sizeof(int));
for (int i = 0; i < N; i++)
cout << (matr_h[i] = rand() % 10) << " ";
cout << endl;
cudaMalloc(&sum_d, sizeof(int));
sum_h = new int[1];
sum_h[0] = 0;
cudaMemcpy(matr_d, matr_h, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(sum_d, sum_h, sizeof(int), cudaMemcpyHostToDevice);
// int block_size = 4;
// int n_blocks = N / block_size + 1;
add << <1, 10 >> > (matr_d, N, sum_d);
cudaMemcpy(sum_h, sum_d, sizeof(int), cudaMemcpyDeviceToHost);
cout << "sum: " << sum_h[0] << endl;
free(matr_h);
cudaFree(matr_d);
system("pause");
}
Answer:
The problem is that you have if (idx<N) *sum += matr[idx];
is executed for each element of the matr array, as a result, the values \u200b\u200bof each idx are written to sum, and you are lucky that it contains the value for the last element.