Complete the following CUDA program (click “参考代码” to get the reference code). Remember to manually switch the submission language to CUDA. Output:
0.000000 0.000000 0.000000
0.100000 -0.100000 -0.010000
0.200000 -0.200000 -0.040000
0.300000 -0.300000 -0.090000
0.400000 -0.400000 -0.160000
0.500000 -0.500000 -0.250000
Not even a ripple in my heart… The kernel is a plain element-wise multiply with a grid-stride loop, and the only missing host line is the copy of d_C back into h_C. The third column is simply C[i] = A[i] * B[i] = 0.1i × (−0.1i) = −0.01 i², which matches the output above.
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>

// Element-wise multiply: C[i] = A[i] * B[i].
// The grid-stride loop lets any number of launched threads cover all N elements.
__global__ void multiply(float *A, float *B, float *C, const int N)
{
    //Your code here.
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x)
        C[i] = A[i] * B[i];
    //End of your code.
}

int main()
{
    int nElem = 6;
    size_t nBytes = nElem * sizeof(float);

    // Allocate and initialize the host buffers.
    float *h_A, *h_B, *h_C;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    h_C = (float *)malloc(nBytes);
    memset(h_A, 0, nBytes);
    memset(h_B, 0, nBytes);
    memset(h_C, 0, nBytes);
    for (int i = 0; i < nElem; i++)
    {
        h_A[i] = i * 0.1;
        h_B[i] = -i * 0.1;
    }

    // Allocate the device buffers and copy the inputs over.
    float *d_A, *d_B, *d_C;
    cudaMalloc((float **)&d_A, nBytes);
    cudaMalloc((float **)&d_B, nBytes);
    cudaMalloc((float **)&d_C, nBytes);
    cudaMemset(d_A, 0, nBytes);
    cudaMemset(d_B, 0, nBytes);
    cudaMemset(d_C, 0, nBytes);
    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);

    // Launch 2 blocks of 3 threads: exactly one thread per element here.
    multiply<<<2, 3>>>(d_A, d_B, d_C, nElem);

    // Your code here.
    // Copy the result back; cudaMemcpy on the default stream waits for the kernel to finish.
    cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost);
    // End of your code.

    for (int i = 0; i < nElem; i++)
        printf("%f\t%f\t%f\n", h_A[i], h_B[i], h_C[i]);

    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
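The hard-coded <<<2, 3>>> launch happens to spawn exactly 6 threads, one per element, and the grid-stride loop would also keep the result correct if fewer threads than elements were launched. For a general nElem one would normally derive the grid size from a chosen block size instead. A minimal sketch of that variant (the block size of 256 is my own choice, not part of the exercise; it drops into main() above in place of the fixed launch):

// Sketch: derive the grid size from the problem size instead of hard-coding <<<2, 3>>>.
// Assumes the multiply kernel and the d_A/d_B/d_C/h_C/nElem/nBytes variables defined above.
int blockSize = 256;                                    // arbitrary choice of threads per block
int gridSize = (nElem + blockSize - 1) / blockSize;     // ceil(nElem / blockSize) blocks
multiply<<<gridSize, blockSize>>>(d_A, d_B, d_C, nElem);
cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost);   // implicitly syncs with the kernel on the default stream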