Menu

并行与分布式计算(6)

CUDA-homework-1

Start from the provided skeleton code error-test.cu that provides some convenience macros for error checking. The macros are defined in the header file error_checks_1.h. Add the missing memory allocations and copies and the kernel launch and check that your code works.

Remember that you can use also cuda-memcheck! If you have time, you can also check what happens if you remove all error checks and do the same tests again.

What happens if you try to launch kernel with too large block size? When do you catch the error if you remove the cudaStreamSynchronize(NULL) call

过大 block size 会抢占计算资源。在 easyHPC 上面交了(vector_add<<<1, 1025>>>(dC, dA, dB, N);)之后,出现了这样的报错

1
2
Error: vector_add kernel at 0_4323.cu(86): invalid configuration argument
yhrun: error: gn07: task 1: Exited with exit code 1

删去设备同步的调用之后,报错信息如下。

1
2
Error: vector_add kernel at 0_4323.cu(86): invalid configuration argument
yhrun: error: gn07: task 0: Exited with exit code 1

What happens if you try to dereference a pointer to device memory in host code

运行时在解引用的地方报 error 了。

1
2
Error: vector_add kernel at 0_4323.cu(85): invalid configuration argument
yhrun: error: gn07: task 0: Exited with exit code 1

What if you try to access host memory from the kernel

编译时报错。

1
2
3
4
5
6
7
0_4323.cu(48): error: identifier "hb" is undefined in device code

1 error detected in the compilation of "/tmp/tmpxft_0000440e_00000000-11_0_4323.cpp2.i".

slurmd[gn07]: execve(): 0_4323.cu.out: No such file or directory
yhrun: error: gn07: task 0: Exited with exit code 2
rm: cannot remove ‘0_4323.cu.out’: No such file or directory

提交代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#include <stdio.h>
#include <math.h>
//#include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG
// This header provides two helper macros for error checking
// See the exercise skeletons and answers for usage examples.

#ifndef COURSE_UTIL_H_
#define COURSE_UTIL_H_

#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__)
#define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__)

inline void __checkErrorFunc(cudaError_t errarg, const char *file,
							 const int line)
{
	if (errarg)
	{
		fprintf(stderr, "Error at %s(%i)\n", file, line);
		exit(EXIT_FAILURE);
	}
}

inline void __checkErrMsgFunc(const char *errstr, const char *file,
							  const int line)
{
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Error: %s at %s(%i): %s\n",
				errstr, file, line, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

#endif

__global__ void vector_add(double *C, const double *A, const double *B, int N)
{
	// Add the kernel code
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	// Do not try to access past the allocated memory
	if (idx < N)
	{
		C[idx] = A[idx] + B[idx];
	}
}

int main(void)
{
	const int N = 20;
	const int ThreadsInBlock = 128;
	double *dA, *dB, *dC;
	double hA[N], hB[N], hC[N];

	for (int i = 0; i < N; ++i)
	{
		hA[i] = (double)i;
		hB[i] = (double)i * i;
	}

	/*
       Add memory allocations and copies. Wrap your runtime function
       calls with CUDA_CHECK( ) macro
    */
	CUDA_CHECK(cudaMalloc((void **)&dA, sizeof(double) * N));
	CUDA_CHECK(cudaMalloc((void **)&dB, sizeof(double) * N));
	CUDA_CHECK(cudaMalloc((void **)&dC, sizeof(double) * N));
	CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double) * N, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double) * N, cudaMemcpyHostToDevice));
	//#error Add the remaining memory allocations and copies

	// Note the maximum size of threads in a block
	dim3 grid, threads;

	//// Add the kernel call here
	vector_add<<<1, 32>>>(dC, dA, dB, N);
	//#error Add the CUDA kernel call

	// Here we add an explicit synchronization so that we catch errors
	// as early as possible. Don't do this in production code!
	cudaStreamSynchronize(NULL);
	CHECK_ERROR_MSG("vector_add kernel");

	//// Copy back the results and free the device memory
	CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double) * N, cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaFree(dA));
	CUDA_CHECK(cudaFree(dB));
	CUDA_CHECK(cudaFree(dC));
	//#error Copy back the results and free the allocated memory

	for (int i = 0; i < N; i++)
		printf("%5.1f\n", hC[i]);

	return 0;
}

CUDA-homework-2

In this exercise we will implement a Jacobi iteration which is a very simple finite-difference scheme. Familiarize yourself with the provided skeleton. Then implement following things:

Write the missing CUDA kernel sweepGPU that implements the same algorithm as the sweepCPU function. Check that the reported averate difference is in the order of the numerical accuracy.

Experiment with different grid and block sizes and compare the execution times.

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
//err_checker.h
// This header provides two helper macros for error checking
// See the exercise skeletons and answers for usage examples.

#ifndef COURSE_UTIL_H_
#define COURSE_UTIL_H_

#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__)
#define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__)

inline void __checkErrorFunc(cudaError_t errarg, const char *file,
							 const int line)
{
	if (errarg)
	{
		fprintf(stderr, "Error at %s(%i)\n", file, line);
		exit(EXIT_FAILURE);
	}
}

inline void __checkErrMsgFunc(const char *errstr, const char *file,
							  const int line)
{
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Error: %s at %s(%i): %s\n",
				errstr, file, line, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

#endif
//jacbi.h
#ifndef EX3_H_
#define EX3_H_

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/zip_iterator.h>

// Helper function prototypes
double compareArrays(const double *a, const double *b, int N);
double diffCPU(const double *a, const double *b, int N);
void sweepCPU(double *phi, const double *phiPrev,
			  const double *source, double h2, int N);

/* -------------------------------------------------------------------------
   EXTRACURRICULAR ACTIVITIES

   This part provides the reduction operation (in this case summation of
   difference of two arrays) using thrust library. Thrust mimics the
   syntax and design of standard template library (STL) of C++. Thrust is
   also a part of CUDA 4 SDK.
   More information can be found from thrust home page
   http://code.google.com/p/thrust/
   ----------------------------------------------------------------------- */

template <typename T>
class square_diff_thr : public thrust::unary_function<thrust::tuple<T, T>, T>
{
public:
	__host__ __device__
		T
		operator()(const thrust::tuple<T, T> &x) const
	{
		return (thrust::get<1>(x) - thrust::get<0>(x)) *
			   (thrust::get<1>(x) - thrust::get<0>(x));
	}
};

template <typename T>
class square_thr : public thrust::unary_function<T, T>
{
public:
	__host__ __device__
		T
		operator()(const T &x) const
	{
		return x * x;
	}
};

template <typename T>
T diffGPU(T *A_d, T *B_d, int N)
{
	typedef thrust::device_ptr<T> FloatIterator;
	typedef thrust::tuple<FloatIterator, FloatIterator> IteratorTuple;
	typedef thrust::zip_iterator<IteratorTuple> ZipIterator;

	thrust::device_ptr<T> A_ptr(A_d);
	thrust::device_ptr<T> B_ptr(B_d);

	ZipIterator first =
		thrust::make_zip_iterator(thrust::make_tuple(A_ptr, B_ptr));
	ZipIterator last =
		thrust::make_zip_iterator(thrust::make_tuple(A_ptr + N * N,
													 B_ptr + N * N));

	T a1 = thrust::transform_reduce(first, last, square_diff_thr<T>(),
									static_cast<T>(0), thrust::plus<T>());
	T a2 = thrust::transform_reduce(B_ptr, B_ptr + N * N,
									square_thr<T>(), static_cast<T>(0),
									thrust::plus<T>());

	return sqrt(a1 / a2);
}

#endif // EX3_H_

//jacobi.cu
#include <time.h>
#include <stdio.h>
//#include "jacobi.h"
//#include "error_checks.h"

// Change this to 0 if CPU reference result is not needed
#define COMPUTE_CPU_REFERENCE 1
#define MAX_ITERATIONS 3000

// CPU kernel
void sweepCPU(double *phi, const double *phiPrev, const double *source,
			  double h2, int N)
{
	int i, j;
	int index, i1, i2, i3, i4;

	for (j = 1; j < N - 1; j++)
	{
		for (i = 1; i < N - 1; i++)
		{
			index = i + j * N;
			i1 = (i - 1) + j * N;
			i2 = (i + 1) + j * N;
			i3 = i + (j - 1) * N;
			i4 = i + (j + 1) * N;
			phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] +
								 phiPrev[i3] + phiPrev[i4] -
								 h2 * source[index]);
		}
	}
}

// GPU kernel
__global__ void sweepGPU(double *phi, const double *phiPrev, const double *source, double h2, int N)
{
	// #error Add here the GPU version of the update routine (see sweepCPU above)
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = blockIdx.y * blockDim.y + threadIdx.y;
	if (i > 0 && j > 0 && i < N - 1 && j < N - 1)
	{ // be careful!
		int index = i + j * N;
		int i1 = (i - 1) + j * N;
		int i2 = (i + 1) + j * N;
		int i3 = i + (j - 1) * N;
		int i4 = i + (j + 1) * N;
		phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + phiPrev[i3] + phiPrev[i4] - h2 * source[index]);
	}
}

double compareArrays(const double *a, const double *b, int N)
{
	double error = 0.0;
	int i;
	for (i = 0; i < N * N; i++)
	{
		error += fabs(a[i] - b[i]);
	}
	return error / (N * N);
}

double diffCPU(const double *phi, const double *phiPrev, int N)
{
	int i;
	double sum = 0;
	double diffsum = 0;

	for (i = 0; i < N * N; i++)
	{
		diffsum += (phi[i] - phiPrev[i]) * (phi[i] - phiPrev[i]);
		sum += phi[i] * phi[i];
	}

	return sqrt(diffsum / sum);
}

int main()
{
	clock_t t1, t2; // Structs for timing
	const int N = 512;
	double h = 1.0 / (N - 1);
	int iterations;
	const double tolerance = 5e-4; // Stopping condition
	int i, j, index;

	const int blocksize = 16;

	double *phi = new double[N * N];
	double *phiPrev = new double[N * N];
	double *source = new double[N * N];
	double *phi_cuda = new double[N * N];

	double *phi_d, *phiPrev_d, *source_d;
	// Size of the arrays in bytes
	const int size = N * N * sizeof(double);
	double diff;

	// Source initialization
	for (i = 0; i < N; i++)
	{
		for (j = 0; j < N; j++)
		{
			double x, y;
			x = (i - N / 2) * h;
			y = (j - N / 2) * h;
			index = j + i * N;
			if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1)
				source[index] = 1e10 * h * h;
			else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1)
				source[index] = -1e10 * h * h;
			else
				source[index] = 0.0;
		}
	}

	CUDA_CHECK(cudaMalloc((void **)&source_d, size));
	CUDA_CHECK(cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice));

	// Reset values to zero
	for (i = 0; i < N; i++)
	{
		for (j = 0; j < N; j++)
		{
			index = j + i * N;
			phi[index] = 0.0;
			phiPrev[index] = 0.0;
		}
	}

	CUDA_CHECK(cudaMalloc((void **)&phi_d, size));
	CUDA_CHECK(cudaMalloc((void **)&phiPrev_d, size));
	CUDA_CHECK(cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice));

	// CPU version
	if (COMPUTE_CPU_REFERENCE)
	{
		t1 = clock();

		// Do sweeps untill difference is under the tolerance
		diff = tolerance * 2;
		iterations = 0;
		while (diff > tolerance && iterations < MAX_ITERATIONS)
		{
			sweepCPU(phiPrev, phi, source, h * h, N);
			sweepCPU(phi, phiPrev, source, h * h, N);

			iterations += 2;
			if (iterations % 100 == 0)
			{
				diff = diffCPU(phi, phiPrev, N);
				printf("%d %g\n", iterations, diff);
			}
		}
		t2 = clock();
		printf("CPU Jacobi: %g ms, %d iterations\n",
			   t2 - t1,
			   iterations);
	}

	// GPU version

	dim3 dimBlock(blocksize, blocksize);
	dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize);

	//do sweeps until diff under tolerance
	diff = tolerance * 2;
	iterations = 0;

	t1 = clock();

	while (diff > tolerance && iterations < MAX_ITERATIONS)
	{
		// See above how the CPU update kernel is called
		// and implement similar calling sequence for the GPU code

		//// Add routines here
		sweepGPU<<<dimGrid, dimBlock>>>(phiPrev_d, phi_d, source_d, h * h, N);
		sweepGPU<<<dimGrid, dimBlock>>>(phi_d, phiPrev_d, source_d, h * h, N);
		//#error Add GPU kernel calls here (see CPU version above)

		iterations += 2;

		if (iterations % 100 == 0)
		{
			// diffGPU is defined in the header file, it uses
			// Thrust library for reduction computation
			diff = diffGPU<double>(phiPrev_d, phi_d, N);
			CHECK_ERROR_MSG("Difference computation");
			printf("%d %g\n", iterations, diff);
		}
	}

	//// Add here the routine to copy back the results
	CUDA_CHECK(cudaMemcpy(phi, phi_d, size, cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaMemcpy(phiPrev, phiPrev_d, size, cudaMemcpyDeviceToHost));
	//#error Copy back the results

	t2 = clock();
	printf("GPU Jacobi: %g ms, %d iterations\n",
		   t2 - t1,
		   iterations);

	//// Add here the clean up code for all allocated CUDA resources
	CUDA_CHECK(cudaFree(phi_d));
	CUDA_CHECK(cudaFree(phiPrev_d));
	CUDA_CHECK(cudaFree(source_d));
	//#error Add here the clean up code

	if (COMPUTE_CPU_REFERENCE)
	{
		printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N));
	}

	delete[] phi;
	delete[] phi_cuda;
	delete[] phiPrev;
	delete[] source;

	return EXIT_SUCCESS;
}