Conclusion
- The five shfl.sync.bfly.b32 + add pairs per warp reduction were replaced by a single redux.sync.add.s32 (a minimal sketch of both variants follows this list).
- In practice it makes almost no difference; perhaps the shuffle sequence is already optimized into the same thing during the PTX-to-SASS backend compilation?
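For concreteness, here is a minimal sketch of the two warp-allreduce variants being compared (a simplified sketch assuming a full-warp mask and a 32-lane warp, not the exact code from reduce_add.cu below):

// Variant 1: butterfly-shuffle allreduce. On sm_80 this lowers to five
// shfl.sync.bfly.b32 + add.s32 pairs, as seen in reduce_add.0.ptx.
__device__ __forceinline__ int warp_allreduce_add_shfl(int val)
{
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_xor_sync(0xffffffff, val, offset, 32);
    return val;
}

// Variant 2: the Ampere warp-reduce intrinsic (CUDA 11+, sm_80+). This lowers
// to a single redux.sync.add.s32, as seen in reduce_add.1.ptx.
__device__ __forceinline__ int warp_allreduce_add_redux(int val)
{
    return __reduce_add_sync(0xffffffff, val);
}

Both return the warp-wide sum in every lane, which is why the specialization in reduce_add.cu is a drop-in replacement.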
Great, that's another low-effort blog post in the bag.
Source code
reduce_add.sh
#!/bin/bash
spack unload -a
spack load [email protected]
spack load [email protected]
spack find --loaded
cat /proc/cpuinfo | grep name | cut -f2 -d: | uniq -c
nvidia-smi
nvcc -gencode=arch=compute_80,code=sm_80 -run reduce_add.cu
nvcc -gencode=arch=compute_80,code=sm_80 -run reduce_add.cu -DUSE_AMPERE_REDUCE
nvcc -gencode=arch=compute_80,code=sm_80 -ptx -o reduce_add.0.ptx reduce_add.cu
nvcc -gencode=arch=compute_80,code=sm_80 -ptx -o reduce_add.1.ptx reduce_add.cu -DUSE_AMPERE_REDUCE
reduce_add.log
-- linux-debian9-zen / [email protected] --------------------------------
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
-- linux-debian9-zen2 / [email protected] -------------------------------
[email protected]
[email protected]
[email protected]
[email protected]
[email protected]
128 AMD EPYC 7542 32-Core Processor
Tue Mar 9 08:01:10 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02 Driver Version: 450.80.02 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 A100-SXM4-40GB Off | 00000000:0F:00.0 Off | 0 |
| N/A 36C P0 61W / 400W | 0MiB / 40537MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 1 A100-SXM4-40GB Off | 00000000:15:00.0 Off | On |
| N/A 34C P0 47W / 400W | 0MiB / 40537MiB | N/A Default |
| | | Enabled |
+-------------------------------+----------------------+----------------------+
| 2 A100-SXM4-40GB Off | 00000000:51:00.0 Off | On |
| N/A 31C P0 43W / 400W | 0MiB / 40537MiB | N/A Default |
| | | Enabled |
+-------------------------------+----------------------+----------------------+
| 3 A100-SXM4-40GB Off | 00000000:54:00.0 Off | On |
| N/A 32C P0 46W / 400W | 0MiB / 40537MiB | N/A Default |
| | | Enabled |
+-------------------------------+----------------------+----------------------+
| 4 A100-SXM4-40GB Off | 00000000:8D:00.0 Off | On |
| N/A 31C P0 46W / 400W | 0MiB / 40537MiB | N/A Default |
| | | Enabled |
+-------------------------------+----------------------+----------------------+
| 5 A100-SXM4-40GB Off | 00000000:92:00.0 Off | On |
| N/A 30C P0 43W / 400W | 0MiB / 40537MiB | N/A Default |
| | | Enabled |
+-------------------------------+----------------------+----------------------+
| 6 A100-SXM4-40GB Off | 00000000:D6:00.0 Off | On |
| N/A 30C P0 43W / 400W | 0MiB / 40537MiB | N/A Default |
| | | Enabled |
+-------------------------------+----------------------+----------------------+
| 7 A100-SXM4-40GB Off | 00000000:DA:00.0 Off | On |
| N/A 32C P0 45W / 400W | 3MiB / 40537MiB | N/A Default |
| | | Enabled |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| MIG devices: |
+------------------+----------------------+-----------+-----------------------+
| GPU GI CI MIG | Memory-Usage | Vol| Shared |
| ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG|
| | | ECC| |
|==================+======================+===========+=======================|
| 7 13 0 0 | 3MiB / 4864MiB | 14 0 | 1 0 0 0 0 |
| | 0MiB / 8191MiB | | |
+------------------+----------------------+-----------+-----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
wuk::host_reduce_add_strided_batched: 27.999231 ms, 3.834898e+10 FLOPS.
wuk::host_reduce_add_strided_batched: 28.010496 ms, 3.833355e+10 FLOPS.
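For reference, the timings line up with the problem size: each pass performs n × batch_count = 2^19 × 2^11 = 2^30 ≈ 1.07e9 additions, and 1.07e9 adds / 28.0 ms ≈ 3.83e10 adds per second, which is the figure the timer reports as FLOPS. The build without USE_AMPERE_REDUCE (first line) and the build with it (second line) differ by well under 0.1%.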
reduce_add.cu
The experiment code. Each thread block reduces one batch of n elements: a strided per-thread sum, a warp-level allreduce, then a second warp-level allreduce over the per-warp partial sums staged in shared memory.
#include <algorithm>
#include <cfloat>
#include <cstdint>
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>
#include <functional>
#include <thrust/device_vector.h>
namespace wuk
{
template <
typename Tcompute,
int WARPSIZE = 32,
typename Tmask = unsigned>
static __device__ __forceinline__ Tcompute
warp_allreduce_add(
Tcompute val,
const Tmask FINAL_MASK = 0xffffffff)
{
#if CUDA_VERSION < 9000
#pragma unroll
for (int offset = WARPSIZE >> 1; offset > 0; offset >>= 1)
val += __shfl_xor(val, offset, WARPSIZE);
#else
#pragma unroll
for (int offset = WARPSIZE >> 1; offset > 0; offset >>= 1)
val += __shfl_xor_sync(FINAL_MASK, val, offset, WARPSIZE);
#endif
return val;
}
#ifdef USE_AMPERE_REDUCE
template <>
__device__ __forceinline__ int
warp_allreduce_add<
int,
32,
unsigned>(
int val,
const unsigned FINAL_MASK)
{
return __reduce_add_sync(FINAL_MASK, val);
}
template <>
__device__ __forceinline__ unsigned
warp_allreduce_add<
unsigned,
32,
unsigned>(
unsigned val,
const unsigned FINAL_MASK)
{
return __reduce_add_sync(FINAL_MASK, val);
}
#endif
template <
typename Tin,
typename Tcompute,
int WARPSIZE = 32>
static __device__ __forceinline__ Tcompute
thread_reduce_add(
int n,
Tin *x,
int incx,
int offset)
{
Tcompute val = (Tcompute)0;
for (int i = offset; i < n; i += incx)
val += (Tcompute)x[i];
return val;
}
template <
typename Tin,
typename Tout,
typename Tcompute,
int BLOCKSIZE,
int WARPSIZE = 32>
static __device__ __forceinline__ void
block_reduce_add(
int n,
Tin *x,
int incx,
Tout *result)
{
static __shared__ Tcompute smem[BLOCKSIZE / WARPSIZE];
{
const Tcompute val = warp_allreduce_add<
Tcompute,
WARPSIZE>(
thread_reduce_add<
Tin,
Tcompute>(
n,
x,
incx * BLOCKSIZE,
threadIdx.x));
if (threadIdx.x % WARPSIZE == 0)
smem[threadIdx.x / WARPSIZE] = val;
}
__syncthreads();
if (threadIdx.x < WARPSIZE)
{
const Tcompute val = warp_allreduce_add<
Tcompute,
WARPSIZE>(
thread_reduce_add<
Tcompute,
Tcompute>(
BLOCKSIZE / WARPSIZE,
smem,
WARPSIZE,
threadIdx.x));
if (threadIdx.x % WARPSIZE == 0)
result[threadIdx.x / WARPSIZE] = (Tout)val;
}
}
template <
typename Tin,
typename Tout,
typename Tcompute,
int BLOCKSIZE,
int WARPSIZE = 32>
static __global__ __launch_bounds__(BLOCKSIZE) void global_reduce_add_strided_batched(
int n, Tin *x, int incx,
int stride_x,
Tout *result)
{
block_reduce_add<
Tin,
Tout,
Tcompute,
BLOCKSIZE,
WARPSIZE>(
n,
x + stride_x * blockIdx.x,
incx,
result + blockIdx.x);
}
template <
typename Tin,
typename Tout,
typename Tcompute,
int WARPSIZE = 32>
void host_reduce_add_strided_batched(
int n,
Tin *x,
int incx,
int stride_x,
Tout *result,
int batch_count,
cudaStream_t stream)
{
const int BLOCKSIZE = 1024;
dim3 blockDim(BLOCKSIZE), gridDim(batch_count);
global_reduce_add_strided_batched<
Tin,
Tout,
Tcompute,
BLOCKSIZE,
WARPSIZE><<<
gridDim,
blockDim,
0,
stream>>>(
n,
x,
incx,
stride_x,
result);
}
} // namespace wuk
void WuK_Timer(
const char *tag,
float flo,
const std::function<void()> &kernel,
int test_time = 9)
{
float min_time = FLT_MAX;
while (test_time--)
{
cudaEvent_t beg, end;
cudaEventCreate(&beg);
cudaEventCreate(&end);
cudaEventRecord(beg);
kernel();
cudaEventRecord(end);
cudaEventSynchronize(beg);
cudaEventSynchronize(end);
float elapsed_time;
cudaEventElapsedTime(&elapsed_time, beg, end);
cudaEventDestroy(beg); // release the per-iteration events
cudaEventDestroy(end);
min_time = std::min(min_time, elapsed_time);
}
std::printf("%s: %f ms, %e FLOPS.\n", tag, min_time, flo * 1e3 / min_time);
}
typedef int Tcompute;
typedef Tcompute Tin;
typedef Tcompute Tout;
const int64_t
n = 1 << 19,
incx = 1,
stride_x = n * incx,
batch_count = 1 << 11;
thrust::device_vector<Tin> x_vector(stride_x * batch_count, 1);
thrust::device_vector<Tout> result_vector(batch_count, 0);
cudaStream_t stream = NULL;
int main()
{
WuK_Timer(
"wuk::host_reduce_add_strided_batched",
1.0 * n * batch_count,
[&] {
wuk::host_reduce_add_strided_batched<
Tin,
Tout,
Tcompute>(
n,
thrust::raw_pointer_cast(x_vector.data()),
incx,
stride_x,
thrust::raw_pointer_cast(result_vector.data()),
batch_count,
stream);
});
}
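Since x_vector is filled with 1s, every entry of result_vector should end up equal to n. A hypothetical correctness check (not part of the original experiment; it assumes an extra #include <thrust/count.h>) could be appended at the end of main():

// Hypothetical check: count how many of the batch_count per-batch results
// equal the expected sum n (all inputs are 1, so each reduction should be n).
const long long ok = thrust::count(result_vector.begin(), result_vector.end(), (Tout)n);
std::printf("%lld / %lld batches correct.\n", ok, (long long)batch_count);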
reduce_add.0.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-28540450
// Cuda compilation tools, release 11.0, V11.0.194
// Based on LLVM 3.4svn
//
.version 7.0
.target sm_80
.address_size 64
// .globl _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1_
// _ZZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem has been demoted
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust6system3cpp3parE[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust8cuda_cub3parE[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders3_10E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust3seqE[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b16thrust6deviceE[1];
.visible .entry _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1_(
.param .align 8 .b8 _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1__param_0[16],
.param .u64 _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1__param_1
)
.maxntid 256, 1, 1
{
.reg .pred %p<8>;
.reg .b32 %r<13>;
.reg .b64 %rd<36>;
ld.param.u64 %rd2, [_ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1__param_0];
ld.param.u32 %r1, [_ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1__param_0+8];
ld.param.u64 %rd9, [_ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1__param_1];
mov.u32 %r3, %ctaid.x;
mul.wide.u32 %rd10, %r3, 512;
sub.s64 %rd11, %rd9, %rd10;
mov.u64 %rd12, 512;
min.u64 %rd1, %rd11, %rd12;
setp.eq.s64 %p1, %rd1, 512;
mov.u32 %r2, %tid.x;
cvt.u64.u32 %rd3, %r2;
@%p1 bra BB0_7;
bra.uni BB0_1;
BB0_7:
add.s64 %rd28, %rd3, %rd10;
shl.b64 %rd29, %rd28, 2;
add.s64 %rd7, %rd2, %rd29;
setp.eq.s64 %p6, %rd7, 0;
@%p6 bra BB0_9;
cvta.to.global.u64 %rd30, %rd7;
st.global.u32 [%rd30], %r1;
BB0_9:
add.s32 %r11, %r2, 256;
cvt.u64.u32 %rd31, %r11;
add.s64 %rd33, %rd31, %rd10;
shl.b64 %rd34, %rd33, 2;
add.s64 %rd8, %rd2, %rd34;
setp.eq.s64 %p7, %rd8, 0;
@%p7 bra BB0_11;
cvta.to.global.u64 %rd35, %rd8;
st.global.u32 [%rd35], %r1;
bra.uni BB0_11;
BB0_1:
cvt.s64.s32 %rd4, %rd1;
setp.ge.u64 %p2, %rd3, %rd4;
@%p2 bra BB0_4;
add.s64 %rd14, %rd3, %rd10;
shl.b64 %rd15, %rd14, 2;
add.s64 %rd16, %rd2, %rd15;
setp.eq.s64 %p3, %rd16, 0;
@%p3 bra BB0_4;
cvta.to.global.u64 %rd22, %rd16;
st.global.u32 [%rd22], %r1;
BB0_4:
add.s32 %r8, %r2, 256;
cvt.u64.u32 %rd5, %r8;
setp.ge.u64 %p4, %rd5, %rd4;
@%p4 bra BB0_11;
add.s64 %rd24, %rd5, %rd10;
shl.b64 %rd25, %rd24, 2;
add.s64 %rd6, %rd2, %rd25;
setp.eq.s64 %p5, %rd6, 0;
@%p5 bra BB0_11;
cvta.to.global.u64 %rd26, %rd6;
st.global.u32 [%rd26], %r1;
BB0_11:
ret;
}
// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv(
)
{
ret;
}
.entry _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0_(
.param .u32 _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_0,
.param .u64 _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_1,
.param .u32 _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_2,
.param .u32 _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_3,
.param .u64 _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_4
)
.maxntid 1024, 1, 1
{
.reg .pred %p<23>;
.reg .b32 %r<120>;
.reg .b64 %rd<15>;
// demoted variable
.shared .align 4 .b8 _ZZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem[128];
ld.param.u32 %r35, [_ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_0];
ld.param.u64 %rd4, [_ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_1];
ld.param.u32 %r38, [_ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_2];
ld.param.u32 %r39, [_ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_3];
ld.param.u64 %rd3, [_ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_4];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r1, %ctaid.x;
mul.lo.s32 %r40, %r1, %r39;
cvt.u64.u32 %rd2, %r40;
shl.b32 %r2, %r38, 10;
mov.u32 %r3, %tid.x;
mov.u32 %r109, 0;
setp.ge.s32 %p1, %r3, %r35;
@%p1 bra BB2_3;
mov.u32 %r107, %r3;
BB2_2:
cvt.s64.s32 %rd5, %r107;
add.s64 %rd6, %rd5, %rd2;
shl.b64 %rd7, %rd6, 2;
add.s64 %rd8, %rd1, %rd7;
ld.global.u32 %r41, [%rd8];
add.s32 %r109, %r41, %r109;
add.s32 %r107, %r107, %r2;
setp.lt.s32 %p2, %r107, %r35;
@%p2 bra BB2_2;
BB2_3:
mov.u32 %r42, 31;
mov.u32 %r43, 16;
mov.u32 %r44, -1;
shfl.sync.bfly.b32 %r45|%p3, %r109, %r43, %r42, %r44;
add.s32 %r46, %r45, %r109;
mov.u32 %r47, 8;
shfl.sync.bfly.b32 %r48|%p4, %r46, %r47, %r42, %r44;
add.s32 %r49, %r48, %r46;
mov.u32 %r50, 4;
shfl.sync.bfly.b32 %r51|%p5, %r49, %r50, %r42, %r44;
add.s32 %r52, %r51, %r49;
mov.u32 %r53, 2;
shfl.sync.bfly.b32 %r54|%p6, %r52, %r53, %r42, %r44;
add.s32 %r55, %r54, %r52;
mov.u32 %r56, 1;
shfl.sync.bfly.b32 %r57|%p7, %r55, %r56, %r42, %r44;
add.s32 %r9, %r57, %r55;
and.b32 %r10, %r3, 31;
setp.ne.s32 %p8, %r10, 0;
@%p8 bra BB2_5;
shr.u32 %r58, %r3, 3;
and.b32 %r59, %r58, 536870908;
mov.u32 %r60, _ZZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem;
add.s32 %r61, %r60, %r59;
st.shared.u32 [%r61], %r9;
BB2_5:
bar.sync 0;
setp.gt.u32 %p9, %r3, 31;
@%p9 bra BB2_18;
mov.u32 %r119, 0;
setp.gt.s32 %p10, %r3, 31;
@%p10 bra BB2_16;
setp.gt.s32 %p11, %r3, 0;
mov.u32 %r119, 0;
add.s32 %r64, %r3, 31;
selp.b32 %r65, %r64, 31, %p11;
sub.s32 %r66, %r65, %r3;
shr.u32 %r67, %r66, 5;
add.s32 %r11, %r67, 1;
and.b32 %r12, %r11, 3;
setp.eq.s32 %p12, %r12, 0;
mov.u32 %r114, %r3;
@%p12 bra BB2_13;
setp.eq.s32 %p13, %r12, 1;
mov.u32 %r113, 0;
mov.u32 %r112, %r3;
@%p13 bra BB2_12;
setp.eq.s32 %p14, %r12, 2;
mov.u32 %r111, 0;
mov.u32 %r110, %r3;
@%p14 bra BB2_11;
shl.b32 %r70, %r3, 2;
mov.u32 %r71, _ZZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem;
add.s32 %r72, %r71, %r70;
ld.shared.u32 %r111, [%r72];
add.s32 %r110, %r3, 32;
BB2_11:
shl.b32 %r73, %r110, 2;
mov.u32 %r74, _ZZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem;
add.s32 %r75, %r74, %r73;
ld.shared.u32 %r76, [%r75];
add.s32 %r113, %r76, %r111;
add.s32 %r112, %r110, 32;
BB2_12:
shl.b32 %r77, %r112, 2;
mov.u32 %r78, _ZZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem;
add.s32 %r79, %r78, %r77;
ld.shared.u32 %r80, [%r79];
add.s32 %r119, %r80, %r113;
add.s32 %r114, %r112, 32;
BB2_13:
setp.lt.u32 %p15, %r11, 4;
@%p15 bra BB2_16;
add.s32 %r117, %r114, -32;
shl.b32 %r81, %r114, 2;
mov.u32 %r82, _ZZN58_INTERNAL_36_tmpxft_000434ec_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem;
add.s32 %r116, %r82, %r81;
BB2_15:
ld.shared.u32 %r83, [%r116];
add.s32 %r84, %r83, %r119;
ld.shared.u32 %r85, [%r116+128];
add.s32 %r86, %r85, %r84;
ld.shared.u32 %r87, [%r116+256];
add.s32 %r88, %r87, %r86;
ld.shared.u32 %r89, [%r116+384];
add.s32 %r119, %r89, %r88;
add.s32 %r116, %r116, 512;
add.s32 %r117, %r117, 128;
setp.lt.s32 %p16, %r117, 0;
@%p16 bra BB2_15;
BB2_16:
shfl.sync.bfly.b32 %r93|%p17, %r119, %r43, %r42, %r44;
add.s32 %r94, %r93, %r119;
shfl.sync.bfly.b32 %r96|%p18, %r94, %r47, %r42, %r44;
add.s32 %r97, %r96, %r94;
shfl.sync.bfly.b32 %r99|%p19, %r97, %r50, %r42, %r44;
add.s32 %r100, %r99, %r97;
shfl.sync.bfly.b32 %r102|%p20, %r100, %r53, %r42, %r44;
add.s32 %r103, %r102, %r100;
shfl.sync.bfly.b32 %r105|%p21, %r103, %r56, %r42, %r44;
add.s32 %r34, %r105, %r103;
@%p8 bra BB2_18;
shr.u32 %r106, %r3, 5;
cvt.u64.u32 %rd9, %r106;
cvt.u64.u32 %rd10, %r1;
add.s64 %rd11, %rd9, %rd10;
cvta.to.global.u64 %rd12, %rd3;
shl.b64 %rd13, %rd11, 2;
add.s64 %rd14, %rd12, %rd13;
st.global.u32 [%rd14], %r34;
BB2_18:
ret;
}
reduce_add.1.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-28540450
// Cuda compilation tools, release 11.0, V11.0.194
// Based on LLVM 3.4svn
//
.version 7.0
.target sm_80
.address_size 64
// .globl _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1_
// _ZZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem has been demoted
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust6system3cpp3parE[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust8cuda_cub3parE[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust12placeholders3_10E[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust3seqE[1];
.global .align 1 .b8 _ZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b16thrust6deviceE[1];
.visible .entry _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1_(
.param .align 8 .b8 _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1__param_0[16],
.param .u64 _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1__param_1
)
.maxntid 256, 1, 1
{
.reg .pred %p<8>;
.reg .b32 %r<13>;
.reg .b64 %rd<36>;
ld.param.u64 %rd2, [_ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1__param_0];
ld.param.u32 %r1, [_ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1__param_0+8];
ld.param.u64 %rd9, [_ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_20__uninitialized_fill7functorINS_10device_ptrIiEEiEEmEES9_mEEvT0_T1__param_1];
mov.u32 %r3, %ctaid.x;
mul.wide.u32 %rd10, %r3, 512;
sub.s64 %rd11, %rd9, %rd10;
mov.u64 %rd12, 512;
min.u64 %rd1, %rd11, %rd12;
setp.eq.s64 %p1, %rd1, 512;
mov.u32 %r2, %tid.x;
cvt.u64.u32 %rd3, %r2;
@%p1 bra BB0_7;
bra.uni BB0_1;
BB0_7:
add.s64 %rd28, %rd3, %rd10;
shl.b64 %rd29, %rd28, 2;
add.s64 %rd7, %rd2, %rd29;
setp.eq.s64 %p6, %rd7, 0;
@%p6 bra BB0_9;
cvta.to.global.u64 %rd30, %rd7;
st.global.u32 [%rd30], %r1;
BB0_9:
add.s32 %r11, %r2, 256;
cvt.u64.u32 %rd31, %r11;
add.s64 %rd33, %rd31, %rd10;
shl.b64 %rd34, %rd33, 2;
add.s64 %rd8, %rd2, %rd34;
setp.eq.s64 %p7, %rd8, 0;
@%p7 bra BB0_11;
cvta.to.global.u64 %rd35, %rd8;
st.global.u32 [%rd35], %r1;
bra.uni BB0_11;
BB0_1:
cvt.s64.s32 %rd4, %rd1;
setp.ge.u64 %p2, %rd3, %rd4;
@%p2 bra BB0_4;
add.s64 %rd14, %rd3, %rd10;
shl.b64 %rd15, %rd14, 2;
add.s64 %rd16, %rd2, %rd15;
setp.eq.s64 %p3, %rd16, 0;
@%p3 bra BB0_4;
cvta.to.global.u64 %rd22, %rd16;
st.global.u32 [%rd22], %r1;
BB0_4:
add.s32 %r8, %r2, 256;
cvt.u64.u32 %rd5, %r8;
setp.ge.u64 %p4, %rd5, %rd4;
@%p4 bra BB0_11;
add.s64 %rd24, %rd5, %rd10;
shl.b64 %rd25, %rd24, 2;
add.s64 %rd6, %rd2, %rd25;
setp.eq.s64 %p5, %rd6, 0;
@%p5 bra BB0_11;
cvta.to.global.u64 %rd26, %rd6;
st.global.u32 [%rd26], %r1;
BB0_11:
ret;
}
// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv(
)
{
ret;
}
.entry _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0_(
.param .u32 _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_0,
.param .u64 _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_1,
.param .u32 _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_2,
.param .u32 _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_3,
.param .u64 _ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_4
)
.maxntid 1024, 1, 1
{
.reg .pred %p<13>;
.reg .b32 %r<90>;
.reg .b64 %rd<15>;
// demoted variable
.shared .align 4 .b8 _ZZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem[128];
ld.param.u32 %r35, [_ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_0];
ld.param.u64 %rd4, [_ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_1];
ld.param.u32 %r38, [_ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_2];
ld.param.u32 %r39, [_ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_3];
ld.param.u64 %rd3, [_ZN3wuk33global_reduce_add_strided_batchedIiiiLi1024ELi32EEEviPT_iiPT0__param_4];
cvta.to.global.u64 %rd1, %rd4;
mov.u32 %r1, %ctaid.x;
mul.lo.s32 %r40, %r1, %r39;
cvt.u64.u32 %rd2, %r40;
shl.b32 %r2, %r38, 10;
mov.u32 %r3, %tid.x;
mov.u32 %r79, 0;
setp.ge.s32 %p1, %r3, %r35;
@%p1 bra BB2_3;
mov.u32 %r77, %r3;
BB2_2:
cvt.s64.s32 %rd5, %r77;
add.s64 %rd6, %rd5, %rd2;
shl.b64 %rd7, %rd6, 2;
add.s64 %rd8, %rd1, %rd7;
ld.global.u32 %r41, [%rd8];
add.s32 %r79, %r41, %r79;
add.s32 %r77, %r77, %r2;
setp.lt.s32 %p2, %r77, %r35;
@%p2 bra BB2_2;
BB2_3:
mov.u32 %r42, -1;
redux.sync.add.s32 %r9, %r79, %r42;
and.b32 %r10, %r3, 31;
setp.ne.s32 %p3, %r10, 0;
@%p3 bra BB2_5;
shr.u32 %r43, %r3, 3;
and.b32 %r44, %r43, 536870908;
mov.u32 %r45, _ZZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem;
add.s32 %r46, %r45, %r44;
st.shared.u32 [%r46], %r9;
BB2_5:
bar.sync 0;
setp.gt.u32 %p4, %r3, 31;
@%p4 bra BB2_18;
mov.u32 %r89, 0;
setp.gt.s32 %p5, %r3, 31;
@%p5 bra BB2_16;
setp.gt.s32 %p6, %r3, 0;
mov.u32 %r89, 0;
add.s32 %r49, %r3, 31;
selp.b32 %r50, %r49, 31, %p6;
sub.s32 %r51, %r50, %r3;
shr.u32 %r52, %r51, 5;
add.s32 %r11, %r52, 1;
and.b32 %r12, %r11, 3;
setp.eq.s32 %p7, %r12, 0;
mov.u32 %r84, %r3;
@%p7 bra BB2_13;
setp.eq.s32 %p8, %r12, 1;
mov.u32 %r83, 0;
mov.u32 %r82, %r3;
@%p8 bra BB2_12;
setp.eq.s32 %p9, %r12, 2;
mov.u32 %r81, 0;
mov.u32 %r80, %r3;
@%p9 bra BB2_11;
shl.b32 %r55, %r3, 2;
mov.u32 %r56, _ZZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem;
add.s32 %r57, %r56, %r55;
ld.shared.u32 %r81, [%r57];
add.s32 %r80, %r3, 32;
BB2_11:
shl.b32 %r58, %r80, 2;
mov.u32 %r59, _ZZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem;
add.s32 %r60, %r59, %r58;
ld.shared.u32 %r61, [%r60];
add.s32 %r83, %r61, %r81;
add.s32 %r82, %r80, 32;
BB2_12:
shl.b32 %r62, %r82, 2;
mov.u32 %r63, _ZZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem;
add.s32 %r64, %r63, %r62;
ld.shared.u32 %r65, [%r64];
add.s32 %r89, %r65, %r83;
add.s32 %r84, %r82, 32;
BB2_13:
setp.lt.u32 %p10, %r11, 4;
@%p10 bra BB2_16;
add.s32 %r87, %r84, -32;
shl.b32 %r66, %r84, 2;
mov.u32 %r67, _ZZN58_INTERNAL_36_tmpxft_00043bfa_00000000_7_a_cpp1_ii_1e97e9b13wuk16block_reduce_addIiiiLi1024ELi32EEEviPT_iPT0_E4smem;
add.s32 %r86, %r67, %r66;
BB2_15:
ld.shared.u32 %r68, [%r86];
add.s32 %r69, %r68, %r89;
ld.shared.u32 %r70, [%r86+128];
add.s32 %r71, %r70, %r69;
ld.shared.u32 %r72, [%r86+256];
add.s32 %r73, %r72, %r71;
ld.shared.u32 %r74, [%r86+384];
add.s32 %r89, %r74, %r73;
add.s32 %r86, %r86, 512;
add.s32 %r87, %r87, 128;
setp.lt.s32 %p11, %r87, 0;
@%p11 bra BB2_15;
BB2_16:
redux.sync.add.s32 %r34, %r89, %r42;
@%p3 bra BB2_18;
shr.u32 %r76, %r3, 5;
cvt.u64.u32 %rd9, %r76;
cvt.u64.u32 %rd10, %r1;
add.s64 %rd11, %rd9, %rd10;
cvta.to.global.u64 %rd12, %rd3;
shl.b64 %rd13, %rd11, 2;
add.s64 %rd14, %rd12, %rd13;
st.global.u32 [%rd14], %r34;
BB2_18:
ret;
}