Nicolás Wolovick, 20200608
ma4_error prone.cu
) 1 #include <cuda.h>
2
3 #define N (1<<28)
4 #define BLOCK_SIZE 128
5
6 __device__ float a[N], b[N], c[N], d[N];
7
8 __global__ void ma4(void) {
9 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
10 d[gtid] = a[gtid]*b[gtid]+c[gtid];
11 }
12
13 int main(void) {
14 ma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();
15 cudaDeviceSynchronize();
16 return 0;
17 }
De 10 líneas de código (4 realmente útiles) y muchos errores.
Usaremos las herramientas disponibles y plantearemos buenas prácticas.
checkCudaErrors()
en CADA llamado de la biblioteca CUDA.getLastCudaError("info")
.nvprof ./a.out
nvidia-smi
.Calcular "algo" no es calcular bien.
Versión extremadamente light de lo que CS conoce como corrección.
Resultado constante a pesar de:
CUDA_VISIBLE_DEVICES=1 ./a.out
.hashcat
).ma4.cu
Kernels
1 #include <cuda.h>
2 #include <stdio.h>
3
4 #include "helper_cuda.h"
5
6 #define N (1<<28)
7 #define BLOCK_SIZE 128
8
9 __device__ float a[N], b[N], c[N], d[N];
10 float ha[N], hb[N], hc[N], hd[N];
11
12 __global__ void set(void) {
13 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
14 a[gtid] = (float)blockIdx.x;
15 b[gtid] = (float)threadIdx.x;
16 c[gtid] = (float)threadIdx.x+blockIdx.x;
17 }
18
19 __global__ void ma4(void) {
20 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
21 d[gtid] = a[gtid]*b[gtid]+c[gtid];
22 }
ma4.cu
Host code
1 int main(void) {
2 set<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();
3 getLastCudaError("set() kernel failed");
4 ma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();
5 getLastCudaError("ma4() kernel failed");
6 checkCudaErrors(cudaDeviceSynchronize());
7 checkCudaErrors(cudaMemcpyFromSymbol(ha, a, N*sizeof(float)));
8 checkCudaErrors(cudaMemcpyFromSymbol(hb, b, N*sizeof(float)));
9 checkCudaErrors(cudaMemcpyFromSymbol(hc, c, N*sizeof(float)));
10 checkCudaErrors(cudaMemcpyFromSymbol(hd, d, N*sizeof(float)));
11 for (size_t i=0; i<N; ++i)
12 if (hd[i] != ha[i]*hb[i]+hc[i]) {
13 printf("%d, %f!=%f*%f+%f\n", i, hd[i], ha[i], hb[i], hc[i]);
14 break;
15 }
16
17 return 0;
18 }
cuda-gdb
1 $ nvcc -g -arch=sm_61 --compiler-options "-O3 -mcmodel=medium" ma4.cu
2 $ cuda-gdb ./a.out
3 NVIDIA (R) CUDA Debugger
4 10.1 release
5 GNU gdb (GDB) 7.12
6 (cuda-gdb) l
7 12 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
8 13 a[gtid] = (float)blockIdx.x;
9 14 b[gtid] = (float)threadIdx.x;
10 15 c[gtid] = (float)threadIdx.x+blockIdx.x;
11 16 d[gtid] = (float)threadIdx.x*blockIdx.x;;
12 17 }
13 18
14 19 __global__ void ma4(float *a, float *b, float *c, float *d) {
15 20 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
16 21 d[gtid] = a[gtid]*b[gtid]+c[gtid];
17 (cuda-gdb) break ma4
18 (cuda-gdb) run
19 [Switching focus to CUDA kernel 0, grid 10, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 3, lane 0]
20
21 Breakpoint 1, 0x00000002009f50c8 in ma4()<<<(2097152,1,1),(128,1,1)>>> ()
22 (cuda-gdb) step
23 Single stepping until exit from function _Z3ma4v, which has no line number information.
24 [Switching focus to CUDA kernel 0, grid 10, block (0,0,0), thread (32,0,0), device 0, sm 0, warp 0, lane 0]
25
26 Breakpoint 1, 0x00000002009f50c8 in ma4()<<<(2097152,1,1),(128,1,1)>>> ()
27 (cuda-gdb) step
28 Single stepping until exit from function _Z3ma4v, which has no line number information.
29 [Switching focus to CUDA kernel 0, grid 10, block (0,0,0), thread (64,0,0), device 0, sm 0, warp 1, lane 0]
30
31 Breakpoint 1, 0x00000002009f50c8 in ma4()<<<(2097152,1,1),(128,1,1)>>> ()
32 (cuda-gdb)
Full-fledged debugger!
cuda-memcheck
AKA, el "valgrind"+"helgrind" de la GPU.
1 $ cuda-memcheck --tool memcheck --leak-check full ./a.out
2 ========= CUDA-MEMCHECK
3 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
4 ========= LEAK SUMMARY: 0 bytes leaked in 0 allocations
5 ========= ERROR SUMMARY: 0 errors
6 $ cuda-memcheck --tool racecheck --racecheck-report all ./a.out
7 ========= CUDA-MEMCHECK
8 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
9 ========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
10 $ cuda-memcheck --tool synccheck ./a.out
11 ========= CUDA-MEMCHECK
12 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
13 ========= ERROR SUMMARY: 0 errors
14 $ cuda-memcheck --tool initcheck ./a.out
15 ========= CUDA-MEMCHECK
16 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
17 ========= ERROR SUMMARY: 0 errors
cuda-memcheck
Compilar con -G
y -lineinfo
.
Revisa errores de memoria:
- Acceso a arreglos fuera de límites (shared & global).
- Desbordamientos de pila.
- Memoria dinámica pedida y no liberada.
- Memoria dinámica liberadas dos veces.
Revisa errores de concurrencia
- Potenciales errores con memoria compartida de bloque y global (posibles falsos positivos).
Revisa problemas de bar.sync
en código divergente.
Revisa problemas de memoria global no-inicializada.
Además muestra sin instrumentar el código :
- errores de configuración de lanzamiento.
¡Cuidado el código corre 10 veces más lento!
nvprof
La herramienta definitiva para profiling.
Desde lo básico
1 $ nvprof --unified-memory-profiling off ./a.out
2 Type Time(%) Time Calls Avg Min Max Name
3 GPU activities: 94.33% 341.58ms 1 341.58ms 341.58ms 341.58ms set(float*, float*, float*, float*)
4 5.67% 20.518ms 1 20.518ms 20.518ms 20.518ms ma4(float*, float*, float*, float*)
5 API calls: 48.36% 464.15ms 4 116.04ms 16.995us 464.09ms cudaMallocManaged
6 37.72% 362.10ms 1 362.10ms 362.10ms 362.10ms cudaDeviceSynchronize
7 13.78% 132.23ms 4 33.057ms 32.244ms 34.278ms cudaFree
8 0.09% 849.05us 2 424.52us 408.82us 440.23us cuDeviceTotalMem
9 0.05% 437.32us 194 2.2540us 183ns 92.850us cuDeviceGetAttribute
10 0.01% 64.717us 2 32.358us 19.490us 45.227us cuDeviceGetName
11 0.00% 40.724us 2 20.362us 7.9810us 32.743us cudaLaunchKernel
12 0.00% 9.7530us 2 4.8760us 2.4670us 7.2860us cuDeviceGetPCIBusId
13 0.00% 2.7500us 2 1.3750us 614ns 2.1360us cuDeviceGetCount
14 0.00% 2.0390us 4 509ns 223ns 1.0400us cuDeviceGet
15 0.00% 572ns 2 286ns 250ns 322ns cuDeviceGetUuid
16 0.00% 501ns 2 250ns 166ns 335ns cudaGetLastError
Esto ya es realmente útil: da promedio y valores extremos, tanto de kernels como de API calls.
1 $ nvprof --unified-memory-profiling off --print-gpu-summary ./a.out
2 ==1968864== NVPROF is profiling process 1968864, command: ./a.out
3 ==1968864== Profiling application: ./a.out
4 ==1968864== Profiling result:
5 Type Time(%) Time Calls Avg Min Max Name
6 GPU activities: 94.15% 331.65ms 1 331.65ms 331.65ms 331.65ms set(float*, float*, float*, float*)
7 5.85% 20.591ms 1 20.591ms 20.591ms 20.591ms ma4(float*, float*, float*, float*)
Útil para:
--unified-memory-profiling
) 1 $ nvprof --unified-memory-profiling off --print-api-trace ./a.out
2 Start Duration Name
3 140.38ms 4.7270us cuDeviceGetPCIBusId
4 216.60ms 4.6880us cuDeviceGetPCIBusId
5 222.24ms 1.0580us cuDeviceGetCount
6 222.25ms 502ns cuDeviceGet
7 222.25ms 795ns cuDeviceGetAttribute
8 222.26ms 309ns cuDeviceGetAttribute
9 222.26ms 714ns cuDeviceGetAttribute
10 222.27ms 259ns cuDeviceGet
11 222.27ms 321ns cuDeviceGetAttribute
12 222.27ms 192ns cuDeviceGetAttribute
13 222.27ms 320ns cuDeviceGetAttribute
14 222.32ms 297ns cuDeviceGetCount
15 222.33ms 195ns cuDeviceGet
16 222.33ms 33.517us cuDeviceGetName
17 222.36ms 423.24us cuDeviceTotalMem
18 222.79ms 357ns cuDeviceGetAttribute
19 222.79ms 198ns cuDeviceGetAttribute
20 222.79ms 245ns cuDeviceGetAttribute
21 222.79ms 200ns cuDeviceGetAttribute
22 222.79ms 210ns cuDeviceGetAttribute
23 222.79ms 22.280us cuDeviceGetAttribute
24 222.81ms 334ns cuDeviceGetAttribute
25 222.81ms 195ns cuDeviceGetAttribute
26 222.81ms 196ns cuDeviceGetAttribute
27 222.81ms 207ns cuDeviceGetAttribute
28 222.81ms 776ns cuDeviceGetAttribute
29 222.81ms 200ns cuDeviceGetAttribute
30 222.81ms 209ns cuDeviceGetAttribute
31 222.81ms 198ns cuDeviceGetAttribute
32 222.81ms 195ns cuDeviceGetAttribute
33 222.81ms 195ns cuDeviceGetAttribute
34 222.82ms 194ns cuDeviceGetAttribute
35 222.82ms 191ns cuDeviceGetAttribute
36 222.82ms 198ns cuDeviceGetAttribute
37 222.82ms 197ns cuDeviceGetAttribute
38 222.82ms 194ns cuDeviceGetAttribute
39 222.82ms 193ns cuDeviceGetAttribute
40 222.82ms 327ns cuDeviceGetAttribute
41 222.82ms 192ns cuDeviceGetAttribute
42 222.82ms 192ns cuDeviceGetAttribute
43 222.82ms 195ns cuDeviceGetAttribute
44 222.82ms 191ns cuDeviceGetAttribute
45 222.82ms 309ns cuDeviceGetAttribute
46 222.82ms 193ns cuDeviceGetAttribute
47 222.82ms 197ns cuDeviceGetAttribute
48 222.82ms 193ns cuDeviceGetAttribute
49 222.82ms 197ns cuDeviceGetAttribute
50 222.82ms 197ns cuDeviceGetAttribute
51 222.82ms 195ns cuDeviceGetAttribute
52 222.82ms 197ns cuDeviceGetAttribute
53 222.82ms 195ns cuDeviceGetAttribute
54 222.83ms 193ns cuDeviceGetAttribute
55 222.83ms 197ns cuDeviceGetAttribute
56 222.83ms 197ns cuDeviceGetAttribute
57 222.83ms 193ns cuDeviceGetAttribute
58 222.83ms 194ns cuDeviceGetAttribute
59 222.83ms 192ns cuDeviceGetAttribute
60 222.83ms 194ns cuDeviceGetAttribute
61 222.83ms 192ns cuDeviceGetAttribute
62 222.83ms 192ns cuDeviceGetAttribute
63 222.83ms 195ns cuDeviceGetAttribute
64 222.83ms 191ns cuDeviceGetAttribute
65 222.83ms 191ns cuDeviceGetAttribute
66 222.83ms 208ns cuDeviceGetAttribute
67 222.83ms 197ns cuDeviceGetAttribute
68 222.83ms 195ns cuDeviceGetAttribute
69 222.83ms 197ns cuDeviceGetAttribute
70 222.83ms 192ns cuDeviceGetAttribute
71 222.83ms 197ns cuDeviceGetAttribute
72 222.83ms 205ns cuDeviceGetAttribute
73 222.83ms 104.18us cuDeviceGetAttribute
74 222.94ms 242ns cuDeviceGetAttribute
75 222.94ms 205ns cuDeviceGetAttribute
76 222.94ms 196ns cuDeviceGetAttribute
77 222.94ms 203ns cuDeviceGetAttribute
78 222.94ms 210ns cuDeviceGetAttribute
79 222.94ms 314ns cuDeviceGetAttribute
80 222.94ms 386ns cuDeviceGetAttribute
81 222.94ms 308ns cuDeviceGetAttribute
82 222.94ms 192ns cuDeviceGetAttribute
83 222.94ms 187ns cuDeviceGetAttribute
84 222.94ms 189ns cuDeviceGetAttribute
85 222.94ms 194ns cuDeviceGetAttribute
86 222.94ms 352ns cuDeviceGetAttribute
87 222.94ms 192ns cuDeviceGetAttribute
88 222.94ms 190ns cuDeviceGetAttribute
89 222.95ms 193ns cuDeviceGetAttribute
90 222.95ms 338ns cuDeviceGetAttribute
91 222.95ms 200ns cuDeviceGetAttribute
92 222.95ms 204ns cuDeviceGetAttribute
93 222.95ms 187ns cuDeviceGetAttribute
94 222.95ms 98.586us cuDeviceGetAttribute
95 223.05ms 219ns cuDeviceGetAttribute
96 223.05ms 220ns cuDeviceGetAttribute
97 223.05ms 196ns cuDeviceGetAttribute
98 223.05ms 197ns cuDeviceGetAttribute
99 223.05ms 196ns cuDeviceGetAttribute
100 223.05ms 186ns cuDeviceGetAttribute
101 223.05ms 195ns cuDeviceGetAttribute
102 223.05ms 2.3530us cuDeviceGetAttribute
103 223.05ms 324ns cuDeviceGetAttribute
104 223.05ms 191ns cuDeviceGetAttribute
105 223.05ms 195ns cuDeviceGetAttribute
106 223.05ms 1.6250us cuDeviceGetAttribute
107 223.06ms 190ns cuDeviceGetAttribute
108 223.06ms 287ns cuDeviceGetAttribute
109 223.06ms 227ns cuDeviceGetAttribute
110 223.06ms 355ns cuDeviceGetAttribute
111 223.06ms 190ns cuDeviceGetAttribute
112 223.06ms 282ns cuDeviceGetUuid
113 223.06ms 241ns cuDeviceGet
114 223.06ms 22.522us cuDeviceGetName
115 223.08ms 416.55us cuDeviceTotalMem
116 223.50ms 277ns cuDeviceGetAttribute
117 223.50ms 190ns cuDeviceGetAttribute
118 223.50ms 212ns cuDeviceGetAttribute
119 223.50ms 228ns cuDeviceGetAttribute
120 223.50ms 241ns cuDeviceGetAttribute
121 223.50ms 18.890us cuDeviceGetAttribute
122 223.52ms 257ns cuDeviceGetAttribute
123 223.52ms 188ns cuDeviceGetAttribute
124 223.52ms 196ns cuDeviceGetAttribute
125 223.52ms 219ns cuDeviceGetAttribute
126 223.52ms 265ns cuDeviceGetAttribute
127 223.52ms 188ns cuDeviceGetAttribute
128 223.52ms 202ns cuDeviceGetAttribute
129 223.53ms 184ns cuDeviceGetAttribute
130 223.53ms 187ns cuDeviceGetAttribute
131 223.53ms 185ns cuDeviceGetAttribute
132 223.53ms 186ns cuDeviceGetAttribute
133 223.53ms 189ns cuDeviceGetAttribute
134 223.53ms 323ns cuDeviceGetAttribute
135 223.53ms 232ns cuDeviceGetAttribute
136 223.53ms 200ns cuDeviceGetAttribute
137 223.53ms 188ns cuDeviceGetAttribute
138 223.53ms 190ns cuDeviceGetAttribute
139 223.53ms 195ns cuDeviceGetAttribute
140 223.53ms 191ns cuDeviceGetAttribute
141 223.53ms 194ns cuDeviceGetAttribute
142 223.53ms 200ns cuDeviceGetAttribute
143 223.53ms 319ns cuDeviceGetAttribute
144 223.53ms 188ns cuDeviceGetAttribute
145 223.53ms 190ns cuDeviceGetAttribute
146 223.53ms 192ns cuDeviceGetAttribute
147 223.53ms 186ns cuDeviceGetAttribute
148 223.53ms 190ns cuDeviceGetAttribute
149 223.53ms 187ns cuDeviceGetAttribute
150 223.53ms 187ns cuDeviceGetAttribute
151 223.54ms 185ns cuDeviceGetAttribute
152 223.54ms 185ns cuDeviceGetAttribute
153 223.54ms 186ns cuDeviceGetAttribute
154 223.54ms 185ns cuDeviceGetAttribute
155 223.54ms 190ns cuDeviceGetAttribute
156 223.54ms 181ns cuDeviceGetAttribute
157 223.54ms 186ns cuDeviceGetAttribute
158 223.54ms 187ns cuDeviceGetAttribute
159 223.54ms 186ns cuDeviceGetAttribute
160 223.54ms 187ns cuDeviceGetAttribute
161 223.54ms 185ns cuDeviceGetAttribute
162 223.54ms 188ns cuDeviceGetAttribute
163 223.54ms 190ns cuDeviceGetAttribute
164 223.54ms 185ns cuDeviceGetAttribute
165 223.54ms 188ns cuDeviceGetAttribute
166 223.54ms 189ns cuDeviceGetAttribute
167 223.54ms 194ns cuDeviceGetAttribute
168 223.54ms 195ns cuDeviceGetAttribute
169 223.54ms 184ns cuDeviceGetAttribute
170 223.54ms 190ns cuDeviceGetAttribute
171 223.54ms 92.752us cuDeviceGetAttribute
172 223.64ms 235ns cuDeviceGetAttribute
173 223.64ms 203ns cuDeviceGetAttribute
174 223.64ms 197ns cuDeviceGetAttribute
175 223.64ms 200ns cuDeviceGetAttribute
176 223.64ms 207ns cuDeviceGetAttribute
177 223.64ms 313ns cuDeviceGetAttribute
178 223.64ms 224ns cuDeviceGetAttribute
179 223.64ms 195ns cuDeviceGetAttribute
180 223.64ms 186ns cuDeviceGetAttribute
181 223.64ms 188ns cuDeviceGetAttribute
182 223.64ms 187ns cuDeviceGetAttribute
183 223.64ms 255ns cuDeviceGetAttribute
184 223.64ms 192ns cuDeviceGetAttribute
185 223.64ms 185ns cuDeviceGetAttribute
186 223.64ms 191ns cuDeviceGetAttribute
187 223.64ms 188ns cuDeviceGetAttribute
188 223.64ms 190ns cuDeviceGetAttribute
189 223.64ms 191ns cuDeviceGetAttribute
190 223.64ms 191ns cuDeviceGetAttribute
191 223.65ms 192ns cuDeviceGetAttribute
192 223.65ms 86.829us cuDeviceGetAttribute
193 223.73ms 223ns cuDeviceGetAttribute
194 223.73ms 324ns cuDeviceGetAttribute
195 223.73ms 195ns cuDeviceGetAttribute
196 223.73ms 191ns cuDeviceGetAttribute
197 223.74ms 190ns cuDeviceGetAttribute
198 223.74ms 188ns cuDeviceGetAttribute
199 223.74ms 193ns cuDeviceGetAttribute
200 223.74ms 614ns cuDeviceGetAttribute
201 223.74ms 390ns cuDeviceGetAttribute
202 223.74ms 197ns cuDeviceGetAttribute
203 223.74ms 189ns cuDeviceGetAttribute
204 223.74ms 396ns cuDeviceGetAttribute
205 223.74ms 211ns cuDeviceGetAttribute
206 223.74ms 223ns cuDeviceGetAttribute
207 223.74ms 208ns cuDeviceGetAttribute
208 223.74ms 189ns cuDeviceGetAttribute
209 223.74ms 192ns cuDeviceGetAttribute
210 223.74ms 226ns cuDeviceGetUuid
211 223.76ms 462.86ms cudaMallocManaged
212 686.62ms 21.517us cudaMallocManaged
213 686.65ms 15.671us cudaMallocManaged
214 686.66ms 14.889us cudaMallocManaged
215 686.68ms 32.665us cudaLaunchKernel (set(float*, float*, float*, float*) [213])
216 686.71ms 319ns cudaGetLastError
217 686.71ms 7.7970us cudaLaunchKernel (ma4(float*, float*, float*, float*) [215])
218 686.72ms 158ns cudaGetLastError
219 686.72ms 354.17ms cudaDeviceSynchronize
220 1.04090s 32.352ms cudaFree
221 1.07325s 32.345ms cudaFree
222 1.10559s 32.072ms cudaFree
223 1.13767s 32.855ms cudaFree
1 $ nvprof --unified-memory-profiling off --kernels set --events gst_inst_32bit ./a.out
2 Invocations Event Name Min Max Avg Total
3 Device "GeForce GTX 1070 (0)"
4 Kernel: set(float*, float*, float*, float*)
5 1 gst_inst_32bit 1073741824 1073741824 1073741824 1073741824
6 nicolasw@zx81:~$ nvprof --unified-memory-profiling off --kernels set --events gld_inst_32bit ./a.out
7 Invocations Event Name Min Max Avg Total
8 Device "GeForce GTX 1070 (0)"
9 Kernel: set(float*, float*, float*, float*)
10 1 gld_inst_32bit 0 0 0 0
events
y kernels
.all
.1 $ nvprof --unified-memory-profiling off --kernels ma4 --metrics ipc ./a.out
2 Invocations Metric Name Metric Description Min Max Avg
3 Device "GeForce GTX 1070 (0)"
4 Kernel: ma4(float*, float*, float*, float*)
5 1 ipc Executed IPC 0.309164 0.309164 0.309164
A partir de eventos se generan métricas (aka calculitos interesantes).
Cuidado, tanto
¡Re-ejecutan los kernels de 5 a 30 veces según el caso!
Cantidad limitada de hardware counters, la solución es multiplexar en el tiempo.
checkCudaErrors()
.getLastCudaError()
.nvprof
para ver que todos los kernels esté ejecutando.cuda-memcheck
.nvvp
, "el VTune de NVIDIA".Table of Contents | t |
---|---|
Exposé | ESC |
Full screen slides | e |
Presenter View | p |
Source Files | s |
Slide Numbers | n |
Toggle screen blanking | b |
Show/hide slide context | c |
Notes | 2 |
Help | h |