sgemm
Nicolás Wolovick, 20160526
ma4_error_prone.cu
1 #include <cuda.h>
2
3 #define N (1<<28)                        // 2^28 elements: each float array occupies 1 GiB
4 #define BLOCK_SIZE 128
5
6 __device__ float a[N], b[N], c[N], d[N]; // NOTE(review): device arrays are never initialized -- the kernel reads garbage
7
8 __global__ void ma4(void) {
9 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x; // no bounds guard: relies on N being an exact multiple of BLOCK_SIZE
10 d[gtid] = a[gtid]*b[gtid]+c[gtid];      // multiply-add over uninitialized inputs: it "runs" but computes nothing meaningful
11 }
12
13 int main(void) {
14 ma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();    // NOTE(review): launch never checked -- no cudaGetLastError() after the launch
15 cudaDeviceSynchronize();                // NOTE(review): return value discarded -- asynchronous kernel faults pass silently
16 return 0;
17 }
De 10 líneas de código (4 realmente útiles) y muchos errores.
Usaremos las herramientas disponibles y plantearemos buenas prácticas.
checkCudaErrors()
en CADA llamado de la biblioteca CUDA.
getLastCudaError("info")
nvprof ./a.out
nvidia-smi
Calcular "algo" no es calcular bien.
Versión extremadamente light de lo que los CS conocemos como corrección.
Resultado constante a pesar de:
CUDA_VISIBLE_DEVICES=1 ./a.out
ma4.cu
Kernels
1 #include <cuda.h>
2 #include <stdio.h>
3
4 #include "helper_cuda.h"                 // provides checkCudaErrors() / getLastCudaError()
5
6 #define N (1<<28)                        // 2^28 elements per array
7 #define BLOCK_SIZE 128
8
9 __device__ float a[N], b[N], c[N], d[N]; // 4 x 1 GiB in device global memory
10 float ha[N], hb[N], hc[N], hd[N];       // 4 GiB of host statics -- hence -mcmodel=medium in the build line below
11
12 __global__ void set(void) {             // deterministic initialization so the host can verify the result
13 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
14 a[gtid] = (float)blockIdx.x;            // grid has 2^21 blocks, so the index fits exactly in a float (< 2^24)
15 b[gtid] = (float)threadIdx.x;
16 c[gtid] = (float)threadIdx.x+blockIdx.x;
17 }
18
19 __global__ void ma4(void) {             // d = a*b + c, one element per thread; no bounds guard (N % BLOCK_SIZE == 0)
20 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
21 d[gtid] = a[gtid]*b[gtid]+c[gtid];      // NOTE(review): nvcc likely contracts this to an FMA (one rounding); the host reference rounds twice, so last-ulp mismatches are expected -- confirm with -fmad=false
22 }
ma4.cu
Host code
1 int main(void) {                                           // Host driver: initialize, compute d = a*b + c, copy back, verify.
2 set<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();                       // initialize a, b, c on the device
3 getLastCudaError("set() kernel failed");                   // catches launch-configuration errors
4 ma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();                       // d = a*b + c, one element per thread
5 getLastCudaError("ma4() kernel failed");
6 checkCudaErrors(cudaDeviceSynchronize());                  // surfaces asynchronous in-kernel errors
7 checkCudaErrors(cudaMemcpyFromSymbol(ha, a, N*sizeof(float)));
8 checkCudaErrors(cudaMemcpyFromSymbol(hb, b, N*sizeof(float)));
9 checkCudaErrors(cudaMemcpyFromSymbol(hc, c, N*sizeof(float)));
10 checkCudaErrors(cudaMemcpyFromSymbol(hd, d, N*sizeof(float)));
11 for (size_t i=0; i<N; ++i)                                 // host-side verification against a float reference
12 if (hd[i] != ha[i]*hb[i]+hc[i]) {                          // exact compare: device FMA vs. host mul+add can differ in the last ulp
13 printf("%zu, %f!=%f*%f+%f\n", i, hd[i], ha[i], hb[i], hc[i]); // fixed: %zu for size_t (was %d -- undefined behavior)
14 break;                                                     // report only the first mismatch
15 }
16
17 return 0;
18 }
cuda-gdb
1 $ nvcc -g -arch=sm_52 --ptxas-options=-v --compiler-options "-O3 -mcmodel=medium" ma4.cu
2 $ cuda-gdb ./a.out
3 NVIDIA (R) CUDA Debugger
4 7.5 release
5 ...
6 (cuda-gdb) l
7 16 c[gtid] = (float)threadIdx.x+blockIdx.x;
8 17 }
9 18
10 19 __global__ void ma4(void) {
11 20 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
12 21 d[gtid] = a[gtid]*b[gtid]+c[gtid];
13 22 }
14 (cuda-gdb) break ma4()
15 (cuda-gdb) run
16 [Switching focus to CUDA kernel 0, grid 10, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 3, lane 0]
17
18 Breakpoint 1, 0x00000002009f50c8 in ma4()<<<(2097152,1,1),(128,1,1)>>> ()
19 (cuda-gdb) step
20 Single stepping until exit from function _Z3ma4v, which has no line number information.
21 [Switching focus to CUDA kernel 0, grid 10, block (0,0,0), thread (32,0,0), device 0, sm 0, warp 0, lane 0]
22
23 Breakpoint 1, 0x00000002009f50c8 in ma4()<<<(2097152,1,1),(128,1,1)>>> ()
24 (cuda-gdb) step
25 Single stepping until exit from function _Z3ma4v, which has no line number information.
26 [Switching focus to CUDA kernel 0, grid 10, block (0,0,0), thread (64,0,0), device 0, sm 0, warp 1, lane 0]
27
28 Breakpoint 1, 0x00000002009f50c8 in ma4()<<<(2097152,1,1),(128,1,1)>>> ()
29 (cuda-gdb)
Full-fledged debugger!
cuda-memcheck
AKA, el "valgrind"+"helgrind" de la GPU.
1 $ cuda-memcheck --tool memcheck --leak-check full ./a.out
2 ========= CUDA-MEMCHECK
3 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
4 ========= LEAK SUMMARY: 0 bytes leaked in 0 allocations
5 ========= ERROR SUMMARY: 0 errors
6 $ cuda-memcheck --tool racecheck --racecheck-report all ./a.out
7 ========= CUDA-MEMCHECK
8 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
9 ========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
10 $ cuda-memcheck --tool synccheck ./a.out
11 ========= CUDA-MEMCHECK
12 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
13 ========= ERROR SUMMARY: 0 errors
14 $ cuda-memcheck --tool initcheck ./a.out
15 ========= CUDA-MEMCHECK
16 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
17 ========= ERROR SUMMARY: 0 errors
cuda-memcheck
Compilar con -G y -lineinfo.
Revisa errores de memoria:
- Acceso a arreglos fuera de límites (shared & global).
- Desbordamientos de pila.
- Memoria dinámica pedida y no liberada.
- Memoria dinámica liberada dos veces.
Revisa errores de concurrencia
- Potenciales errores con memoria compartida de bloque y global (posibles falsos positivos).
Revisa problemas de bar.sync
en código divergente.
Revisa problemas de memoria global no-inicializada.
Además, sin instrumentar el código, muestra:
- errores de configuración de lanzamiento.
Cuidate, querete, ojito, ojete: el código corre 10 veces más lento!
Medir hardware y software counters como cuando Mika era moda.
1 export CUDA_PROFILE=1
2 export CUDA_PROFILE_CONFIG=profile.config
O directamente:
1 CUDA_PROFILE=1 ./a.out
· branch
: cantidad de saltos (por SM).
· divergent_branch
: cantidad de saltos divergentes (por SM).
· instructions
: por SM.
· gld_{32,64,128}b
: cargas de memoria global de tamaño 32, 64 y 128 bytes.
· cta_launched
: cantidad de bloques lanzados.
· sm_cta_launched
: idem, pero dentro de un SM.
· l1_global_load_hit
...
· l1_global_load_miss
...
· gpustarttimestamp
· gpuendtimestamp
· gridsize
· threadblocksize
· dynsmemperblock
· stasmemperblock
· regperthread
· memtransferdir
· memtransfersize
· memtransferhostmemtype
· streamid
nvprof
La herramienta definitiva para profiling.
Desde lo básico
1 $ nvprof --print-gpu-summary ./a.out
2 ==12260== NVPROF is profiling process 12260, command: ./a.out
3 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
4 ==12260== Profiling application: ./a.out
5 ==12260== Profiling result:
6 Time(%) Time Calls Avg Min Max Name
7 99.03% 2.72539s 4 681.35ms 569.20ms 731.48ms [CUDA memcpy DtoH]
8 0.57% 15.599ms 1 15.599ms 15.599ms 15.599ms ma4(void)
9 0.40% 11.136ms 1 11.136ms 11.136ms 11.136ms set(void)
Esto ya es realmente útil, da promedio y barras de error!
1 $ nvprof --print-gpu-trace ./a.out
2 ==12650== NVPROF is profiling process 12650, command: ./a.out
3 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
4 ==12650== Profiling application: ./a.out
5 ==12650== Profiling result:
6 Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
7 490.06ms 11.132ms (2097152 1 1) (128 1 1) 14 0B 0B - - GeForce GTX TIT 1 7 set(void) [178]
8 501.20ms 15.600ms (2097152 1 1) (128 1 1) 11 0B 0B - - GeForce GTX TIT 1 7 ma4(void) [181]
9 516.82ms 729.13ms - - - - - 1.0000GB 1.3715GB/s GeForce GTX TIT 1 7 [CUDA memcpy DtoH]
10 1.24701s 569.52ms - - - - - 1.0000GB 1.7559GB/s GeForce GTX TIT 1 7 [CUDA memcpy DtoH]
11 1.81759s 711.01ms - - - - - 1.0000GB 1.4065GB/s GeForce GTX TIT 1 7 [CUDA memcpy DtoH]
12 2.53001s 705.89ms - - - - - 1.0000GB 1.4167GB/s GeForce GTX TIT 1 7 [CUDA memcpy DtoH]
13
14 Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
15 SSMem: Static shared memory allocated per CUDA block.
16 DSMem: Dynamic shared memory allocated per CUDA block.
Útil para:
cudaMallocHost()
1 $ nvprof --print-api-trace ./a.out
2 ==21569== NVPROF is profiling process 21569, command: ./a.out
3 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
4 ==21569== Profiling application: ./a.out
5 ==21569== Profiling result:
6 Start Duration Name
7 125.63ms 1.7100us cuDeviceGetCount
8 125.63ms 526ns cuDeviceGet
9 125.65ms 360ns cuDeviceGet
10 125.71ms 370ns cuDeviceGetCount
11 125.71ms 300ns cuDeviceGet
12 125.71ms 52.001us cuDeviceGetName
13 125.77ms 59.973us cuDeviceTotalMem
14 125.83ms 620ns cuDeviceGetAttribute
15 125.83ms 444ns cuDeviceGetAttribute
16 125.83ms 374ns cuDeviceGetAttribute
17 125.83ms 373ns cuDeviceGetAttribute
18 125.83ms 280ns cuDeviceGetAttribute
19 125.83ms 44.037us cuDeviceGetAttribute
20 125.88ms 333ns cuDeviceGetAttribute
21 125.88ms 324ns cuDeviceGetAttribute
22 125.88ms 317ns cuDeviceGetAttribute
23 125.88ms 316ns cuDeviceGetAttribute
24 125.88ms 307ns cuDeviceGetAttribute
25 125.88ms 327ns cuDeviceGetAttribute
26 125.88ms 293ns cuDeviceGetAttribute
27 125.88ms 293ns cuDeviceGetAttribute
28 125.88ms 317ns cuDeviceGetAttribute
29 125.88ms 307ns cuDeviceGetAttribute
30 125.88ms 297ns cuDeviceGetAttribute
31 125.88ms 297ns cuDeviceGetAttribute
32 125.89ms 293ns cuDeviceGetAttribute
33 125.89ms 290ns cuDeviceGetAttribute
34 125.89ms 290ns cuDeviceGetAttribute
35 125.89ms 287ns cuDeviceGetAttribute
36 125.89ms 300ns cuDeviceGetAttribute
37 125.89ms 290ns cuDeviceGetAttribute
38 125.89ms 300ns cuDeviceGetAttribute
39 125.89ms 327ns cuDeviceGetAttribute
40 125.89ms 286ns cuDeviceGetAttribute
41 125.89ms 307ns cuDeviceGetAttribute
42 125.89ms 293ns cuDeviceGetAttribute
43 125.89ms 340ns cuDeviceGetAttribute
44 125.89ms 293ns cuDeviceGetAttribute
45 125.89ms 296ns cuDeviceGetAttribute
46 125.89ms 297ns cuDeviceGetAttribute
47 125.90ms 287ns cuDeviceGetAttribute
48 125.90ms 304ns cuDeviceGetAttribute
49 125.90ms 294ns cuDeviceGetAttribute
50 125.90ms 287ns cuDeviceGetAttribute
51 125.90ms 286ns cuDeviceGetAttribute
52 125.90ms 284ns cuDeviceGetAttribute
53 125.90ms 290ns cuDeviceGetAttribute
54 125.90ms 287ns cuDeviceGetAttribute
55 125.90ms 294ns cuDeviceGetAttribute
56 125.90ms 297ns cuDeviceGetAttribute
57 125.90ms 286ns cuDeviceGetAttribute
58 125.90ms 293ns cuDeviceGetAttribute
59 125.90ms 310ns cuDeviceGetAttribute
60 125.90ms 290ns cuDeviceGetAttribute
61 125.91ms 310ns cuDeviceGetAttribute
62 125.91ms 276ns cuDeviceGetAttribute
63 125.91ms 300ns cuDeviceGetAttribute
64 125.91ms 316ns cuDeviceGetAttribute
65 125.91ms 297ns cuDeviceGetAttribute
66 125.91ms 290ns cuDeviceGetAttribute
67 125.91ms 297ns cuDeviceGetAttribute
68 125.91ms 293ns cuDeviceGetAttribute
69 125.91ms 325.75us cuDeviceGetAttribute
70 126.24ms 390ns cuDeviceGetAttribute
71 126.24ms 297ns cuDeviceGetAttribute
72 126.24ms 290ns cuDeviceGetAttribute
73 126.24ms 314ns cuDeviceGetAttribute
74 126.24ms 327ns cuDeviceGetAttribute
75 126.24ms 334ns cuDeviceGetAttribute
76 126.24ms 307ns cuDeviceGetAttribute
77 126.24ms 313ns cuDeviceGetAttribute
78 126.24ms 280ns cuDeviceGetAttribute
79 126.24ms 280ns cuDeviceGetAttribute
80 126.24ms 310ns cuDeviceGetAttribute
81 126.24ms 294ns cuDeviceGetAttribute
82 126.25ms 290ns cuDeviceGetAttribute
83 126.25ms 290ns cuDeviceGetAttribute
84 126.25ms 307ns cuDeviceGetAttribute
85 126.25ms 307ns cuDeviceGetAttribute
86 126.25ms 300ns cuDeviceGetAttribute
87 126.25ms 360ns cuDeviceGetAttribute
88 126.25ms 290ns cuDeviceGetAttribute
89 126.25ms 313.72us cuDeviceGetAttribute
90 126.56ms 324ns cuDeviceGetAttribute
91 126.57ms 327ns cuDeviceGetAttribute
92 126.57ms 287ns cuDeviceGetAttribute
93 126.57ms 297ns cuDeviceGetAttribute
94 126.57ms 320ns cuDeviceGetAttribute
95 126.57ms 313ns cuDeviceGetAttribute
96 126.57ms 300ns cuDeviceGetAttribute
97 126.57ms 356ns cuDeviceGet
98 126.57ms 47.127us cuDeviceGetName
99 126.62ms 61.877us cuDeviceTotalMem
100 126.68ms 307ns cuDeviceGetAttribute
101 126.68ms 380ns cuDeviceGetAttribute
102 126.68ms 286ns cuDeviceGetAttribute
103 126.68ms 320ns cuDeviceGetAttribute
104 126.68ms 276ns cuDeviceGetAttribute
105 126.68ms 44.627us cuDeviceGetAttribute
106 126.73ms 293ns cuDeviceGetAttribute
107 126.73ms 300ns cuDeviceGetAttribute
108 126.73ms 320ns cuDeviceGetAttribute
109 126.73ms 284ns cuDeviceGetAttribute
110 126.73ms 297ns cuDeviceGetAttribute
111 126.73ms 287ns cuDeviceGetAttribute
112 126.73ms 280ns cuDeviceGetAttribute
113 126.73ms 303ns cuDeviceGetAttribute
114 126.74ms 280ns cuDeviceGetAttribute
115 126.74ms 286ns cuDeviceGetAttribute
116 126.74ms 283ns cuDeviceGetAttribute
117 126.74ms 287ns cuDeviceGetAttribute
118 126.74ms 283ns cuDeviceGetAttribute
119 126.74ms 294ns cuDeviceGetAttribute
120 126.74ms 284ns cuDeviceGetAttribute
121 126.74ms 287ns cuDeviceGetAttribute
122 126.74ms 290ns cuDeviceGetAttribute
123 126.74ms 283ns cuDeviceGetAttribute
124 126.74ms 284ns cuDeviceGetAttribute
125 126.74ms 296ns cuDeviceGetAttribute
126 126.74ms 304ns cuDeviceGetAttribute
127 126.74ms 280ns cuDeviceGetAttribute
128 126.74ms 283ns cuDeviceGetAttribute
129 126.75ms 280ns cuDeviceGetAttribute
130 126.75ms 283ns cuDeviceGetAttribute
131 126.75ms 283ns cuDeviceGetAttribute
132 126.75ms 283ns cuDeviceGetAttribute
133 126.75ms 290ns cuDeviceGetAttribute
134 126.75ms 303ns cuDeviceGetAttribute
135 126.75ms 280ns cuDeviceGetAttribute
136 126.75ms 277ns cuDeviceGetAttribute
137 126.75ms 280ns cuDeviceGetAttribute
138 126.75ms 286ns cuDeviceGetAttribute
139 126.75ms 283ns cuDeviceGetAttribute
140 126.75ms 283ns cuDeviceGetAttribute
141 126.75ms 277ns cuDeviceGetAttribute
142 126.75ms 304ns cuDeviceGetAttribute
143 126.75ms 284ns cuDeviceGetAttribute
144 126.76ms 294ns cuDeviceGetAttribute
145 126.76ms 293ns cuDeviceGetAttribute
146 126.76ms 293ns cuDeviceGetAttribute
147 126.76ms 300ns cuDeviceGetAttribute
148 126.76ms 300ns cuDeviceGetAttribute
149 126.76ms 287ns cuDeviceGetAttribute
150 126.76ms 280ns cuDeviceGetAttribute
151 126.76ms 277ns cuDeviceGetAttribute
152 126.76ms 284ns cuDeviceGetAttribute
153 126.76ms 290ns cuDeviceGetAttribute
154 126.76ms 273ns cuDeviceGetAttribute
155 126.76ms 332.26us cuDeviceGetAttribute
156 127.10ms 320ns cuDeviceGetAttribute
157 127.10ms 293ns cuDeviceGetAttribute
158 127.10ms 296ns cuDeviceGetAttribute
159 127.10ms 280ns cuDeviceGetAttribute
160 127.10ms 300ns cuDeviceGetAttribute
161 127.10ms 316ns cuDeviceGetAttribute
162 127.10ms 290ns cuDeviceGetAttribute
163 127.10ms 283ns cuDeviceGetAttribute
164 127.10ms 273ns cuDeviceGetAttribute
165 127.10ms 276ns cuDeviceGetAttribute
166 127.10ms 276ns cuDeviceGetAttribute
167 127.10ms 293ns cuDeviceGetAttribute
168 127.10ms 280ns cuDeviceGetAttribute
169 127.10ms 286ns cuDeviceGetAttribute
170 127.11ms 283ns cuDeviceGetAttribute
171 127.11ms 303ns cuDeviceGetAttribute
172 127.11ms 280ns cuDeviceGetAttribute
173 127.11ms 283ns cuDeviceGetAttribute
174 127.11ms 293ns cuDeviceGetAttribute
175 127.11ms 325.94us cuDeviceGetAttribute
176 127.44ms 340ns cuDeviceGetAttribute
177 127.44ms 323ns cuDeviceGetAttribute
178 127.44ms 284ns cuDeviceGetAttribute
179 127.44ms 297ns cuDeviceGetAttribute
180 127.44ms 290ns cuDeviceGetAttribute
181 127.44ms 297ns cuDeviceGetAttribute
182 127.44ms 290ns cuDeviceGetAttribute
183 127.45ms 8.9200us cudaConfigureCall
184 127.46ms 258.49ms cudaLaunch (set(void) [178])
185 385.95ms 1.3820us cudaGetLastError
186 385.95ms 1.2990us cudaConfigureCall
187 385.96ms 12.198us cudaLaunch (ma4(void) [181])
188 385.97ms 310ns cudaGetLastError
189 385.97ms 26.722ms cudaDeviceSynchronize
190 412.69ms 746.95ms cudaMemcpyFromSymbol
191 1.15965s 572.13ms cudaMemcpyFromSymbol
192 1.73178s 714.30ms cudaMemcpyFromSymbol
193 2.44608s 707.06ms cudaMemcpyFromSymbol
194 3.18789s 89.324ms cudaDeviceReset
1 $ nvprof --events all ./a.out
2 ==22475== NVPROF is profiling process 22475, command: ./a.out
3 ==22475== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
4 ==22475== Replaying kernel "set(void)" (done)
5 ==22475== Replaying kernel "ma4(void)" (done)
6 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
7 ==22475== Profiling application: ./a.out
8 ==22475== Profiling result:
9 ==22475== Event result:
10 Invocations Event Name Min Max Avg
11 Device "GeForce GTX TITAN X (0)"
12 Kernel: set(void)
13 1 tex0_cache_sector_queries 100663296 100663296 100663296
14 1 tex1_cache_sector_queries 100663296 100663296 100663296
15 1 tex0_cache_sector_misses 100663296 100663296 100663296
16 1 tex1_cache_sector_misses 100663296 100663296 100663296
17 1 fb_subp0_read_sectors 2932 2932 2932
18 1 fb_subp1_read_sectors 3302 3302 3302
19 1 fb_subp0_write_sectors 50327278 50327278 50327278
20 1 fb_subp1_write_sectors 50323856 50323856 50323856
21 1 l2_subp0_write_sector_misses 50327158 50327158 50327158
22 1 l2_subp1_write_sector_misses 50327230 50327230 50327230
23 1 l2_subp0_read_sector_misses 2940 2940 2940
24 1 l2_subp1_read_sector_misses 2928 2928 2928
25 1 l2_subp0_read_tex_sector_queries 0 0 0
26 1 l2_subp1_read_tex_sector_queries 0 0 0
27 1 l2_subp0_write_tex_sector_queries 50331648 50331648 50331648
28 1 l2_subp1_write_tex_sector_queries 50331648 50331648 50331648
29 1 l2_subp0_read_tex_hit_sectors 0 0 0
30 1 l2_subp1_read_tex_hit_sectors 0 0 0
31 1 l2_subp0_write_tex_hit_sectors 0 0 0
32 1 l2_subp1_write_tex_hit_sectors 0 0 0
33 1 l2_subp0_total_read_sector_queries 3972 3972 3972
34 1 l2_subp1_total_read_sector_queries 3995 3995 3995
35 1 l2_subp0_total_write_sector_queries 50331653 50331653 50331653
36 1 l2_subp1_total_write_sector_queries 50331649 50331649 50331649
37 1 l2_subp0_read_sysmem_sector_queries 0 0 0
38 1 l2_subp1_read_sysmem_sector_queries 0 0 0
39 1 l2_subp0_write_sysmem_sector_queries 1 1 1
40 1 l2_subp1_write_sysmem_sector_queries 4 4 4
41 1 elapsed_cycles_sm 262592500 262592500 262592500
42 1 gld_inst_8bit 0 0 0
43 1 gld_inst_16bit 0 0 0
44 1 gld_inst_32bit 0 0 0
45 1 gld_inst_64bit 0 0 0
46 1 gld_inst_128bit 0 0 0
47 1 gst_inst_8bit 0 0 0
48 1 gst_inst_16bit 0 0 0
49 1 gst_inst_32bit 805306368 805306368 805306368
50 1 gst_inst_64bit 0 0 0
51 1 gst_inst_128bit 0 0 0
52 1 prof_trigger_00 0 0 0
53 1 prof_trigger_01 0 0 0
54 1 prof_trigger_02 0 0 0
55 1 prof_trigger_03 0 0 0
56 1 prof_trigger_04 0 0 0
57 1 prof_trigger_05 0 0 0
58 1 prof_trigger_06 0 0 0
59 1 prof_trigger_07 0 0 0
60 1 warps_launched 8388608 8388608 8388608
61 1 inst_issued0 839745496 839745496 839745496
62 1 inst_issued1 159383984 159383984 159383984
63 1 inst_issued2 50331648 50331648 50331648
64 1 inst_executed 260046848 260046848 260046848
65 1 thread_inst_executed 8321499136 8321499136 8321499136
66 1 not_predicated_off_thread_inst_executed 8321499136 8321499136 8321499136
67 1 local_store 0 0 0
68 1 local_load 0 0 0
69 1 shared_load 0 0 0
70 1 shared_store 0 0 0
71 1 shared_atom_cas 0 0 0
72 1 shared_atom 0 0 0
73 1 global_atom_cas 0 0 0
74 1 atom_count 0 0 0
75 1 gred_count 0 0 0
76 1 global_load 0 0 0
77 1 global_store 25165824 25165824 25165824
78 1 divergent_branch 0 0 0
79 1 branch 8388608 8388608 8388608
80 1 active_cycles 262453001 262453001 262453001
81 1 active_warps 1.4184e+10 1.4184e+10 1.4184e+10
82 1 active_ctas 4385695680 4385695680 4385695680
83 1 sm_cta_launched 2097152 2097152 2097152
84 1 shared_ld_bank_conflict 0 0 0
85 1 shared_st_bank_conflict 0 0 0
86 1 shared_ld_transactions 0 0 0
87 1 shared_st_transactions 0 0 0
88 Kernel: ma4(void)
89 1 tex0_cache_sector_queries 134217728 134217728 134217728
90 1 tex1_cache_sector_queries 134217728 134217728 134217728
91 1 tex0_cache_sector_misses 83886080 83886080 83886080
92 1 tex1_cache_sector_misses 83886080 83886080 83886080
93 1 fb_subp0_read_sectors 50335632 50335632 50335632
94 1 fb_subp1_read_sectors 50335689 50335689 50335689
95 1 fb_subp0_write_sectors 16777319 16777319 16777319
96 1 fb_subp1_write_sectors 16777024 16777024 16777024
97 1 l2_subp0_write_sector_misses 16777393 16777393 16777393
98 1 l2_subp1_write_sector_misses 16777131 16777131 16777131
99 1 l2_subp0_read_sector_misses 50335665 50335665 50335665
100 1 l2_subp1_read_sector_misses 50335656 50335656 50335656
101 1 l2_subp0_read_tex_sector_queries 50331648 50331648 50331648
102 1 l2_subp1_read_tex_sector_queries 50331648 50331648 50331648
103 1 l2_subp0_write_tex_sector_queries 16777216 16777216 16777216
104 1 l2_subp1_write_tex_sector_queries 16777216 16777216 16777216
105 1 l2_subp0_read_tex_hit_sectors 0 0 0
106 1 l2_subp1_read_tex_hit_sectors 0 0 0
107 1 l2_subp0_write_tex_hit_sectors 0 0 0
108 1 l2_subp1_write_tex_hit_sectors 0 0 0
109 1 l2_subp0_total_read_sector_queries 50338272 50338272 50338272
110 1 l2_subp1_total_read_sector_queries 50338244 50338244 50338244
111 1 l2_subp0_total_write_sector_queries 16777222 16777222 16777222
112 1 l2_subp1_total_write_sector_queries 16777216 16777216 16777216
113 1 l2_subp0_read_sysmem_sector_queries 0 0 0
114 1 l2_subp1_read_sysmem_sector_queries 0 0 0
115 1 l2_subp0_write_sysmem_sector_queries 0 0 0
116 1 l2_subp1_write_sysmem_sector_queries 5 5 5
117 1 elapsed_cycles_sm 431539256 431539256 431539256
118 1 gld_inst_8bit 0 0 0
119 1 gld_inst_16bit 0 0 0
120 1 gld_inst_32bit 805306368 805306368 805306368
121 1 gld_inst_64bit 0 0 0
122 1 gld_inst_128bit 0 0 0
123 1 gst_inst_8bit 0 0 0
124 1 gst_inst_16bit 0 0 0
125 1 gst_inst_32bit 268435456 268435456 268435456
126 1 gst_inst_64bit 0 0 0
127 1 gst_inst_128bit 0 0 0
128 1 prof_trigger_00 0 0 0
129 1 prof_trigger_01 0 0 0
130 1 prof_trigger_02 0 0 0
131 1 prof_trigger_03 0 0 0
132 1 prof_trigger_04 0 0 0
133 1 prof_trigger_05 0 0 0
134 1 prof_trigger_06 0 0 0
135 1 prof_trigger_07 0 0 0
136 1 warps_launched 8388608 8388608 8388608
137 1 inst_issued0 1482984855 1482984855 1482984855
138 1 inst_issued1 209716264 209716264 209716264
139 1 inst_issued2 33554432 33554432 33554432
140 1 inst_executed 276824064 276824064 276824064
141 1 thread_inst_executed 8858370048 8858370048 8858370048
142 1 not_predicated_off_thread_inst_executed 8858370048 8858370048 8858370048
143 1 local_store 0 0 0
144 1 local_load 0 0 0
145 1 shared_load 0 0 0
146 1 shared_store 0 0 0
147 1 shared_atom_cas 0 0 0
148 1 shared_atom 0 0 0
149 1 global_atom_cas 0 0 0
150 1 atom_count 0 0 0
151 1 gred_count 0 0 0
152 1 global_load 25165824 25165824 25165824
153 1 global_store 8388608 8388608 8388608
154 1 divergent_branch 0 0 0
155 1 branch 8388608 8388608 8388608
156 1 active_cycles 431418717 431418717 431418717
157 1 active_warps 2.4972e+10 2.4972e+10 2.4972e+10
158 1 active_ctas 6616480468 6616480468 6616480468
159 1 sm_cta_launched 2097152 2097152 2097152
160 1 shared_ld_bank_conflict 0 0 0
161 1 shared_st_bank_conflict 0 0 0
162 1 shared_ld_transactions 0 0 0
163 1 shared_st_transactions 0 0 0
1 $ nvprof --metrics all ./a.out
2 ==23390== NVPROF is profiling process 23390, command: ./a.out
3 ==23390== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
4 ==23390== Replaying kernel "set(void)" (done)
5 ==23390== Replaying kernel "ma4(void)" (done)
6 16909823, 16909824.000000!=132107.000000*127.000000+132234.000000
7 ==23390== Profiling application: ./a.out
8 ==23390== Profiling result:
9 ==23390== Metric result:
10 Invocations Metric Name Metric Description Min Max Avg
11 Device "GeForce GTX TITAN X (0)"
12 Kernel: set(void)
13 1 sm_efficiency Multiprocessor Activity 99.96% 99.96% 99.96%
14 1 achieved_occupancy Achieved Occupancy 0.848741 0.848741 0.848741
15 1 ipc Executed IPC 0.837694 0.837694 0.837694
16 1 issued_ipc Issued IPC 0.837898 0.837898 0.837898
17 1 inst_per_warp Instructions per warp 31.000000 31.000000 31.000000
18 1 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00%
19 1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00%
20 1 warp_nonpred_execution_efficiency Warp Non-Predicated Execution Efficiency 100.00% 100.00% 100.00%
21 1 inst_replay_overhead Instruction Replay Overhead 0.000006 0.000006 0.000006
22 1 issue_slot_utilization Issue Slot Utilization 16.89% 16.89% 16.89%
23 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.000000 0.000000 0.000000
24 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.000000 0.000000 0.000000
25 1 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000
26 1 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000
27 1 gld_transactions_per_request Global Load Transactions Per Request 0.000000 0.000000 0.000000
28 1 gst_transactions_per_request Global Store Transactions Per Request 4.000000 4.000000 4.000000
29 1 shared_store_transactions Shared Store Transactions 0 0 0
30 1 shared_load_transactions Shared Load Transactions 0 0 0
31 1 local_load_transactions Local Load Transactions 0 0 0
32 1 local_store_transactions Local Store Transactions 0 0 0
33 1 gld_transactions Global Load Transactions 0 0 0
34 1 gst_transactions Global Store Transactions 100663296 100663296 100663296
35 1 dram_read_transactions Device Memory Read Transactions 5836 5836 5836
36 1 dram_write_transactions Device Memory Write Transactions 100654021 100654021 100654021
37 1 global_hit_rate Global Hit Rate 0.00% 0.00% 0.00%
38 1 local_hit_rate Local Hit Rate 0.00% 0.00% 0.00%
39 1 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
40 1 gst_requested_throughput Requested Global Store Throughput 269.56GB/s 269.56GB/s 269.56GB/s
41 1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
42 1 gst_throughput Global Store Throughput 269.56GB/s 269.56GB/s 269.56GB/s
43 1 dram_read_throughput Device Memory Read Throughput 16.003MB/s 16.003MB/s 16.003MB/s
44 1 dram_write_throughput Device Memory Write Throughput 269.54GB/s 269.54GB/s 269.54GB/s
45 1 tex_cache_throughput Unified Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s
46 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
47 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s
48 1 shared_load_throughput Shared Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
49 1 shared_store_throughput Shared Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s
50 1 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
51 1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
52 1 tex_cache_transactions Unified Cache Transactions 0 0 0
53 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1)
54 1 tex_fu_utilization Texture Function Unit Utilization Low (2) Low (2) Low (2)
55 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1)
56 1 double_precision_fu_utilization Double-Precision Function Unit Utilizati Idle (0) Idle (0) Idle (0)
57 1 special_fu_utilization Special Function Unit Utilization Low (1) Low (1) Low (1)
58 1 single_precision_fu_utilization Single-Precision Function Unit Utilizati Low (2) Low (2) Low (2)
59 1 flop_count_dp Floating Point Operations(Double Precisi 0 0 0
60 1 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0
61 1 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0
62 1 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0
63 1 flop_count_sp Floating Point Operations(Single Precisi 268435456 268435456 268435456
64 1 flop_count_sp_add Floating Point Operations(Single Precisi 268435456 268435456 268435456
65 1 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0
66 1 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0
67 1 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0
68 1 inst_executed Instructions Executed 260046848 260046848 260046848
69 1 inst_issued Instructions Issued 260048384 260048384 260048384
70 1 dram_utilization Device Memory Utilization High (9) High (9) High (9)
71 1 tex_utilization Unified Cache Utilization Idle (0) Idle (0) Idle (0)
72 1 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
73 1 shared_utilization Shared Memory Utilization Idle (0) Idle (0) Idle (0)
74 1 inst_fp_32 FP Instructions(Single) 268435456 268435456 268435456
75 1 inst_fp_64 FP Instructions(Double) 0 0 0
76 1 inst_integer Integer Instructions 2952790016 2952790016 2952790016
77 1 inst_bit_convert Bit-Convert Instructions 536870912 536870912 536870912
78 1 inst_control Control-Flow Instructions 268435456 268435456 268435456
79 1 inst_compute_ld_st Load/Store Instructions 805306368 805306368 805306368
80 1 inst_misc Misc Instructions 3489660928 3489660928 3489660928
81 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0
82 1 issue_slots Issue Slots 209716736 209716736 209716736
83 1 cf_issued Issued Control-Flow Instructions 8388608 8388608 8388608
84 1 cf_executed Executed Control-Flow Instructions 8388608 8388608 8388608
85 1 ldst_issued Issued Load/Store Instructions 117440512 117440512 117440512
86 1 ldst_executed Executed Load/Store Instructions 41943040 41943040 41943040
87 1 atomic_transactions Atomic Transactions 0 0 0
88 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000
89 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 1.00% 1.00% 1.00%
90 1 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 2.97% 2.97% 2.97%
91 1 stall_memory_dependency Issue Stall Reasons (Data Request) 0.00% 0.00% 0.00%
92 1 stall_texture Issue Stall Reasons (Texture) 14.52% 14.52% 14.52%
93 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00%
94 1 stall_other Issue Stall Reasons (Other) 1.56% 1.56% 1.56%
95 1 stall_constant_memory_dependency Issue Stall Reasons (Immediate constant) 0.00% 0.00% 0.00%
96 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.08% 0.08% 0.08%
97 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 79.62% 79.62% 79.62%
98 1 stall_not_selected Issue Stall Reasons (Not Selected) 0.26% 0.26% 0.26%
99 1 sysmem_read_transactions System Memory Read Transactions 0 0 0
100 1 sysmem_write_transactions System Memory Write Transactions 5 5 5
101 1 l2_read_transactions L2 Read Transactions 8191 8191 8191
102 1 l2_write_transactions L2 Write Transactions 100663302 100663302 100663302
103 1 ecc_transactions ECC Transactions 0 0 0
104 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00%
105 1 tex_cache_hit_rate Unified Cache Hit Rate 50.00% 50.00% 50.00%
106 1 l2_tex_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00%
107 1 l2_tex_write_hit_rate L2 Hit Rate (Texture Writes) 0.00% 0.00% 0.00%
108 1 l2_tex_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s
109 1 l2_tex_write_throughput L2 Throughput (Texture Writes) 269.56GB/s 269.56GB/s 269.56GB/s
110 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0
111 1 l2_tex_write_transactions L2 Transactions (Texture Writes) 100663296 100663296 100663296
112 1 l2_read_throughput L2 Throughput (Reads) 22.461MB/s 22.461MB/s 22.461MB/s
113 1 l2_write_throughput L2 Throughput (Writes) 269.56GB/s 269.56GB/s 269.56GB/s
114 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s
115 1 sysmem_write_throughput System Memory Write Throughput 14.039KB/s 14.039KB/s 14.039KB/s
116 1 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4)
117 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s
118 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0
119 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1)
120 1 ecc_throughput ECC Throughput 0.00000B/s 0.00000B/s 0.00000B/s
121 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.812312 0.812312 0.812312
122 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.34% 0.34% 0.34%
123 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00%
124 Kernel: ma4(void)
125 1 sm_efficiency Multiprocessor Activity 99.97% 99.97% 99.97%
126 1 achieved_occupancy Achieved Occupancy 0.904263 0.904263 0.904263
127 1 ipc Executed IPC 0.641850 0.641850 0.641850
128 1 issued_ipc Issued IPC 0.642156 0.642156 0.642156
129 1 inst_per_warp Instructions per warp 33.000000 33.000000 33.000000
130 1 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00%
131 1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00%
132 1 warp_nonpred_execution_efficiency Warp Non-Predicated Execution Efficiency 100.00% 100.00% 100.00%
133 1 inst_replay_overhead Instruction Replay Overhead 0.000002 0.000002 0.000002
134 1 issue_slot_utilization Issue Slot Utilization 14.11% 14.11% 14.11%
135 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.000000 0.000000 0.000000
136 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.000000 0.000000 0.000000
137 1 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000
138 1 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000
139 1 gld_transactions_per_request Global Load Transactions Per Request 8.000000 8.000000 8.000000
140 1 gst_transactions_per_request Global Store Transactions Per Request 4.000000 4.000000 4.000000
141 1 shared_store_transactions Shared Store Transactions 0 0 0
142 1 shared_load_transactions Shared Load Transactions 0 0 0
143 1 local_load_transactions Local Load Transactions 0 0 0
144 1 local_store_transactions Local Store Transactions 0 0 0
145 1 gld_transactions Global Load Transactions 201326592 201326592 201326592
146 1 gst_transactions Global Store Transactions 33554432 33554432 33554432
147 1 dram_read_transactions Device Memory Read Transactions 100671329 100671329 100671329
148 1 dram_write_transactions Device Memory Write Transactions 33554201 33554201 33554201
149 1 global_hit_rate Global Hit Rate 50.00% 50.00% 50.00%
150 1 local_hit_rate Local Hit Rate 0.00% 0.00% 0.00%
151 1 gld_requested_throughput Requested Global Load Throughput 194.12GB/s 194.12GB/s 194.12GB/s
152 1 gst_requested_throughput Requested Global Store Throughput 64.707GB/s 64.707GB/s 64.707GB/s
153 1 gld_throughput Global Load Throughput 194.12GB/s 194.12GB/s 194.12GB/s
154 1 gst_throughput Global Store Throughput 64.707GB/s 64.707GB/s 64.707GB/s
155 1 dram_read_throughput Device Memory Read Throughput 194.14GB/s 194.14GB/s 194.14GB/s
156 1 dram_write_throughput Device Memory Write Throughput 64.707GB/s 64.707GB/s 64.707GB/s
157 1 tex_cache_throughput Unified Cache Throughput 194.12GB/s 194.12GB/s 194.12GB/s
158 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
159 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s
160 1 shared_load_throughput Shared Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
161 1 shared_store_throughput Shared Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s
162 1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
163 1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
164 1 tex_cache_transactions Unified Cache Transactions 100663296 100663296 100663296
165 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1)
166 1 tex_fu_utilization Texture Function Unit Utilization Low (2) Low (2) Low (2)
167 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1)
168 1 double_precision_fu_utilization Double-Precision Function Unit Utilizati Idle (0) Idle (0) Idle (0)
169 1 special_fu_utilization Special Function Unit Utilization Idle (0) Idle (0) Idle (0)
170 1 single_precision_fu_utilization Single-Precision Function Unit Utilizati Low (2) Low (2) Low (2)
171 1 flop_count_dp Floating Point Operations(Double Precisi 0 0 0
172 1 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0
173 1 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0
174 1 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0
175 1 flop_count_sp Floating Point Operations(Single Precisi 536870912 536870912 536870912
176 1 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0
177 1 flop_count_sp_fma Floating Point Operations(Single Precisi 268435456 268435456 268435456
178 1 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0
179 1 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0
180 1 inst_executed Instructions Executed 276824064 276824064 276824064
181 1 inst_issued Instructions Issued 276825130 276825130 276825130
182 1 dram_utilization Device Memory Utilization High (9) High (9) High (9)
183 1 tex_utilization Unified Cache Utilization Low (2) Low (2) Low (2)
184 1 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
185 1 shared_utilization Shared Memory Utilization Idle (0) Idle (0) Idle (0)
186 1 inst_fp_32 FP Instructions(Single) 268435456 268435456 268435456
187 1 inst_fp_64 FP Instructions(Double) 0 0 0
188 1 inst_integer Integer Instructions 3489660928 3489660928 3489660928
189 1 inst_bit_convert Bit-Convert Instructions 0 0 0
190 1 inst_control Control-Flow Instructions 268435456 268435456 268435456
191 1 inst_compute_ld_st Load/Store Instructions 1073741824 1073741824 1073741824
192 1 inst_misc Misc Instructions 3758096384 3758096384 3758096384
193 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0
194 1 issue_slots Issue Slots 243270698 243270698 243270698
195 1 cf_issued Issued Control-Flow Instructions 8388608 8388608 8388608
196 1 cf_executed Executed Control-Flow Instructions 8388608 8388608 8388608
197 1 ldst_issued Issued Load/Store Instructions 150994944 150994944 150994944
198 1 ldst_executed Executed Load/Store Instructions 50331648 50331648 50331648
199 1 atomic_transactions Atomic Transactions 0 0 0
200 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000
201 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 0.66% 0.66% 0.66%
202 1 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 2.37% 2.37% 2.37%
203 1 stall_memory_dependency Issue Stall Reasons (Data Request) 93.14% 93.14% 93.14%
204 1 stall_texture Issue Stall Reasons (Texture) 0.01% 0.01% 0.01%
205 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00%
206 1 stall_other Issue Stall Reasons (Other) 0.99% 0.99% 0.99%
207 1 stall_constant_memory_dependency Issue Stall Reasons (Immediate constant) 0.00% 0.00% 0.00%
208 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.06% 0.06% 0.06%
209 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 2.34% 2.34% 2.34%
210 1 stall_not_selected Issue Stall Reasons (Not Selected) 0.43% 0.43% 0.43%
211 1 sysmem_read_transactions System Memory Read Transactions 0 0 0
212 1 sysmem_write_transactions System Memory Write Transactions 5 5 5
213 1 l2_read_transactions L2 Read Transactions 100676272 100676272 100676272
214 1 l2_write_transactions L2 Write Transactions 33554438 33554438 33554438
215 1 ecc_transactions ECC Transactions 0 0 0
216 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00%
217 1 tex_cache_hit_rate Unified Cache Hit Rate 50.00% 50.00% 50.00%
218 1 l2_tex_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00%
219 1 l2_tex_write_hit_rate L2 Hit Rate (Texture Writes) 0.00% 0.00% 0.00%
220 1 l2_tex_read_throughput L2 Throughput (Texture Reads) 194.12GB/s 194.12GB/s 194.12GB/s
221 1 l2_tex_write_throughput L2 Throughput (Texture Writes) 64.707GB/s 64.707GB/s 64.707GB/s
222 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 100663296 100663296 100663296
223 1 l2_tex_write_transactions L2 Transactions (Texture Writes) 33554432 33554432 33554432
224 1 l2_read_throughput L2 Throughput (Reads) 194.15GB/s 194.15GB/s 194.15GB/s
225 1 l2_write_throughput L2 Throughput (Writes) 64.707GB/s 64.707GB/s 64.707GB/s
226 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s
227 1 sysmem_write_throughput System Memory Write Throughput 10.110KB/s 10.110KB/s 10.110KB/s
228 1 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4)
229 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s
230 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0
231 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1)
232 1 ecc_throughput ECC Throughput 0.00000B/s 0.00000B/s 0.00000B/s
233 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.807974 0.807974 0.807974
234 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.49% 0.49% 0.49%
235 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00%
Cuidado: tanto los events como las metrics de nvprof
¡re-ejecutan los kernels de 5 a 30 veces según el caso!
Hay una cantidad limitada de hardware counters; la solución es multiplexarlos en el tiempo.
checkCudaErrors()
.getLastCudaError()
.nvprof
para ver que todos los kernels se estén ejecutando.cuda-memcheck
.nvvp
(se lo dejo a Charlie).sgemm
N×N hilos, cada uno calcula c[i][j] = a[i][·]*b[·][j];
por hilo: 2N
FLOPS y 2N accesos a
memoria.
(NVIDIA, NVIDIA CUDA C Programming Guide)
N×N
definidas en runtime.BX×BY
definidos en runtime.N
múltiplo de BX
o BY
.Kernel
1 // 2D to 1D bijection
2 #define IX(i,j) ((i)*(N)+(j))
3
4 __global__ void sgemm(const uint N, float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
5 const uint i = blockIdx.y*blockDim.y + threadIdx.y;
6 const uint j = blockIdx.x*blockDim.x + threadIdx.x;
7 if (i<N && j<N)
8 for (uint k=0; k<N; ++k)
9 c[IX(i,j)] += a[IX(i,k)] * b[IX(k,j)];
10 }
Notar:
» Uso de __restrict__
.
» blockDim.{x,y}
para conocer el tamaño del bloque en runtime.
» Tipos definidos en CUDA: uint
.
1 checkCudaErrors(cudaMalloc(&d_a, SIZE * sizeof(float)));
2 checkCudaErrors(cudaMalloc(&d_b, SIZE * sizeof(float)));
3 checkCudaErrors(cudaMalloc(&d_c, SIZE * sizeof(float)));
4 h_a = (float *) calloc(SIZE, sizeof(float));
5 h_b = (float *) calloc(SIZE, sizeof(float));
6 h_c = (float *) calloc(SIZE, sizeof(float));
7 assert(d_a && d_b && d_c && h_a && h_b && h_c);
1 // integer ceiling division
2 #define DIV_CEIL(a,b) (((a)+(b)-1)/(b))
3 dim3 grid_size(DIV_CEIL(N,BX), DIV_CEIL(N,BY));
4 dim3 block_size(BX,BY);
5 setmm<<<grid_size, block_size>>>(N, d_a, d_b, d_c);
6 getLastCudaError("setmm kernel failed");
7 sgemm<<<grid_size, block_size>>>(N, d_a, d_b, d_c);
8 getLastCudaError("sgemm kernel failed");
9 cudaDeviceSynchronize();
Notar:
» Uso de DIV_CEIL
para que haya suficientes bloques.
» Revisión de errores.
» Uso de calloc
para que pida realmente y esté a 0.
1 checkCudaErrors(cudaMemcpy(h_a, d_a, SIZE*sizeof(float), cudaMemcpyDefault));
2 checkCudaErrors(cudaMemcpy(h_b, d_b, SIZE*sizeof(float), cudaMemcpyDefault));
3 checkCudaErrors(cudaMemcpy(h_c, d_c, SIZE*sizeof(float), cudaMemcpyDefault));
4 double max_diff = 0.0;
5 for (unsigned int i=0; i<N; ++i) {
6 for (unsigned int j=0; j<N; ++j) {
7 float cij = 0.0f;
8 for (unsigned int k=0; k<N; ++k)
9 cij += h_a[IX(i,k)] * h_b[IX(k,j)];
10 max_diff = MAX(max_diff, abs(cij-h_c[IX(i,j)]));
11 }
12 }
13 printf("max_diff: %f\n", max_diff);
Notar:
» No tenemos que decir en qué dirección copiamos la memoria:
cudaMemcpyDefault
usa unified pointers para saber de qué lado estamos.
» ¡Pedimos cualquier cosa menos igualdad!
1 $ nvprof ./sgemm 256 16 16
2 ==1336== NVPROF is profiling process 1336, command: ./sgemm 256 16 16
3 max_diff: 0.000023
4 ==1336== Profiling result:
5 Time(%) Time Calls Avg Min Max Name
6 53.79% 111.74us 1 111.74us 111.74us 111.74us sgemm(unsigned int, float*, float*, float*)
7 43.47% 90.305us 3 30.101us 22.112us 34.145us [CUDA memcpy DtoH]
8 2.74% 5.6960us 1 5.6960us 5.6960us 5.6960us setmm(unsigned int, float*, float*, float*)
9 $ cuda-memcheck ./sgemm 256 16 16
10 ========= CUDA-MEMCHECK
11 max_diff: 0.000023
12 ========= ERROR SUMMARY: 0 errors
13 $ cuda-memcheck --tool racecheck ./sgemm 256 16 16
14 ========= CUDA-MEMCHECK
15 max_diff: 0.000023
16 ========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
17 $ cuda-memcheck --tool initcheck ./sgemm 256 16 16
18 ========= CUDA-MEMCHECK
19 max_diff: 0.000023
20 ========= ERROR SUMMARY: 0 errors
21 $ cuda-memcheck --tool synccheck ./sgemm 256 16 16
22 ========= CUDA-MEMCHECK
23 max_diff: 0.000023
24 ========= ERROR SUMMARY: 0 errors
» Ejecuta el kernel.
» La diferencia con la versión de especificación es pequeña.
» No hay errores de memoria.
» No hay potenciales condiciones de carrera, ni problemas de inicialización ni de bar.sync
.
» Vamos con pies de plomo.
sgemm N BX BY
1 $ ./sgemm 1024 64 64
2 Cuda error: setmm kernel failed in file 'sgemm.cu' in line 54 : invalid configuration argument.
3 $ ./sgemm 1024 32 32
4 max_diff: 0.000092
5 $ ./sgemm 512 32 32
6 max_diff: 0.000046
7 $ ./sgemm 2048 32 32
8 max_diff: 0.000183
9 $ ./sgemm 256 1 1
10 max_diff: 0.000023
cuda-memcheck.
Con 2048 32 32, la comprobación CPU no es costosa, es costosíiiiisima.
1 $ nvcc -arch=sm_52 sgemm.cu -ptx
sgemm
1 BB7_2:
2 mul.wide.u32 %rd9, %r21, 4;
3 add.s64 %rd10, %rd2, %rd9;
4 mul.wide.u32 %rd11, %r22, 4;
5 add.s64 %rd12, %rd1, %rd11;
6 ld.global.nc.f32 %f4, [%rd12];
7 ld.global.nc.f32 %f5, [%rd10];
8 fma.rn.f32 %f6, %f5, %f4, %f6;
9 add.s32 %r22, %r22, %r17;
10 add.s32 %r21, %r21, 1;
11 add.s32 %r23, %r23, 1;
12 setp.lt.u32 %p6, %r23, %r17;
13 @%p6 bra BB7_2;
14
15 st.global.f32 [%rd3], %f6;
También con:
1 $ cuobjdump -ptx sgemm
1 $ cuobjdump -sass sgemm
Vemos el assembler
1 /*0588*/ @!P0 BRA 0x638; /* 0xe24000000a88000f */
2 /*0590*/ SHL R8, R7.reuse, 0x2; /* 0x3848000000270708 */
3 /*0598*/ SHR.U32 R9, R7, 0x1e; /* 0x3828000001e70709 */
4 /* 0x001fd400fe2207f4 */
5 /*05a8*/ SHL R6, R0.reuse, 0x2; /* 0x3848000000270006 */
6 /*05b0*/ IADD R8.CC, R8, c[0x0][0x150]; /* 0x4c10800005470808 */
7 /*05b8*/ SHR.U32 R11, R0, 0x1e; /* 0x3828000001e7000b */
8 /* 0x0001d800fe0007e2 */
9 /*05c8*/ IADD.X R9, R9, c[0x0][0x154]; /* 0x4c10080005570909 */
10 /*05d0*/ { IADD R10.CC, R6, c[0x0][0x148]; /* 0x4c1080000527060a */
11 /*05d8*/ LDG.E.CI R6, [R8]; } /* 0xeed4a00000070806 */
12 /* 0x001fd8002e2007f2 */
13 /*05e8*/ IADD.X R11, R11, c[0x0][0x14c]; /* 0x4c10080005370b0b */
14 /*05f0*/ LDG.E.CI R11, [R10]; /* 0xeed4a00000070a0b */
15 /*05f8*/ IADD32I R4, R4, 0x1; /* 0x1c00000000170404 */
16 /* 0x001fec00fc2007f1 */
17 /*0608*/ ISETP.LT.U32.AND P0, PT, R4, c[0x0][0x140], PT; /* 0x4b62038005070407 */
18 /*0610*/ IADD R7, R7, c[0x0][0x140]; /* 0x4c10000005070707 */
19 /*0618*/ IADD32I R0, R0, 0x1; /* 0x1c00000000170000 */
20 /* 0x0403c403ffa147f0 */
21 /*0628*/ { FFMA R5, R11, R6, R5; /* 0x5980028000670b05 */
22 /*0630*/ @P0 BRA 0x590; } /* 0xe2400ffff580000f */
23 /*0638*/ STG.E [R2], R5; /* 0xeedc200000070205 */
» Distinto al PTX.
» Reorden de instrucciones.
» Uso de ldg
para lectura de memoria read-only a través de la caché de texturas.
-ptxas-options=-v
1 $ nvcc sgemm.cu -arch=sm_52 --ptxas-options=-v -o sgemm.o
2 ptxas info : 0 bytes gmem, 24 bytes cmem[3]
3 ptxas info : Compiling entry function '_Z5sgemmjPfS_S_' for 'sm_52'
4 ptxas info : Function properties for _Z5sgemmjPfS_S_
5 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
6 ptxas info : Used 28 registers, 352 bytes cmem[0]
7 ptxas info : Compiling entry function '_Z5setmmjPfS_S_' for 'sm_52'
8 ptxas info : Function properties for _Z5setmmjPfS_S_
9 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
10 ptxas info : Used 19 registers, 352 bytes cmem[0], 40 bytes cmem[2]
1 $ nvprof --print-gpu-trace ./sgemm 1024 32 32
2 ==6805== NVPROF is profiling process 6805, command: ./sgemm 1024 32 32
3 max_diff: 0.000092
4 ==6805== Profiling application: ./sgemm 1024 32 32
5 ==6805== Profiling result:
6 Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
7 365.27ms 100.29us (32 32 1) (32 32 1) 19 0B 0B - - GeForce GTX TIT 1 7 setmm(unsigned int, float*, float*, float*) [185]
8 365.37ms 6.1918ms (32 32 1) (32 32 1) 28 0B 0B - - GeForce GTX TIT 1 7 sgemm(unsigned int, float*, float*, float*) [192]
Ambas informan 19 registros para setmm
y 28 registros para sgemm
.
1 $ nvprof --metrics ipc,flops_sp,gld_throughput,gst_throughput ./sgemm 1024 32 32
2 ==8208== NVPROF is profiling process 8208, command: ./sgemm 1024 32 32
3 ==8208== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
4 ==8208== Replaying kernel "setmm(unsigned int, float*, float*, float*)" (done)
5 ==8208== Replaying kernel "sgemm(unsigned int, float*, float*, float*)" (done)
6 max_diff: 0.000092
7 ==8208== Profiling application: ./sgemm 1024 32 32
8 ==8208== Profiling result:
9 ==8208== Metric result:
10 Invocations Metric Name Metric Description Min Max Avg
11 Device "GeForce GTX TITAN X (0)"
12 Kernel: sgemm(unsigned int, float*, float*, float*)
13 1 ipc 4.236410 4.236410 4.236410
14 1 gld_throughput 2e+03GB/s 2e+03GB/s 2e+03GB/s
15 1 gst_throughput 644.16MB/s 644.16MB/s 644.16MB/s
16 1 flop_count_sp 2147483648 2147483648 2147483648
17 Kernel: setmm(unsigned int, float*, float*, float*)
18 1 ipc 3.174295 3.174295 3.174295
19 1 gld_throughput 0.00000B/s 0.00000B/s 0.00000B/s
20 1 gst_throughput 114.57GB/s 114.57GB/s 114.57GB/s
21 1 flop_count_sp 39836070 39836070 39836070
» Pésimo uso del BW de memoria: 0.64 GiB/s sobre un total de ~200 GiB/s.
» Pésimo uso de la potencia de cálculo: 2.14 GFLOPS en sp! (tiene 6 TFLOPS sp).
¿Quiénes somos?: sgeeeeemmm
!!!
¿Cómo es nuestra intensidad aritmética?: ¡¡¡lineal al tamaño del problema!!!
¿Cómo estamos funcionando?: ¡pésimooooo!
gputime
vs. BX
,BY
(C2070, Fermi, 2012)gputime
en µs
gputime
vs. BX
,BY
(C2070, zoom)gputime
en µs
BX
=16,24,32.
gputime = 33.264 ms para 2×1024³ FLOP: 60 GFLOPS
>>> ((2*1024**3)/0.033) / (1<<30)
60.6060606060606
1 $ grep -v "^ " sgemm-c2070.dat | sort -n -k 3 | head -16
2 32 5 33264.832
3 32 6 33275.199
4 32 24 33732.734
5 ...
6 16 10 35538.559
7 32 3 35789.473
8 32 8 35800.672
a
se lee y la c
se escribe de a warps de 32 hilos accediendo a 128 bytes consecutivos de la memoria.set logscale zcb 2
gputime
en µs
El top-16
1 $ grep -v "^ " sgemm-k40.dat | sort -n -k 3 | head -16
2 32 7 15819.616
3 16 14 15915.968
4 16 16 16027.520
5 32 8 16068.768
6 32 16 16092.128
7 8 32 16112.224
8 16 18 16115.968
9 32 9 16156.960
10 4 32 16188.736
11 9 32 16195.232
12 18 16 16196.064
13 8 28 16201.536
14 7 32 16203.808
15 16 8 16228.736
16 8 16 16229.504
17 17 15 16255.296
gputime
en µs
gputime
en µs
1 $ grep -v "^ " sgemm-gtxtitanx.dat | sort -n -k 3 | head -16
2 32 26 6211.488
3 32 24 6227.584
4 32 22 6239.200
5 32 32 6240.192
6 32 30 6257.536
7 32 28 6259.072
8 32 17 6271.136
9 32 20 6277.696
10 32 21 6285.920
11 32 12 6304.640
12 32 29 6306.304
13 16 30 6311.392
14 32 19 6316.480
15 32 16 6316.640
16 32 18 6317.408
17 32 31 6329.440
gputime
en µs
logscale
aprovechar bien la paleta.warpsize/2
, warpsize
).Table of Contents | t |
---|---|
Exposé | ESC |
Full screen slides | e |
Presenter View | p |
Source Files | s |
Slide Numbers | n |
Toggle screen blanking | b |
Show/hide slide context | c |
Notes | 2 |
Help | h |