sgemm
.Nicolás Wolovick, 20140527
ma4_error prone.cu
) 1 #include <cuda.h>
2
3 #define N (1<<28)
4 #define BLOCK_SIZE 128
5
6 __device__ float a[N], b[N], c[N], d[N];
7
8 __global__ void ma4(void) {
9 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
10 d[gtid] = a[gtid]*b[gtid]+c[gtid];
11 }
12
13 int main(void) {
14 ma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();
15 cudaDeviceSynchronize();
16 return 0;
17 }
De 10 líneas de código (4 realmente útiles) y muchos errores.
Usaremos las herramientas disponibles y plantearemos buenas prácticas.
checkCudaErrors()
en CADA llamado de la biblioteca CUDA.getLastCudaError("info")
.cuda_profile_0.log
nvprof ./a.out
nvidia-smi
.Calcular "algo" no es calcular bien.
Versión extremadamente light de lo que los CS conocemos como corrección.
ma4.cu
Kernels
1 #include <cuda.h>
2 #include <stdio.h>
3
4 #include "helper_cuda.h"
5
6 #define N (1<<28)
7 #define BLOCK_SIZE 128
8
9 __device__ float a[N], b[N], c[N], d[N];
10 float ha[N], hb[N], hc[N], hd[N];
11
12 __global__ void set(void) {
13 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
14 a[gtid] = (float)blockIdx.x;
15 b[gtid] = (float)threadIdx.x;
16 c[gtid] = (float)threadIdx.x+blockIdx.x;
17 }
18
19 __global__ void ma4(void) {
20 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
21 d[gtid] = a[gtid]*b[gtid]+c[gtid];
22 }
ma4.cu
Host code
1 int main(void) {
2 set<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();
3 getLastCudaError("set() kernel failed");
4 ma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();
5 getLastCudaError("ma4() kernel failed");
6 checkCudaErrors(cudaDeviceSynchronize());
7 checkCudaErrors(cudaMemcpyFromSymbol(ha, a, N*sizeof(float)));
8 checkCudaErrors(cudaMemcpyFromSymbol(hb, b, N*sizeof(float)));
9 checkCudaErrors(cudaMemcpyFromSymbol(hc, c, N*sizeof(float)));
10 checkCudaErrors(cudaMemcpyFromSymbol(hd, d, N*sizeof(float)));
11 for (unsigned int i=0; i<N; ++i)
12 if (hd[i] != ha[i]*hb[i]+hc[i]) {
13 printf("%d, %f!=%f*%f+%f\n", i, hd[i], ha[i], hb[i], hc[i]);
14 break;
15 }
16
17 return 0;
18 }
cuda-gdb
1 $ nvcc -g -arch=sm_35 --ptxas-options=-v --compiler-options "-O3 -mcmodel=medium" ma4.cu
2 $ CUDA_DEBUGGER_SOFTWARE_PREEMPTION=1 cuda-gdb ./a.out
3 (cuda-gdb) l
4 16 c[gtid] = (float)threadIdx.x+blockIdx.x;
5 17 }
6 18
7 19 __global__ void ma4(void) {
8 20 unsigned int gtid = blockIdx.x*blockDim.x + threadIdx.x;
9 21 d[gtid] = a[gtid]*b[gtid]+c[gtid];
10 22 }
11 (cuda-gdb) break ma4()
12 Breakpoint 1 at 0x402970: file /opt/cuda/bin/..//include/cuda_runtime.h, line 1231.
13 (cuda-gdb) run
14 Starting program: /home/nicolasw/teoricos/Clase16-20140522/a.out
15 [Thread debugging using libthread_db enabled]
16 Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
17 [New Thread 0x7ffff5a69700 (LWP 15809)]
18 [Switching focus to CUDA kernel 0, grid 6, block (0,0,0), thread (0,0,0), device 0, sm 14, warp 0, lane 0]
19
20 Breakpoint 1, 0x0000000200a01870 in ma4()<<<(262144,1,1),(1024,1,1)>>> ()
21 (cuda-gdb) step
22 Single stepping until exit from function _Z3ma4v,
23 which has no line number information.
24 [Switching focus to CUDA kernel 0, grid 6, block (0,0,0), thread (32,0,0), device 0, sm 13, warp 34, lane 0]
25
26 Breakpoint 1, 0x0000000200a01870 in ma4()<<<(262144,1,1),(1024,1,1)>>> ()
27 (cuda-gdb) step
28 Single stepping until exit from function _Z3ma4v,
29 which has no line number information.
30 [Switching focus to CUDA kernel 0, grid 6, block (0,0,0), thread (64,0,0), device 0, sm 10, warp 33, lane 0]
31
32 Breakpoint 1, 0x0000000200a01870 in ma4()<<<(262144,1,1),(1024,1,1)>>> ()
Full-fledged debugger!
Aka, el "valgrind"+"helgrind" de la GPU.
1 $ cuda-memcheck --leak-check full ./a.out
2 ========= CUDA-MEMCHECK
3 16797695, 16797696.000000!=16403.000000*1023.000000+17426.000000
4 ========= LEAK SUMMARY: 0 bytes leaked in 0 allocations
5 ========= ERROR SUMMARY: 0 errors
6 $ cuda-memcheck --racecheck-report all ./a.out
7 ========= CUDA-MEMCHECK
8 16797695, 16797696.000000!=16403.000000*1023.000000+17426.000000
9 ========= ERROR SUMMARY: 0 errors
Revisa errores de memoria
- Acceso a arreglos fuera de límites (shared & global).
- Desbordamientos de pila.
- Memoria dinámica pedida y no liberada.
- Memoria dinámica liberadas dos veces.
Revisa errores de concurrencia
- Potenciales errores con memoria compartida de bloque y global (posibles false positives).
Además muestra sin instrumentar el código :
- errores de configuración de lanzamiento.
Cuidate, querete, ojito, ojete: el código corre 10 veces más lento!
Medir hardware y software counters como cuando Mika era moda.
1 export CUDA_PROFILE=1
2 export CUDA_PROFILE_CONFIG=profile.config
O directamente:
1 CUDA_PROFILE=1 ./a.out
· branch
: cantidad de saltos (por SM).
· divergent_branch
: cantidad de saltos divergentes (por SM).
· instructions
: por SM.
· gld_{32,64,128}b
: cargas de de memoria global de tamaño 32, 64 y 128 bytes.
· cta_launched
: cantidad de bloques lanzados.
· sm_cta_launched
: idem, pero dentro de un SM.
· l1_global_load_hit
...
· l1_global_load_miss
...
· gpustarttimestamp
· gpuendtimestamp
· gridsize
· threadblocksize
· dynsmemperblock
· stasmemperblock
· regperthread
· memtransferdir
· memtransfersize
· memtransferhostmemtype
· streamid
nvprof
La herramienta definitiva para profiling.
Desde lo básico
1 $ nvprof --print-gpu-summary ./a.out
2 ==9733== NVPROF is profiling process 9733, command: ./a.out
3 16797695, 16797696.000000!=16403.000000*1023.000000+17426.000000
4 ==9733== Profiling application: ./a.out
5 ==9733== Profiling result:
6 Time(%) Time Calls Avg Min Max Name
7 97.80% 1.78398s 4 446.00ms 443.58ms 451.05ms [CUDA memcpy DtoH]
8 1.33% 24.234ms 1 24.234ms 24.234ms 24.234ms ma4(void)
9 0.87% 15.887ms 1 15.887ms 15.887ms 15.887ms set(void)
Esto ya es realmente útil, da promedio y barras de error!
1 $ nvprof --print-gpu-trace ./a.out
2 ==11085== NVPROF is profiling process 11085, command: ./a.out
3 16797695, 16797696.000000!=16403.000000*1023.000000+17426.000000
4 ==11085== Profiling application: ./a.out
5 ==11085== Profiling result:
6 Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
7 351.97ms 15.890ms (262144 1 1) (1024 1 1) 12 0B 0B - - Tesla K40c (0) 1 7 set(void) [178]
8 367.87ms 24.236ms (262144 1 1) (1024 1 1) 16 0B 0B - - Tesla K40c (0) 1 7 ma4(void) [181]
9 392.13ms 457.74ms - - - - - 1.0737GB 2.3458GB/s Tesla K40c (0) 1 7 [CUDA memcpy DtoH]
10 850.59ms 444.59ms - - - - - 1.0737GB 2.4151GB/s Tesla K40c (0) 1 7 [CUDA memcpy DtoH]
11 1.29588s 439.94ms - - - - - 1.0737GB 2.4406GB/s Tesla K40c (0) 1 7 [CUDA memcpy DtoH]
12 1.73653s 441.20ms - - - - - 1.0737GB 2.4337GB/s Tesla K40c (0) 1 7 [CUDA memcpy DtoH]
13
14 Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
15 SSMem: Static shared memory allocated per CUDA block.
16 DSMem: Dynamic shared memory allocated per CUDA block.
Útil para:
cudaMallocHost()
. 1 $ nvprof --print-api-trace ./a.out
2 ==13525== NVPROF is profiling process 13525, command: ./a.out
3 16797695, 16797696.000000!=16403.000000*1023.000000+17426.000000
4 ==13525== Profiling application: ./a.out
5 ==13525== Profiling result:
6 Start Duration Name
7 169.90ms 1.0660us cuDeviceGetCount
8 169.90ms 298ns cuDeviceGet
9 169.91ms 437ns cuDeviceGet
10 169.92ms 395ns cuDeviceGetCount
11 169.94ms 582ns cuDeviceGet
12 169.94ms 38.361us cuDeviceGetName
13 169.98ms 45.519us cuDeviceTotalMem
14 170.03ms 397ns cuDeviceGetAttribute
15 170.03ms 189ns cuDeviceGetAttribute
16 170.03ms 212ns cuDeviceGetAttribute
17 170.03ms 211ns cuDeviceGetAttribute
18 170.03ms 184ns cuDeviceGetAttribute
19 170.03ms 29.350us cuDeviceGetAttribute
20 170.06ms 219ns cuDeviceGetAttribute
21 170.06ms 191ns cuDeviceGetAttribute
22 170.06ms 190ns cuDeviceGetAttribute
23 170.06ms 187ns cuDeviceGetAttribute
24 170.06ms 187ns cuDeviceGetAttribute
25 170.06ms 258ns cuDeviceGetAttribute
26 170.07ms 247ns cuDeviceGetAttribute
27 170.07ms 186ns cuDeviceGetAttribute
28 170.07ms 184ns cuDeviceGetAttribute
29 170.07ms 185ns cuDeviceGetAttribute
30 170.07ms 190ns cuDeviceGetAttribute
31 170.07ms 190ns cuDeviceGetAttribute
32 170.07ms 193ns cuDeviceGetAttribute
33 170.07ms 246ns cuDeviceGetAttribute
34 170.07ms 254ns cuDeviceGetAttribute
35 170.07ms 254ns cuDeviceGetAttribute
36 170.07ms 247ns cuDeviceGetAttribute
37 170.07ms 189ns cuDeviceGetAttribute
38 170.07ms 187ns cuDeviceGetAttribute
39 170.07ms 184ns cuDeviceGetAttribute
40 170.07ms 184ns cuDeviceGetAttribute
41 170.07ms 187ns cuDeviceGetAttribute
42 170.07ms 184ns cuDeviceGetAttribute
43 170.07ms 248ns cuDeviceGetAttribute
44 170.07ms 250ns cuDeviceGetAttribute
45 170.07ms 195ns cuDeviceGetAttribute
46 170.07ms 196ns cuDeviceGetAttribute
47 170.08ms 191ns cuDeviceGetAttribute
48 170.08ms 190ns cuDeviceGetAttribute
49 170.08ms 190ns cuDeviceGetAttribute
50 170.08ms 196ns cuDeviceGetAttribute
51 170.08ms 207ns cuDeviceGetAttribute
52 170.08ms 185ns cuDeviceGetAttribute
53 170.08ms 185ns cuDeviceGetAttribute
54 170.08ms 186ns cuDeviceGetAttribute
55 170.08ms 190ns cuDeviceGetAttribute
56 170.08ms 190ns cuDeviceGetAttribute
57 170.08ms 185ns cuDeviceGetAttribute
58 170.08ms 185ns cuDeviceGetAttribute
59 170.08ms 184ns cuDeviceGetAttribute
60 170.08ms 184ns cuDeviceGetAttribute
61 170.08ms 190ns cuDeviceGetAttribute
62 170.08ms 196ns cuDeviceGetAttribute
63 170.08ms 195ns cuDeviceGetAttribute
64 170.08ms 195ns cuDeviceGetAttribute
65 170.08ms 199ns cuDeviceGetAttribute
66 170.08ms 187ns cuDeviceGetAttribute
67 170.08ms 184ns cuDeviceGetAttribute
68 170.08ms 185ns cuDeviceGetAttribute
69 170.09ms 114.48us cuDeviceGetAttribute
70 170.20ms 223ns cuDeviceGetAttribute
71 170.20ms 190ns cuDeviceGetAttribute
72 170.20ms 191ns cuDeviceGetAttribute
73 170.20ms 196ns cuDeviceGetAttribute
74 170.20ms 198ns cuDeviceGetAttribute
75 170.20ms 204ns cuDeviceGetAttribute
76 170.20ms 251ns cuDeviceGetAttribute
77 170.20ms 187ns cuDeviceGetAttribute
78 170.20ms 184ns cuDeviceGetAttribute
79 170.20ms 184ns cuDeviceGetAttribute
80 170.20ms 184ns cuDeviceGetAttribute
81 170.21ms 192ns cuDeviceGetAttribute
82 170.21ms 191ns cuDeviceGetAttribute
83 170.21ms 185ns cuDeviceGetAttribute
84 170.21ms 184ns cuDeviceGetAttribute
85 170.21ms 207ns cuDeviceGetAttribute
86 170.21ms 183ns cuDeviceGetAttribute
87 170.21ms 185ns cuDeviceGetAttribute
88 170.21ms 210ns cuDeviceGetAttribute
89 170.21ms 113.97us cuDeviceGetAttribute
90 170.32ms 254ns cuDeviceGetAttribute
91 170.32ms 265ns cuDeviceGetAttribute
92 170.32ms 191ns cuDeviceGetAttribute
93 170.32ms 185ns cuDeviceGetAttribute
94 170.33ms 204ns cuDeviceGetAttribute
95 170.33ms 191ns cuDeviceGetAttribute
96 170.33ms 190ns cuDeviceGetAttribute
97 170.33ms 241ns cuDeviceGet
98 170.33ms 30.514us cuDeviceGetName
99 170.36ms 41.469us cuDeviceTotalMem
100 170.40ms 261ns cuDeviceGetAttribute
101 170.40ms 185ns cuDeviceGetAttribute
102 170.40ms 185ns cuDeviceGetAttribute
103 170.40ms 206ns cuDeviceGetAttribute
104 170.40ms 186ns cuDeviceGetAttribute
105 170.40ms 29.037us cuDeviceGetAttribute
106 170.43ms 209ns cuDeviceGetAttribute
107 170.43ms 184ns cuDeviceGetAttribute
108 170.43ms 184ns cuDeviceGetAttribute
109 170.43ms 198ns cuDeviceGetAttribute
110 170.43ms 201ns cuDeviceGetAttribute
111 170.43ms 193ns cuDeviceGetAttribute
112 170.44ms 218ns cuDeviceGetAttribute
113 170.44ms 184ns cuDeviceGetAttribute
114 170.44ms 188ns cuDeviceGetAttribute
115 170.44ms 184ns cuDeviceGetAttribute
116 170.44ms 184ns cuDeviceGetAttribute
117 170.44ms 183ns cuDeviceGetAttribute
118 170.44ms 185ns cuDeviceGetAttribute
119 170.44ms 184ns cuDeviceGetAttribute
120 170.44ms 196ns cuDeviceGetAttribute
121 170.44ms 191ns cuDeviceGetAttribute
122 170.44ms 196ns cuDeviceGetAttribute
123 170.44ms 192ns cuDeviceGetAttribute
124 170.44ms 191ns cuDeviceGetAttribute
125 170.44ms 189ns cuDeviceGetAttribute
126 170.44ms 183ns cuDeviceGetAttribute
127 170.44ms 186ns cuDeviceGetAttribute
128 170.44ms 184ns cuDeviceGetAttribute
129 170.44ms 238ns cuDeviceGetAttribute
130 170.44ms 207ns cuDeviceGetAttribute
131 170.44ms 187ns cuDeviceGetAttribute
132 170.44ms 190ns cuDeviceGetAttribute
133 170.44ms 189ns cuDeviceGetAttribute
134 170.44ms 189ns cuDeviceGetAttribute
135 170.45ms 193ns cuDeviceGetAttribute
136 170.45ms 193ns cuDeviceGetAttribute
137 170.45ms 192ns cuDeviceGetAttribute
138 170.45ms 182ns cuDeviceGetAttribute
139 170.45ms 183ns cuDeviceGetAttribute
140 170.45ms 183ns cuDeviceGetAttribute
141 170.45ms 189ns cuDeviceGetAttribute
142 170.45ms 183ns cuDeviceGetAttribute
143 170.45ms 188ns cuDeviceGetAttribute
144 170.45ms 182ns cuDeviceGetAttribute
145 170.45ms 182ns cuDeviceGetAttribute
146 170.45ms 184ns cuDeviceGetAttribute
147 170.45ms 182ns cuDeviceGetAttribute
148 170.45ms 191ns cuDeviceGetAttribute
149 170.45ms 190ns cuDeviceGetAttribute
150 170.45ms 191ns cuDeviceGetAttribute
151 170.45ms 191ns cuDeviceGetAttribute
152 170.45ms 184ns cuDeviceGetAttribute
153 170.45ms 184ns cuDeviceGetAttribute
154 170.45ms 192ns cuDeviceGetAttribute
155 170.45ms 113.84us cuDeviceGetAttribute
156 170.57ms 226ns cuDeviceGetAttribute
157 170.57ms 183ns cuDeviceGetAttribute
158 170.57ms 189ns cuDeviceGetAttribute
159 170.57ms 190ns cuDeviceGetAttribute
160 170.57ms 208ns cuDeviceGetAttribute
161 170.57ms 193ns cuDeviceGetAttribute
162 170.57ms 199ns cuDeviceGetAttribute
163 170.57ms 207ns cuDeviceGetAttribute
164 170.57ms 183ns cuDeviceGetAttribute
165 170.57ms 183ns cuDeviceGetAttribute
166 170.57ms 186ns cuDeviceGetAttribute
167 170.57ms 183ns cuDeviceGetAttribute
168 170.57ms 183ns cuDeviceGetAttribute
169 170.57ms 184ns cuDeviceGetAttribute
170 170.57ms 183ns cuDeviceGetAttribute
171 170.58ms 199ns cuDeviceGetAttribute
172 170.58ms 189ns cuDeviceGetAttribute
173 170.58ms 191ns cuDeviceGetAttribute
174 170.58ms 213ns cuDeviceGetAttribute
175 170.58ms 113.31us cuDeviceGetAttribute
176 170.69ms 224ns cuDeviceGetAttribute
177 170.69ms 193ns cuDeviceGetAttribute
178 170.69ms 189ns cuDeviceGetAttribute
179 170.69ms 186ns cuDeviceGetAttribute
180 170.69ms 182ns cuDeviceGetAttribute
181 170.69ms 184ns cuDeviceGetAttribute
182 170.69ms 184ns cuDeviceGetAttribute
183 170.70ms 5.8260us cudaConfigureCall
184 170.71ms 182.62ms cudaLaunch (set(void) [178])
185 353.33ms 669ns cudaGetLastError
186 353.33ms 575ns cudaConfigureCall
187 353.33ms 10.142us cudaLaunch (ma4(void) [181])
188 353.34ms 250ns cudaGetLastError
189 353.34ms 40.134ms cudaDeviceSynchronize
190 393.48ms 451.45ms cudaMemcpyFromSymbol
191 844.93ms 442.59ms cudaMemcpyFromSymbol
192 1.28752s 441.20ms cudaMemcpyFromSymbol
193 1.72872s 440.79ms cudaMemcpyFromSymbol
1 $ nvprof --events all ./a.out
2 ==21569== NVPROF is profiling process 21569, command: ./a.out
3 ==21569== Warning: Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
4 16797695, 16797696.000000!=16403.000000*1023.000000+17426.000000
5 ==21569== Profiling application: ./a.out
6 ==21569== Profiling result:
7 ==21569== Event result:
8 Invocations Event Name Min Max Avg
9 Device "Tesla K40c (0)"
10 Kernel: set(void)
11 1 tex0_cache_sector_queries 0 0 0
12 1 tex1_cache_sector_queries 0 0 0
13 1 tex2_cache_sector_queries 0 0 0
14 1 tex3_cache_sector_queries 0 0 0
15 1 tex0_cache_sector_misses 0 0 0
16 1 tex1_cache_sector_misses 0 0 0
17 1 tex2_cache_sector_misses 0 0 0
18 1 tex3_cache_sector_misses 0 0 0
19 1 fb_subp0_read_sectors 7380 7380 7380
20 1 fb_subp1_read_sectors 6804 6804 6804
21 1 fb_subp0_write_sectors 58485636 58485636 58485636
22 1 fb_subp1_write_sectors 58493371 58493371 58493371
23 1 l2_subp0_write_sector_misses 25165657 25165657 25165657
24 1 l2_subp1_write_sector_misses 25165274 25165274 25165274
25 1 l2_subp2_write_sector_misses 25165342 25165342 25165342
26 1 l2_subp3_write_sector_misses 25165522 25165522 25165522
27 1 l2_subp0_read_sector_misses 2681 2681 2681
28 1 l2_subp1_read_sector_misses 2867 2867 2867
29 1 l2_subp2_read_sector_misses 2473 2473 2473
30 1 l2_subp3_read_sector_misses 3758 3758 3758
31 1 l2_subp0_write_l1_sector_queries 25165824 25165824 25165824
32 1 l2_subp1_write_l1_sector_queries 25165824 25165824 25165824
33 1 l2_subp2_write_l1_sector_queries 25165824 25165824 25165824
34 1 l2_subp3_write_l1_sector_queries 25165824 25165824 25165824
35 1 l2_subp0_read_l1_sector_queries 0 0 0
36 1 l2_subp1_read_l1_sector_queries 0 0 0
37 1 l2_subp2_read_l1_sector_queries 0 0 0
38 1 l2_subp3_read_l1_sector_queries 0 0 0
39 1 l2_subp0_read_l1_hit_sectors 0 0 0
40 1 l2_subp1_read_l1_hit_sectors 0 0 0
41 1 l2_subp2_read_l1_hit_sectors 0 0 0
42 1 l2_subp3_read_l1_hit_sectors 0 0 0
43 1 l2_subp0_read_tex_sector_queries 0 0 0
44 1 l2_subp1_read_tex_sector_queries 0 0 0
45 1 l2_subp2_read_tex_sector_queries 0 0 0
46 1 l2_subp3_read_tex_sector_queries 0 0 0
47 1 l2_subp0_read_tex_hit_sectors 0 0 0
48 1 l2_subp1_read_tex_hit_sectors 0 0 0
49 1 l2_subp2_read_tex_hit_sectors 0 0 0
50 1 l2_subp3_read_tex_hit_sectors 0 0 0
51 1 rocache_subp0_gld_thread_count_32b 0 0 0
52 1 rocache_subp1_gld_thread_count_32b 0 0 0
53 1 rocache_subp2_gld_thread_count_32b 0 0 0
54 1 rocache_subp3_gld_thread_count_32b 0 0 0
55 1 rocache_subp0_gld_thread_count_64b 0 0 0
56 1 rocache_subp1_gld_thread_count_64b 0 0 0
57 1 rocache_subp2_gld_thread_count_64b 0 0 0
58 1 rocache_subp3_gld_thread_count_64b 0 0 0
59 1 rocache_subp0_gld_thread_count_128b 0 0 0
60 1 rocache_subp1_gld_thread_count_128b 0 0 0
61 1 rocache_subp2_gld_thread_count_128b 0 0 0
62 1 rocache_subp3_gld_thread_count_128b 0 0 0
63 1 rocache_subp0_gld_warp_count_32b 0 0 0
64 1 rocache_subp1_gld_warp_count_32b 0 0 0
65 1 rocache_subp2_gld_warp_count_32b 0 0 0
66 1 rocache_subp3_gld_warp_count_32b 0 0 0
67 1 rocache_subp0_gld_warp_count_64b 0 0 0
68 1 rocache_subp1_gld_warp_count_64b 0 0 0
69 1 rocache_subp2_gld_warp_count_64b 0 0 0
70 1 rocache_subp3_gld_warp_count_64b 0 0 0
71 1 rocache_subp0_gld_warp_count_128b 0 0 0
72 1 rocache_subp1_gld_warp_count_128b 0 0 0
73 1 rocache_subp2_gld_warp_count_128b 0 0 0
74 1 rocache_subp3_gld_warp_count_128b 0 0 0
75 1 l2_subp0_read_sysmem_sector_queries 0 0 0
76 1 l2_subp1_read_sysmem_sector_queries 0 0 0
77 1 l2_subp2_read_sysmem_sector_queries 0 0 0
78 1 l2_subp3_read_sysmem_sector_queries 0 0 0
79 1 l2_subp0_write_sysmem_sector_queries 1 1 1
80 1 l2_subp1_write_sysmem_sector_queries 0 0 0
81 1 l2_subp2_write_sysmem_sector_queries 0 0 0
82 1 l2_subp3_write_sysmem_sector_queries 0 0 0
83 1 l2_subp0_total_read_sector_queries 3792 3792 3792
84 1 l2_subp1_total_read_sector_queries 3547 3547 3547
85 1 l2_subp2_total_read_sector_queries 3825 3825 3825
86 1 l2_subp3_total_read_sector_queries 3779 3779 3779
87 1 l2_subp0_total_write_sector_queries 25165826 25165826 25165826
88 1 l2_subp1_total_write_sector_queries 25165830 25165830 25165830
89 1 l2_subp2_total_write_sector_queries 25165832 25165832 25165832
90 1 l2_subp3_total_write_sector_queries 25165829 25165829 25165829
91 1 elapsed_cycles_sm 180127824 180127824 180127824
92 1 gld_inst_8bit 0 0 0
93 1 gld_inst_16bit 0 0 0
94 1 gld_inst_32bit 0 0 0
95 1 gld_inst_64bit 0 0 0
96 1 gld_inst_128bit 0 0 0
97 1 gst_inst_8bit 0 0 0
98 1 gst_inst_16bit 0 0 0
99 1 gst_inst_32bit 805306368 805306368 805306368
100 1 gst_inst_64bit 0 0 0
101 1 gst_inst_128bit 0 0 0
102 1 rocache_gld_inst_8bit 0 0 0
103 1 rocache_gld_inst_16bit 0 0 0
104 1 rocache_gld_inst_32bit 0 0 0
105 1 rocache_gld_inst_64bit 0 0 0
106 1 rocache_gld_inst_128bit 0 0 0
107 1 prof_trigger_00 0 0 0
108 1 prof_trigger_01 0 0 0
109 1 prof_trigger_02 0 0 0
110 1 prof_trigger_03 0 0 0
111 1 prof_trigger_04 0 0 0
112 1 prof_trigger_05 0 0 0
113 1 prof_trigger_06 0 0 0
114 1 prof_trigger_07 0 0 0
115 1 atom_cas_count 0 0 0
116 1 warps_launched 8388608 8388608 8388608
117 1 threads_launched 268435456 268435456 268435456
118 1 inst_executed 192937984 192937984 192937984
119 1 inst_issued1 94822874 94822874 94822874
120 1 inst_issued2 74892129 74892129 74892129
121 1 thread_inst_executed 6174015488 6174015488 6174015488
122 1 shared_load 0 0 0
123 1 shared_store 0 0 0
124 1 local_load 0 0 0
125 1 local_store 0 0 0
126 1 gld_request 0 0 0
127 1 gst_request 25165824 25165824 25165824
128 1 active_cycles 180052030 180052030 180052030
129 1 active_warps 8973929732 8973929732 8973929732
130 1 sm_cta_launched 262144 262144 262144
131 1 not_predicated_off_thread_inst_executed 6174015488 6174015488 6174015488
132 1 l1_local_load_hit 0 0 0
133 1 l1_local_load_miss 0 0 0
134 1 l1_local_store_hit 0 0 0
135 1 l1_local_store_miss 0 0 0
136 1 l1_global_load_hit 0 0 0
137 1 l1_global_load_miss 0 0 0
138 1 uncached_global_load_transaction 0 0 0
139 1 global_store_transaction 25165824 25165824 25165824
140 1 shared_load_replay 0 0 0
141 1 shared_store_replay 0 0 0
142 1 global_ld_mem_divergence_replays 0 0 0
143 1 global_st_mem_divergence_replays 0 0 0
144 1 local_load_transactions 0 0 0
145 1 local_store_transactions 0 0 0
146 1 l1_shared_load_transactions 0 0 0
147 1 l1_shared_store_transactions 0 0 0
148 1 __l1_global_load_transactions 0 0 0
149 1 __l1_global_store_transactions 52166050 52166050 52166050
150 1 atom_count 0 0 0
151 1 gred_count 0 0 0
152 Kernel: ma4(void)
153 1 tex0_cache_sector_queries 0 0 0
154 1 tex1_cache_sector_queries 0 0 0
155 1 tex2_cache_sector_queries 0 0 0
156 1 tex3_cache_sector_queries 0 0 0
157 1 tex0_cache_sector_misses 0 0 0
158 1 tex1_cache_sector_misses 0 0 0
159 1 tex2_cache_sector_misses 0 0 0
160 1 tex3_cache_sector_misses 0 0 0
161 1 fb_subp0_read_sectors 59042830 59042830 59042830
162 1 fb_subp1_read_sectors 59048635 59048635 59048635
163 1 fb_subp0_write_sectors 19809272 19809272 19809272
164 1 fb_subp1_write_sectors 19812483 19812483 19812483
165 1 l2_subp0_write_sector_misses 8388611 8388611 8388611
166 1 l2_subp1_write_sector_misses 8388610 8388610 8388610
167 1 l2_subp2_write_sector_misses 8388612 8388612 8388612
168 1 l2_subp3_write_sector_misses 8388609 8388609 8388609
169 1 l2_subp0_read_sector_misses 25169862 25169862 25169862
170 1 l2_subp1_read_sector_misses 25170525 25170525 25170525
171 1 l2_subp2_read_sector_misses 25170366 25170366 25170366
172 1 l2_subp3_read_sector_misses 25170841 25170841 25170841
173 1 l2_subp0_write_l1_sector_queries 8388608 8388608 8388608
174 1 l2_subp1_write_l1_sector_queries 8388608 8388608 8388608
175 1 l2_subp2_write_l1_sector_queries 8388608 8388608 8388608
176 1 l2_subp3_write_l1_sector_queries 8388608 8388608 8388608
177 1 l2_subp0_read_l1_sector_queries 25165824 25165824 25165824
178 1 l2_subp1_read_l1_sector_queries 25165824 25165824 25165824
179 1 l2_subp2_read_l1_sector_queries 25165824 25165824 25165824
180 1 l2_subp3_read_l1_sector_queries 25165824 25165824 25165824
181 1 l2_subp0_read_l1_hit_sectors 0 0 0
182 1 l2_subp1_read_l1_hit_sectors 0 0 0
183 1 l2_subp2_read_l1_hit_sectors 0 0 0
184 1 l2_subp3_read_l1_hit_sectors 0 0 0
185 1 l2_subp0_read_tex_sector_queries 0 0 0
186 1 l2_subp1_read_tex_sector_queries 0 0 0
187 1 l2_subp2_read_tex_sector_queries 0 0 0
188 1 l2_subp3_read_tex_sector_queries 0 0 0
189 1 l2_subp0_read_tex_hit_sectors 0 0 0
190 1 l2_subp1_read_tex_hit_sectors 0 0 0
191 1 l2_subp2_read_tex_hit_sectors 0 0 0
192 1 l2_subp3_read_tex_hit_sectors 0 0 0
193 1 rocache_subp0_gld_thread_count_32b 0 0 0
194 1 rocache_subp1_gld_thread_count_32b 0 0 0
195 1 rocache_subp2_gld_thread_count_32b 0 0 0
196 1 rocache_subp3_gld_thread_count_32b 0 0 0
197 1 rocache_subp0_gld_thread_count_64b 0 0 0
198 1 rocache_subp1_gld_thread_count_64b 0 0 0
199 1 rocache_subp2_gld_thread_count_64b 0 0 0
200 1 rocache_subp3_gld_thread_count_64b 0 0 0
201 1 rocache_subp0_gld_thread_count_128b 0 0 0
202 1 rocache_subp1_gld_thread_count_128b 0 0 0
203 1 rocache_subp2_gld_thread_count_128b 0 0 0
204 1 rocache_subp3_gld_thread_count_128b 0 0 0
205 1 rocache_subp0_gld_warp_count_32b 0 0 0
206 1 rocache_subp1_gld_warp_count_32b 0 0 0
207 1 rocache_subp2_gld_warp_count_32b 0 0 0
208 1 rocache_subp3_gld_warp_count_32b 0 0 0
209 1 rocache_subp0_gld_warp_count_64b 0 0 0
210 1 rocache_subp1_gld_warp_count_64b 0 0 0
211 1 rocache_subp2_gld_warp_count_64b 0 0 0
212 1 rocache_subp3_gld_warp_count_64b 0 0 0
213 1 rocache_subp0_gld_warp_count_128b 0 0 0
214 1 rocache_subp1_gld_warp_count_128b 0 0 0
215 1 rocache_subp2_gld_warp_count_128b 0 0 0
216 1 rocache_subp3_gld_warp_count_128b 0 0 0
217 1 l2_subp0_read_sysmem_sector_queries 0 0 0
218 1 l2_subp1_read_sysmem_sector_queries 0 0 0
219 1 l2_subp2_read_sysmem_sector_queries 0 0 0
220 1 l2_subp3_read_sysmem_sector_queries 0 0 0
221 1 l2_subp0_write_sysmem_sector_queries 0 0 0
222 1 l2_subp1_write_sysmem_sector_queries 1 1 1
223 1 l2_subp2_write_sysmem_sector_queries 4 4 4
224 1 l2_subp3_write_sysmem_sector_queries 0 0 0
225 1 l2_subp0_total_read_sector_queries 25171411 25171411 25171411
226 1 l2_subp1_total_read_sector_queries 25171516 25171516 25171516
227 1 l2_subp2_total_read_sector_queries 25169937 25169937 25169937
228 1 l2_subp3_total_read_sector_queries 25170465 25170465 25170465
229 1 l2_subp0_total_write_sector_queries 8388610 8388610 8388610
230 1 l2_subp1_total_write_sector_queries 8388609 8388609 8388609
231 1 l2_subp2_total_write_sector_queries 8388612 8388612 8388612
232 1 l2_subp3_total_write_sector_queries 8388610 8388610 8388610
233 1 elapsed_cycles_sm 268985103 268985103 268985103
234 1 gld_inst_8bit 0 0 0
235 1 gld_inst_16bit 0 0 0
236 1 gld_inst_32bit 805306368 805306368 805306368
237 1 gld_inst_64bit 0 0 0
238 1 gld_inst_128bit 0 0 0
239 1 gst_inst_8bit 0 0 0
240 1 gst_inst_16bit 0 0 0
241 1 gst_inst_32bit 268435456 268435456 268435456
242 1 gst_inst_64bit 0 0 0
243 1 gst_inst_128bit 0 0 0
244 1 rocache_gld_inst_8bit 0 0 0
245 1 rocache_gld_inst_16bit 0 0 0
246 1 rocache_gld_inst_32bit 0 0 0
247 1 rocache_gld_inst_64bit 0 0 0
248 1 rocache_gld_inst_128bit 0 0 0
249 1 prof_trigger_00 0 0 0
250 1 prof_trigger_01 0 0 0
251 1 prof_trigger_02 0 0 0
252 1 prof_trigger_03 0 0 0
253 1 prof_trigger_04 0 0 0
254 1 prof_trigger_05 0 0 0
255 1 prof_trigger_06 0 0 0
256 1 prof_trigger_07 0 0 0
257 1 atom_cas_count 0 0 0
258 1 warps_launched 8388608 8388608 8388608
259 1 threads_launched 268435456 268435456 268435456
260 1 inst_executed 218103808 218103808 218103808
261 1 inst_issued1 107961320 107961320 107961320
262 1 inst_issued2 89349093 89349093 89349093
263 1 thread_inst_executed 6979321856 6979321856 6979321856
264 1 shared_load 0 0 0
265 1 shared_store 0 0 0
266 1 local_load 0 0 0
267 1 local_store 0 0 0
268 1 gld_request 25165824 25165824 25165824
269 1 gst_request 8388608 8388608 8388608
270 1 active_cycles 268843820 268843820 268843820
271 1 active_warps 1.3604e+10 1.3604e+10 1.3604e+10
272 1 sm_cta_launched 262144 262144 262144
273 1 not_predicated_off_thread_inst_executed 6979321856 6979321856 6979321856
274 1 l1_local_load_hit 0 0 0
275 1 l1_local_load_miss 0 0 0
276 1 l1_local_store_hit 0 0 0
277 1 l1_local_store_miss 0 0 0
278 1 l1_global_load_hit 0 0 0
279 1 l1_global_load_miss 0 0 0
280 1 uncached_global_load_transaction 25165824 25165824 25165824
281 1 global_store_transaction 8388608 8388608 8388608
282 1 shared_load_replay 0 0 0
283 1 shared_store_replay 0 0 0
284 1 global_ld_mem_divergence_replays 0 0 0
285 1 global_st_mem_divergence_replays 0 0 0
286 1 local_load_transactions 0 0 0
287 1 local_store_transactions 0 0 0
288 1 l1_shared_load_transactions 0 0 0
289 1 l1_shared_store_transactions 0 0 0
290 1 __l1_global_load_transactions 39559910 39559910 39559910
291 1 __l1_global_store_transactions 8509939 8509939 8509939
292 1 atom_count 0 0 0
293 1 gred_count 0 0 0
1 $ nvprof --metrics all ./a.out
2 ==18308== NVPROF is profiling process 18308, command: ./a.out
3 ==18308== Warning: Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
4 16797695, 16797696.000000!=16403.000000*1023.000000+17426.000000
5 ==18308== Profiling application: ./a.out
6 ==18308== Profiling result:
7 ==18308== Metric result:
8 Invocations Metric Name Metric Description Min Max Avg
9 Device "Tesla K40c (0)"
10 Kernel: set(void)
11 1 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00%
12 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00%
13 1 sm_efficiency Multiprocessor Activity 99.95% 99.95% 99.95%
14 1 ipc Executed IPC 1.071637 1.071637 1.071637
15 1 achieved_occupancy Achieved Occupancy 0.778590 0.778590 0.778590
16 1 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
17 1 gst_requested_throughput Requested Global Store Throughput 202.65GB/s 202.65GB/s 202.65GB/s
18 1 sm_efficiency_instance Multiprocessor Activity 99.95% 99.95% 99.95%
19 1 ipc_instance Executed IPC 1.071637 1.071637 1.071637
20 1 inst_replay_overhead Instruction Replay Overhead 0.267629 0.267629 0.267629
21 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000
22 1 global_replay_overhead Global Memory Replay Overhead 0.000000 0.000000 0.000000
23 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.000000 0.000000 0.000000
24 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00%
25 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s
26 1 dram_read_throughput Device Memory Read Throughput 29.596MB/s 29.596MB/s 29.596MB/s
27 1 dram_write_throughput Device Memory Write Throughput 235.57GB/s 235.57GB/s 235.57GB/s
28 1 gst_throughput Global Store Throughput 202.65GB/s 202.65GB/s 202.65GB/s
29 1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
30 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000
31 1 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
32 1 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
33 1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
34 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00%
35 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00%
36 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 0.00000B/s 0.00000B/s 0.00000B/s
37 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s
38 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00%
39 1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00%
40 1 nc_gld_requested_throughput Requested Non-Coherent Global Load Throu 0.00000B/s 0.00000B/s 0.00000B/s
41 1 issued_ipc Issued IPC 1.358438 1.358438 1.358438
42 1 inst_per_warp Instructions per warp 23.000000 23.000000 23.000000
43 1 issue_slot_utilization Issue Slot Utilization 23.56% 23.56% 23.56%
44 1 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000
45 1 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000
46 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.000000 0.000000 0.000000
47 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.000000 0.000000 0.000000
48 1 gld_transactions_per_request Global Load Transactions Per Request 0.000000 0.000000 0.000000
49 1 gst_transactions_per_request Global Store Transactions Per Request 1.000000 1.000000 1.000000
50 1 local_load_transactions Local Load Transactions 0 0 0
51 1 local_store_transactions Local Store Transactions 0 0 0
52 1 shared_load_transactions Shared Load Transactions 0 0 0
53 1 shared_store_transactions Shared Store Transactions 0 0 0
54 1 gld_transactions Global Load Transactions 0 0 0
55 1 gst_transactions Global Store Transactions 25165824 25165824 25165824
56 1 sysmem_read_transactions System Memory Read Transactions 0 0 0
57 1 sysmem_write_transactions System Memory Write Transactions 6 6 6
58 1 tex_cache_transactions Texture Cache Transactions 0 0 0
59 1 dram_read_transactions Device Memory Read Transactions 14701 14701 14701
60 1 dram_write_transactions Device Memory Write Transactions 117011651 117011651 117011651
61 1 l2_read_transactions L2 Read Transactions 14666 14666 14666
62 1 l2_write_transactions L2 Write Transactions 100663312 100663312 100663312
63 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
64 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s
65 1 shared_load_throughput Shared Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
66 1 shared_store_throughput Shared Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s
67 1 l2_read_throughput L2 Throughput (Reads) 29.525MB/s 29.525MB/s 29.525MB/s
68 1 l2_write_throughput L2 Throughput (Writes) 202.65GB/s 202.65GB/s 202.65GB/s
69 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s
70 1 sysmem_write_throughput System Memory Write Throughput 12.079KB/s 12.079KB/s 12.079KB/s
71 1 warp_nonpred_execution_efficiency Warp Non-Predicated Execution Efficiency 100.00% 100.00% 100.00%
72 1 cf_issued Issued Control-Flow Instructions 15157795 15157795 15157795
73 1 cf_executed Executed Control-Flow Instructions 8388608 8388608 8388608
74 1 ldst_issued Issued Load/Store Instructions 59992605 59992605 59992605
75 1 ldst_executed Executed Load/Store Instructions 25165824 25165824 25165824
76 1 flops_sp FLOPS(Single) 268435456 268435456 268435456
77 1 flops_sp_add FLOPS(Single Add) 268435456 268435456 268435456
78 1 flops_sp_mul FLOPS(Single Mul) 0 0 0
79 1 flops_sp_fma FLOPS(Single FMA) 0 0 0
80 1 flops_dp FLOPS(Double) 0 0 0
81 1 flops_dp_add FLOPS(Double Add) 0 0 0
82 1 flops_dp_mul FLOPS(Double Mul) 0 0 0
83 1 flops_dp_fma FLOPS(Double FMA) 0 0 0
84 1 flops_sp_special FLOPS(Single Special) 0 0 0
85 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 1.53% 1.53% 1.53%
86 1 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 14.11% 14.11% 14.11%
87 1 stall_data_request Issue Stall Reasons (Data Request) 56.32% 56.32% 56.32%
88 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00%
89 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00%
90 1 stall_other Issue Stall Reasons (Other) 20.20% 20.20% 20.20%
91 1 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1)
92 1 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4)
93 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0)
94 1 dram_utilization Device Memory Utilization High (9) High (9) High (9)
95 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1)
96 1 ldst_fu_utilization Load/Store Function Unit Utilization Mid (4) Mid (4) Mid (4)
97 1 alu_fu_utilization Arithmetic Function Unit Utilization Mid (5) Mid (5) Mid (5)
98 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1)
99 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0)
100 1 inst_executed Instructions Executed 192937984 192937984 192937984
101 1 inst_issued Instructions Issued 244573713 244573713 244573713
102 1 issue_slots Issue Slots 169693573 169693573 169693573
103 1 nc_l2_read_throughput L2 Throughput (Non-Coherent Reads) 0.00000B/s 0.00000B/s 0.00000B/s
104 1 nc_l2_read_transactions L2 Non-Coherent Read Transactions 0 0 0
105 1 nc_cache_global_hit_rate Non-Coherent Global Hit Rate 0.00% 0.00% 0.00%
106 1 nc_gld_throughput Non-Coherent Global Memory Load Throughp 0.00000B/s 0.00000B/s 0.00000B/s
107 1 nc_gld_efficiency Non-Coherent Global Load Efficiency 0.00% 0.00% 0.00%
108 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s
109 1 inst_fp_32 FP Instructions(Single) 268435456 268435456 268435456
110 1 inst_fp_64 FP Instructions(Double) 0 0 0
111 1 inst_integer Integer Instructions 1879048192 1879048192 1879048192
112 1 inst_bit_convert Bit-Convert Instructions 536870912 536870912 536870912
113 1 inst_control Control-Flow Instructions 268435456 268435456 268435456
114 1 inst_compute_ld_st Load/Store Instructions 805306368 805306368 805306368
115 1 inst_misc Misc Instructions 2415919104 2415919104 2415919104
116 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0
117 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000
118 1 atomic_transactions Atomic Transactions 0 0 0
119 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000
120 Kernel: ma4(void)
121 1 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00%
122 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00%
123 1 sm_efficiency Multiprocessor Activity 99.95% 99.95% 99.95%
124 1 ipc Executed IPC 0.814540 0.814540 0.814540
125 1 achieved_occupancy Achieved Occupancy 0.790154 0.790154 0.790154
126 1 gld_requested_throughput Requested Global Load Throughput 135.78GB/s 135.78GB/s 135.78GB/s
127 1 gst_requested_throughput Requested Global Store Throughput 45.259GB/s 45.259GB/s 45.259GB/s
128 1 sm_efficiency_instance Multiprocessor Activity 99.95% 99.95% 99.95%
129 1 ipc_instance Executed IPC 0.814540 0.814540 0.814540
130 1 inst_replay_overhead Instruction Replay Overhead 0.314307 0.314307 0.314307
131 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000
132 1 global_replay_overhead Global Memory Replay Overhead 0.000000 0.000000 0.000000
133 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.000000 0.000000 0.000000
134 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00%
135 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s
136 1 dram_read_throughput Device Memory Read Throughput 159.31GB/s 159.31GB/s 159.31GB/s
137 1 dram_write_throughput Device Memory Write Throughput 53.441GB/s 53.441GB/s 53.441GB/s
138 1 gst_throughput Global Store Throughput 45.259GB/s 45.259GB/s 45.259GB/s
139 1 gld_throughput Global Load Throughput 135.78GB/s 135.78GB/s 135.78GB/s
140 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000
141 1 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
142 1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
143 1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
144 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00%
145 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00%
146 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 135.78GB/s 135.78GB/s 135.78GB/s
147 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s
148 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00%
149 1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00%
150 1 nc_gld_requested_throughput Requested Non-Coherent Global Load Throu 0.00000B/s 0.00000B/s 0.00000B/s
151 1 issued_ipc Issued IPC 1.070556 1.070556 1.070556
152 1 inst_per_warp Instructions per warp 26.000000 26.000000 26.000000
153 1 issue_slot_utilization Issue Slot Utilization 18.42% 18.42% 18.42%
154 1 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000
155 1 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000
156 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.000000 0.000000 0.000000
157 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.000000 0.000000 0.000000
158 1 gld_transactions_per_request Global Load Transactions Per Request 1.000000 1.000000 1.000000
159 1 gst_transactions_per_request Global Store Transactions Per Request 1.000000 1.000000 1.000000
160 1 local_load_transactions Local Load Transactions 0 0 0
161 1 local_store_transactions Local Store Transactions 0 0 0
162 1 shared_load_transactions Shared Load Transactions 0 0 0
163 1 shared_store_transactions Shared Store Transactions 0 0 0
164 1 gld_transactions Global Load Transactions 25165824 25165824 25165824
165 1 gst_transactions Global Store Transactions 8388608 8388608 8388608
166 1 sysmem_read_transactions System Memory Read Transactions 0 0 0
167 1 sysmem_write_transactions System Memory Write Transactions 0 0 0
168 1 tex_cache_transactions Texture Cache Transactions 0 0 0
169 1 dram_read_transactions Device Memory Read Transactions 118109420 118109420 118109420
170 1 dram_write_transactions Device Memory Write Transactions 39619957 39619957 39619957
171 1 l2_read_transactions L2 Read Transactions 100681792 100681792 100681792
172 1 l2_write_transactions L2 Write Transactions 33554441 33554441 33554441
173 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
174 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s
175 1 shared_load_throughput Shared Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
176 1 shared_store_throughput Shared Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s
177 1 l2_read_throughput L2 Throughput (Reads) 135.80GB/s 135.80GB/s 135.80GB/s
178 1 l2_write_throughput L2 Throughput (Writes) 45.259GB/s 45.259GB/s 45.259GB/s
179 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s
180 1 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s
181 1 warp_nonpred_execution_efficiency Warp Non-Predicated Execution Efficiency 100.00% 100.00% 100.00%
182 1 cf_issued Issued Control-Flow Instructions 22238969 22238969 22238969
183 1 cf_executed Executed Control-Flow Instructions 8388608 8388608 8388608
184 1 ldst_issued Issued Load/Store Instructions 61821484 61821484 61821484
185 1 ldst_executed Executed Load/Store Instructions 33554432 33554432 33554432
186 1 flops_sp FLOPS(Single) 536870912 536870912 536870912
187 1 flops_sp_add FLOPS(Single Add) 0 0 0
188 1 flops_sp_mul FLOPS(Single Mul) 0 0 0
189 1 flops_sp_fma FLOPS(Single FMA) 268435456 268435456 268435456
190 1 flops_dp FLOPS(Double) 0 0 0
191 1 flops_dp_add FLOPS(Double Add) 0 0 0
192 1 flops_dp_mul FLOPS(Double Mul) 0 0 0
193 1 flops_dp_fma FLOPS(Double FMA) 0 0 0
194 1 flops_sp_special FLOPS(Single Special) 0 0 0
195 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 2.23% 2.23% 2.23%
196 1 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 69.67% 69.67% 69.67%
197 1 stall_data_request Issue Stall Reasons (Data Request) 10.15% 10.15% 10.15%
198 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00%
199 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00%
200 1 stall_other Issue Stall Reasons (Other) 12.57% 12.57% 12.57%
201 1 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1)
202 1 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4)
203 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0)
204 1 dram_utilization Device Memory Utilization High (8) High (8) High (8)
205 1 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0)
206 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3)
207 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (3) Low (3) Low (3)
208 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1)
209 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0)
210 1 inst_executed Instructions Executed 218103808 218103808 218103808
211 1 inst_issued Instructions Issued 286655297 286655297 286655297
212 1 issue_slots Issue Slots 197295405 197295405 197295405
213 1 nc_l2_read_throughput L2 Throughput (Non-Coherent Reads) 0.00000B/s 0.00000B/s 0.00000B/s
214 1 nc_l2_read_transactions L2 Non-Coherent Read Transactions 0 0 0
215 1 nc_cache_global_hit_rate Non-Coherent Global Hit Rate 0.00% 0.00% 0.00%
216 1 nc_gld_throughput Non-Coherent Global Memory Load Throughp 0.00000B/s 0.00000B/s 0.00000B/s
217 1 nc_gld_efficiency Non-Coherent Global Load Efficiency 0.00% 0.00% 0.00%
218 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s
219 1 inst_fp_32 FP Instructions(Single) 268435456 268435456 268435456
220 1 inst_fp_64 FP Instructions(Double) 0 0 0
221 1 inst_integer Integer Instructions 2415919104 2415919104 2415919104
222 1 inst_bit_convert Bit-Convert Instructions 0 0 0
223 1 inst_control Control-Flow Instructions 268435456 268435456 268435456
224 1 inst_compute_ld_st Load/Store Instructions 1073741824 1073741824 1073741824
225 1 inst_misc Misc Instructions 2952790016 2952790016 2952790016
226 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0
227 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000
228 1 atomic_transactions Atomic Transactions 0 0 0
229 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000
checkCudaErrors()
.getLastCudaError()
.blockDim.x*blockDim.y*blockDim.z<=1024
, etc.cuda-memcheck
--tool memcheck
--tool racecheck
nvprof
para ver que todos los kernels esté ejecutando.sgemm
N×N hilos, cada uno c[i][j] = a[i][]*b[][j]
, 2N
FLOPS, 2N
Memoria.
(NVIDIA, NVIDIA CUDA C Programming Guide)
N×N
definidas en runtime.BX×BY
definidos en runtime.N
múltiplo de BX
o BY
.Kernel
1 // 2D to 1D bijection
2 #define IX(i,j) ((i)*(N)+(j))
3
4 __global__ void sgemm(const uint N, float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
5 const uint i = blockIdx.y*blockDim.y + threadIdx.y;
6 const uint j = blockIdx.x*blockDim.x + threadIdx.x;
7 if (i<N && j<N)
8 for (uint k=0; k<N; ++k)
9 c[IX(i,j)] += a[IX(i,k)] * b[IX(k,j)];
10 }
Notar:
» Uso de __restrict__
.
» blockDim.{x,y}
para conocer el tamaño del bloque en runtime.
» Tipos de CUDA: uint
.
1 checkCudaErrors(cudaMalloc(&d_a, SIZE * sizeof(float)));
2 checkCudaErrors(cudaMalloc(&d_b, SIZE * sizeof(float)));
3 checkCudaErrors(cudaMalloc(&d_c, SIZE * sizeof(float)));
4 h_a = (float *) calloc(SIZE, sizeof(float));
5 h_b = (float *) calloc(SIZE, sizeof(float));
6 h_c = (float *) calloc(SIZE, sizeof(float));
7 assert(d_a && d_b && d_c && h_a && h_b && h_c);
1 // integer ceiling division
2 #define DIV_CEIL(a,b) (((a)+(b)-1)/(b))
3 dim3 grid_size(DIV_CEIL(N,BX), DIV_CEIL(N,BY));
4 dim3 block_size(BX,BY);
5 setmm<<<grid_size, block_size>>>(N, d_a, d_b, d_c);
6 getLastCudaError("setmm kernel failed");
7 sgemm<<<grid_size, block_size>>>(N, d_a, d_b, d_c);
8 getLastCudaError("sgemm kernel failed");
9 cudaDeviceSynchronize();
Notar:
» Uso de DIV_CEIL
para que haya suficientes bloques.
» Revisión de errores.
1 checkCudaErrors(cudaMemcpy(h_a, d_a, SIZE*sizeof(float), cudaMemcpyDefault));
2 checkCudaErrors(cudaMemcpy(h_b, d_b, SIZE*sizeof(float), cudaMemcpyDefault));
3 checkCudaErrors(cudaMemcpy(h_c, d_c, SIZE*sizeof(float), cudaMemcpyDefault));
4 double max_diff = 0.0;
5 for (unsigned int i=0; i<N; ++i) {
6 for (unsigned int j=0; j<N; ++j) {
7 float cij = 0.0f;
8 for (unsigned int k=0; k<N; ++k)
9 cij += h_a[IX(i,k)] * h_b[IX(k,j)];
10 max_diff = MAX(max_diff, abs(cij-h_c[IX(i,j)]));
11 }
12 }
13 printf("max_diff: %f\n", max_diff);
Notar:
1 $ nvprof ./sgemm 256 16 16
2 ==21650== NVPROF is profiling process 21650, command: ./sgemm 256 16 16
3 max_diff: 0.000023
4 ==21650== Profiling application: ./sgemm 256 16 16
5 ==21650== Profiling result:
6 Time(%) Time Calls Avg Min Max Name
7 67.35% 283.52us 1 283.52us 283.52us 283.52us sgemm(unsigned int, float*, float*, float*)
8 30.79% 129.63us 3 43.211us 42.913us 43.392us [CUDA memcpy DtoH]
9 1.86% 7.8400us 1 7.8400us 7.8400us 7.8400us setmm(unsigned int, float*, float*, float*)
10 $ cuda-memcheck ./sgemm 256 16 16
11 ========= CUDA-MEMCHECK
12 max_diff: 0.000023
13 ========= ERROR SUMMARY: 0 errors
14 $ cuda-memcheck --tool racecheck ./sgemm 256 16 16
15 ========= CUDA-MEMCHECK
16 max_diff: 0.000023
17 ========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
Voy con pies de plomo
sgemm N BX BY
1 $ ./sgemm 1024 64 64
2 Cuda error: setmm kernel failed in file 'sgemm.cu' in line 54 : invalid configuration argument.
3 $ ./sgemm 1024 32 32
4 max_diff: 0.000092
5 $ ./sgemm 512 32 32
6 max_diff: 0.000046
7 $ ./sgemm 2048 32 32
8 max_diff: 0.000183
9 $ ./sgemm 256 1 1
10 max_diff: 0.000023
cuda-memcheck
.2048 32 32
la comprobación CPU es costosísima.1 $ nvcc -arch=sm_35 sgemm.cu -ptx
sgemm
1 BB3_2:
2 mul.wide.u32 %rd9, %r22, 4
3 add.s64 %rd10, %rd2, %rd9
4 mul.wide.u32 %rd11, %r21, 4
5 add.s64 %rd12, %rd1, %rd11
6 ld.global.nc.f32 %f4, [%rd12]
7 ld.global.nc.f32 %f5, [%rd10]
8 fma.rn.f32 %f6, %f5, %f4, %f6
9 add.s32 %r22, %r22, 1
10 add.s32 %r21, %r21, %r17
11 add.s32 %r23, %r23, 1
12 setp.lt.u32 %p6, %r23, %r17
13 @%p6 bra BB3_2
También con:
1 $cuobjdump -ptx sgemm
1 $ cuobjdump -sass sgemm
Vemos el assembler
1 /*01c8*/ MOV32I R10, 0x4; /* 0x74000000021fc02a */
2 /*01d0*/ IMAD.U32.U32 R6.CC, R12, R10, c[0x0][0x148]; /* 0x90042800291c301a */
3 /*01d8*/ IADD R13, R13, 0x1; /* 0xc0800000009c3435 */
4 /*01e0*/ IMAD.U32.U32.HI.X R7, R12, R10, c[0x0][0x14c]; /* 0x92102800299c301e */
5 /*01e8*/ ISETP.LT.U32.AND P0, PT, R13, c[0x0][0x140], PT; /* 0x5b101c00281c341e */
6 /*01f0*/ IMAD.U32.U32 R8.CC, R5, R10, c[0x0][0x150]; /* 0x900428002a1c1422 */
7 /*01f8*/ LDG.E R7, [R6]; /* 0x600210847f9c181d */
8 /* 0x08b813088010a010 */
9 /*0208*/ IMAD.U32.U32.HI.X R9, R5, R10, c[0x0][0x154]; /* 0x921028002a9c1426 */
10 /*0210*/ IADD R12, R12, 0x1; /* 0xc0800000009c3031 */
11 /*0218*/ IADD R5, R5, c[0x0][0x140]; /* 0x60800000281c1416 */
12 /*0220*/ LDG.E R4, [R8]; /* 0x600210847f9c2011 */
13 /*0228*/ TEXDEPBAR 0x0; /* 0x77000000001c0002 */
14 /*0230*/ FFMA R0, R7, R4, R0; /* 0xcc000000021c1c02 */
15 /*0238*/ @P0 BRA 0x1d0; /* 0x12007fffc800003c */
-ptxas-options=-v
1 $ nvcc sgemm.cu -arch=sm_35 --ptxas-options=-v -o sgemm.o
2 ptxas info : 0 bytes gmem, 24 bytes cmem[3]
3 ptxas info : Compiling entry function '_Z5setmmjPfS_S_' for 'sm_35'
4 ptxas info : Function properties for _Z5setmmjPfS_S_
5 32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
6 ptxas info : Used 15 registers, 352 bytes cmem[0], 48 bytes cmem[2]
7 ptxas info : Compiling entry function '_Z5sgemmjPfS_S_' for 'sm_35'
8 ptxas info : Function properties for _Z5sgemmjPfS_S_
9 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
10 ptxas info : Used 18 registers, 352 bytes cmem[0]
1 $ nvprof --print-gpu-trace ./sgemm 1024 32 32
2 ==5685== NVPROF is profiling process 5685, command: ./sgemm 1024 32 32
3 max_diff: 0.000092
4 ==5685== Profiling application: ./sgemm 1024 32 32
5 ==5685== Profiling result:
6 Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
7 327.46ms 203.14us (32 32 1) (32 32 1) 15 0B 0B - - Tesla K40c (0) 1 7 setmm(unsigned int, float*, float*, float*) [185]
8 327.67ms 16.545ms (32 32 1) (32 32 1) 18 0B 0B - - Tesla K40c (0) 1 7 sgemm(unsigned int, float*, float*, float*) [192]
Ambas informan 15 y 18 registros respectivamente.
1 $ nvprof --metrics ipc,flops_sp,gld_throughput,gst_throughput ./sgemm 1024 32 32
2 ==9609== NVPROF is profiling process 9609, command: ./sgemm 1024 32 32
3 ==9609== Warning: Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
4 max_diff: 0.000092
5 ==9609== Profiling application: ./sgemm 1024 32 32
6 ==9609== Profiling result:
7 ==9609== Metric result:
8 Invocations Metric Name Metric Description Min Max Avg
9 Device "Tesla K40c (0)"
10 Kernel: sgemm(unsigned int, float*, float*, float*)
11 1 ipc Executed IPC 2.102029 2.102029 2.102029
12 1 gst_throughput Global Store Throughput 253.94MB/s 253.94MB/s 253.94MB/s
13 1 gld_throughput Global Load Throughput 253.94MB/s 253.94MB/s 253.94MB/s
14 1 flops_sp FLOPS(Single) 2147483648 2147483648 2147483648
15 Kernel: setmm(unsigned int, float*, float*, float*)
16 1 ipc Executed IPC 2.414024 2.414024 2.414024
17 1 gst_throughput Global Store Throughput 61.940GB/s 61.940GB/s 61.940GB/s
18 1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
19 1 flops_sp FLOPS(Single) 39836070 39836070 39836070
» Buen uso del BW de memoria: 61 GiB/s sobre un total de ~200 GiB/s.
» Pésimo uso de la potencia de cálculo: 2.14 GFLOPS en sp! (tiene 4 TFLOPS sp).
¿Quiénes somos?: sgeeeeemmm!!!
¿Como es nuestra intensidad aritmética?: lineal al tamaño del problema!!!
¿Como estamos funcionando?: pésimooooo!
gputime
vs. BX
,BY
(C2070, 2012)gputime
en µs
gputime
vs. BX
,BY
(C2070, 2012, zoom)gputime
en µs
BX
=16,24,32.gputime
= 33.264ms para 2×1024^3 FLOP: 60 GFLOPS>>> ((2*1024**3)/0.033) / (1<<30)
60.6060606060606
1 $ grep -v "^ " sgemm-c2070.dat | sort -n -k 3 | head -16
2 32 5 33264.832
3 32 6 33275.199
4 32 24 33732.734
5 ...
6 16 10 35538.559
7 32 3 35789.473
8 32 8 35800.672
a
se lee y la c
se escribe de a warps de 32 hilos accediendo a 128 bytes consecutivos de la memoria.set logscale zcb 2
El top-16
1 $ grep -v "^ " sgemm-k40.dat | sort -n -k 3 | head -16
2 32 7 15819.616
3 16 14 15915.968
4 16 16 16027.520
5 32 8 16068.768
6 32 16 16092.128
7 8 32 16112.224
8 16 18 16115.968
9 32 9 16156.960
10 4 32 16188.736
11 9 32 16195.232
12 18 16 16196.064
13 8 28 16201.536
14 7 32 16203.808
15 16 8 16228.736
16 8 16 16229.504
17 17 15 16255.296
logscale
aprovechar bien la paleta.warpsize/2
, warpsize
).Table of Contents | t |
---|---|
Exposé | ESC |
Full screen slides | e |
Presenter View | p |
Source Files | s |
Slide Numbers | n |
Toggle screen blanking | b |
Show/hide slide context | c |
Notes | 2 |
Help | h |