Nicolás Wolovick, $Date: 2012-06-06 18:53:09 -0300 (Wed, 06 Jun 2012) $, $Revision: 3586 $
fma0.cu
) 1 #include <cuda.h>
2
3 #define N (1<<25)
4 #define BLOCK_SIZE 128
5
6 __device__ float a[N], b[N], c[N], d[N]; // device globals; never initialized anywhere before use
7
8 __global__ void fma4(void) { // d = a*b + c, one element per thread; no bounds guard, grid must cover N exactly
9 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
10 d[tid] = a[tid]*b[tid]+c[tid];
11 }
12
13 int main(void)
14 {
15 fma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>(); // BUG (intentional, see slides): N/BLOCK_SIZE = 1<<18 blocks > 65535 gridDim.x limit on sm_20 -> launch fails, and no error is checked
16 cudaThreadSynchronize(); // deprecated API (use cudaDeviceSynchronize); return code ignored, so the failure is silent
17 return 0;
18 }
De 10 líneas de código (4 realmente útiles) y muchos errores.
Usaremos las herramientas disponibles y plantearemos buenas prácticas.
1 $ make
2 nvcc -g -G -arch=sm_20 --ptxas-options=-v -I/opt/cudasdk/C/common/inc -o fma0.o -c fma0.cu
3 $ ./fma0
4 $
Silently returned
fma1.cu
) 1 #include <cuda.h>
2 #include <stdio.h>
3
4 #define N (1<<25)
5 #define BLOCK_SIZE 128
6
7 __device__ float a[N], b[N], c[N], d[N]; // device globals, uninitialized
8 float ha[N]={0.0f}, hb[N]={0.0f}, hc[N]={0.0f}, hd[N]={0.0f}; // aggregate init: every element zero
9
10 __global__ void fma4(void) { // d = a*b + c, one element per thread
11 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
12 d[tid] = a[tid]*b[tid]+c[tid];
13 }
14
15 int main(void)
16 {
17 fma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>(); // BUG (intentional): 1<<18 blocks still exceeds the 65535 grid limit -> kernel never runs
18 cudaDeviceSynchronize(); // return value ignored: launch error not surfaced
19 cudaMemcpy(ha, a, N*sizeof(float), cudaMemcpyDefault); // BUG (intentional): 'a' is a __device__ symbol, not a device pointer; treated as host address -> profiler shows memcpyHtoH (correct API: cudaMemcpyFromSymbol)
20 cudaMemcpy(hb, b, N*sizeof(float), cudaMemcpyDefault);
21 cudaMemcpy(hc, c, N*sizeof(float), cudaMemcpyDefault);
22 cudaMemcpy(hd, d, N*sizeof(float), cudaMemcpyDefault);
23 for (unsigned int i=0; i<N; ++i)
24 if (hd[i] != ha[i]*hb[i]+hc[i]) { // passes vacuously: every array stayed 0.0f, so 0 != 0*0+0 never triggers
25 printf("%d, %f!=%f*%f+%f\n", i, hd[i],ha[i],hb[i],hc[i]);
26 break;
27 }
28
29 return 0;
30 }
fma1.cu
1 $ make
2 nvcc -g -G -arch=sm_20 --ptxas-options=-v -I/opt/cudasdk/C/common/inc -o fma1.o -c fma1.cu
3 nvcc -g -G -arch=sm_20 --ptxas-options=-v -I/opt/cudasdk/C/common/inc -o fma1 fma1.o
4 $ ./fma1
5 $
Veamos ahora los kernels que lanza y toda la bola con el profiling built-in de CUDA.
Medir hardware y software counters.
1 export CUDA_PROFILE=1
2 export CUDA_PROFILE_CONFIG=profile.config
· branch
: cantidad de saltos (por SM).
· divergent_branch
: cantidad de saltos divergentes (por SM).
· instructions
: por SM.
· gld_{32,64,128}b
: cargas de memoria global de tamaño 32, 64 y 128 bytes.
· cta_launched
: cantidad de bloques lanzados.
· sm_cta_launched
: idem, pero dentro de un SM.
· l1_global_load_hit
...
· l1_global_load_miss
...
· gpustarttimestamp
· gpuendtimestamp
· gridsize
· threadblocksize
· dynsmemperblock
· stasmemperblock
· regperthread
· memtransferdir
· memtransfersize
· memtransferhostmemtype
· streamid
fma1.cu
1 $ export CUDA_PROFILE=1
2 $ export CUDA_PROFILE_CONFIG=profile_debug_kernel
3 $ cat profile_debug_kernel
4 gridsize
5 threadblocksize
6 regperthread
1 $ ./fma1
2 nicolasw@mini:~/CP/Slides/10-CUDA2/FMA4$ cat cuda_profile_0.log
3 # CUDA_PROFILE_LOG_VERSION 2.0
4 # CUDA_DEVICE 0 Tesla C2070
5 # CUDA_CONTEXT 1
6 # TIMESTAMPFACTOR fffff6bf46bd56c8
7 method,gputime,cputime,gridsizeX,gridsizeY,threadblocksizeX,threadblocksizeY,threadblocksizeZ,dynsmemperblock,stasmemperblock,regperthread,occupancy
8 method=[ memcpyHtoH ] gputime=[ 65915.484 ] cputime=[ 65918.000 ]
9 method=[ memcpyHtoH ] gputime=[ 65818.367 ] cputime=[ 65819.000 ]
10 method=[ memcpyHtoH ] gputime=[ 64073.633 ] cputime=[ 64074.996 ]
11 method=[ memcpyHtoH ] gputime=[ 64299.680 ] cputime=[ 64301.004 ]
something is rotten in the state of Denmark. No aparece ningún kernel y, para colmo de males, dice que la transferencia de memoria es host to host.
¿Cómo decía que estaba todo bien en la comparación de valores?
cuda-gdb
1 $ cuda-gdb ./fma1
2 NVIDIA (R) CUDA Debugger
3 4.2 release
4 ...
5 Reading symbols from /home/nicolasw/CP/Slides/10-CUDA2/FMA4/fma1...done.
6 (cuda-gdb) l
7 8 float ha[N]={0.0f}, hb[N]={0.0f}, hc[N]={0.0f}, hd[N]={0.0f};
8 9
9 10 __global__ void fma4(void) {
10 11 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
11 12 d[tid] = a[tid]*b[tid]+c[tid];
12 13 }
13 14
14 15 int main(void)
15 16 {
16 17 fma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();
17 (cuda-gdb) break 12
18 Breakpoint 1 at 0x400a54: file fma1.cu, line 12.
19 (cuda-gdb) run
20 Starting program: /home/nicolasw/CP/Slides/10-CUDA2/FMA4/fma1
21 [Thread debugging using libthread_db enabled]
22
23 Breakpoint 1, fma4 () at fma1.cu:13
24 13 }
25 (cuda-gdb) info cuda kernels
26 No CUDA kernels.
27
28 Mhhh, algo sigue podrido acá.
(1<<25) / 128 = 1<<(25-7) = 1<<18 = 262144
> 65535, el máximo de gridDim.x en sm_20 (ya 1<<16 = 65536 bloques excede el límite).
Hay un límite del hardware:
1 $ /opt/cudasdk/C/bin/linux/release/deviceQuery | grep grid
2 Maximum sizes of each dimension of a grid: 65535 x 65535 x 65535
No puedo copiar memoria global (__device__) o constante (__constant__) con cudaMemcpy: hay que usar cudaMemcpyFromSymbol/cudaMemcpyToSymbol.
Quiero que explote y que suene bien fuerte.
Homenaje al Dir. Rossetti, IPET 20, Córdoba, Argentina.
cutil.h
y sus macros. Tengo dos macros para revisar errores.
CUT_CHECK_ERROR(char *)
1 fma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();
2 CUT_CHECK_ERROR("fma kernel failed");
CUDA_SAFE_CALL
1 CUDA_SAFE_CALL(cudaMemcpyFromSymbol(ha, a, N*sizeof(float)));
1 #include <cuda.h>
2 #include <stdio.h>
3 #include <cutil.h> // provides CUT_CHECK_ERROR and CUDA_SAFE_CALL
4
5 #define N (1<<22) // 1<<22/128 = 1<<15 = 32768 blocks: now within the 65535 gridDim.x limit
6 #define BLOCK_SIZE 128
7
8 __device__ float a[N], b[N], c[N], d[N];
9 float ha[N]={1.0f}, hb[N]={2.0f}, hc[N]={3.0f}, hd[N]={4.0f}; // NOTE: {1.0f} sets only element 0; the other N-1 elements are 0.0f
10
11 __global__ void set(void) { // initialize a, b, c directly on the device (no host transfer needed)
12 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
13 a[tid] = (float)blockIdx.x;
14 b[tid] = (float)threadIdx.x;
15 c[tid] = (float)threadIdx.x+blockIdx.x;
16 }
17
18 __global__ void fma4(void) { // d = a*b + c, one element per thread; grid covers N exactly (N % BLOCK_SIZE == 0)
19 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
20 d[tid] = a[tid]*b[tid]+c[tid];
21 }
Notar: usamos blockDim.x en el cálculo del tid.
Los inicializadores {1.0f}, ..., {4.0f} solo fijan el primer elemento; el resto queda en 0.0f,
de modo que ha, ..., hd parten con algo que no cumple la igualdad.
) algo que no cumpla la igualdad. 1 int main(void)
2 {
3 set<<<N/BLOCK_SIZE, BLOCK_SIZE>>>(); // same (default) stream: set completes before fma4 starts
4 fma4<<<N/BLOCK_SIZE, BLOCK_SIZE>>>();
5 cudaDeviceSynchronize(); // wait for both kernels before checking / copying
6 CUT_CHECK_ERROR("fma kernel failed"); // surfaces launch-config and execution errors instead of failing silently
7 CUDA_SAFE_CALL(cudaMemcpyFromSymbol(ha, a, N*sizeof(float))); // correct API for __device__ symbols (cf. the plain-cudaMemcpy bug in fma1.cu)
8 CUDA_SAFE_CALL(cudaMemcpyFromSymbol(hb, b, N*sizeof(float)));
9 CUDA_SAFE_CALL(cudaMemcpyFromSymbol(hc, c, N*sizeof(float)));
10 CUDA_SAFE_CALL(cudaMemcpyFromSymbol(hd, d, N*sizeof(float)));
11 for (unsigned int i=0; i<N; ++i)
12 if (hd[i] != ha[i]*hb[i]+hc[i]) { // exact == is safe here: set() stores small integers, so products/sums are exactly representable in float
13 printf("%d, %f!=%f*%f+%f\n", i, hd[i],ha[i],hb[i],hc[i]);
14 break;
15 }
16 return 0;
17 }
Compilación y corrida exitosa. Ahora si hay reporte de kernels corriendo.
1 $ export CUDA_PROFILE=1
2 $ ./fma
3 $ cat cuda_profile_0.log
4 ...
5 method=[ _Z3setv ] gputime=[ 755.168 ] cputime=[ 6.000 ] occupancy=[ 0.667 ]
6 method=[ _Z4fma4v ] gputime=[ 883.584 ] cputime=[ 4.000 ] occupancy=[ 0.667 ]
7 method=[ memcpyDtoH ] gputime=[ 8100.096 ] cputime=[ 9044.000 ]
8 method=[ memcpyDtoH ] gputime=[ 8089.888 ] cputime=[ 9028.000 ]
9 method=[ memcpyDtoH ] gputime=[ 8094.144 ] cputime=[ 9028.000 ]
10 method=[ memcpyDtoH ] gputime=[ 8096.832 ] cputime=[ 9041.000 ]
11 $
Usar siempre CUDA_SAFE_CALL y CUT_CHECK_ERROR.
Recordar los límites del hardware: blockDim.x*blockDim.y*blockDim.z <= 1024, etc.
1 $ /opt/cudasdk/C/bin/linux/release/deviceQuery
2 ...
3 Maximum number of threads per block: 1024
4 Maximum sizes of each dimension of a block: 1024 x 1024 x 64
5 Maximum sizes of each dimension of a grid: 65535 x 65535 x 65535
6 ...
sgemm
Un hilo calcula: c[i][j] = fila a[i][·] por columna b[·][j]
.
Matrices de N×N, con N definida en runtime.
Bloques de BX×BY, también definidos en runtime.
No se exige que N sea múltiplo de BX o BY (de eso se ocupan DIV_CEIL y la guarda del kernel)
. 1 // 2D to 1D bijection
2 #define IX(i,j) ((i)*(N)+(j)) // row-major linearization of an N x N matrix
3
4 __global__ void sgemm(const uint N, float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) { // C += A*B; __restrict__ promises no aliasing so the compiler can cache/reorder loads
5 const uint i = blockIdx.y*blockDim.y + threadIdx.y; // row index of this thread's output element
6 const uint j = blockIdx.x*blockDim.x + threadIdx.x; // column index
7 if (i<N && j<N) // guard for partial edge blocks (grid is ceil(N/BX) x ceil(N/BY))
8 for (uint k=0; k<N; ++k)
9 c[IX(i,j)] += a[IX(i,k)] * b[IX(k,j)]; // += : caller must pre-initialize c (setmm does); b access is coalesced across j within a warp
10 }
Notar:
__restrict__ en los punteros del kernel.
blockDim.{x,y} para conocer el tamaño del bloque en runtime.
Uso de uint
.1 CUDA_SAFE_CALL(cudaMalloc(&d_a, SIZE * sizeof(float))); // CUDA_SAFE_CALL aborts with file:line on any CUDA error
2 CUDA_SAFE_CALL(cudaMalloc(&d_b, SIZE * sizeof(float)));
3 CUDA_SAFE_CALL(cudaMalloc(&d_c, SIZE * sizeof(float)));
4 h_a = (float *) calloc(SIZE, sizeof(float)); // calloc zero-fills the host buffers
5 h_b = (float *) calloc(SIZE, sizeof(float));
6 h_c = (float *) calloc(SIZE, sizeof(float));
7 assert(d_a && d_b && d_c && h_a && h_b && h_c); // belt-and-braces null check (CUDA failures are already caught above; this mainly covers calloc)
1 // integer ceiling division
2 #define DIV_CEIL(a,b) (((a)+(b)-1)/(b))
3
4 dim3 grid_size(DIV_CEIL(N,BX), DIV_CEIL(N,BY)); // enough blocks even when N is not a multiple of BX/BY
5 dim3 block_size(BX,BY);
6 setmm<<<grid_size, block_size>>>(N, d_a, d_b, d_c); // initialize the matrices on the device
7 CUT_CHECK_ERROR("setmm kernel failed"); // catches invalid configuration, e.g. BX*BY > 1024 threads/block
8 sgemm<<<grid_size, block_size>>>(N, d_a, d_b, d_c);
9 CUT_CHECK_ERROR("sgemm kernel failed");
10 cudaDeviceSynchronize(); // wait before reading results back on the host
Notar:
DIV_CEIL
para que haya suficientes bloques. 1 CUDA_SAFE_CALL(cudaMemcpy(h_a, d_a, SIZE*sizeof(float), cudaMemcpyDefault)); // d_* are real device pointers here, so cudaMemcpyDefault can infer the direction (unlike the symbol bug in fma1.cu)
2 CUDA_SAFE_CALL(cudaMemcpy(h_b, d_b, SIZE*sizeof(float), cudaMemcpyDefault));
3 CUDA_SAFE_CALL(cudaMemcpy(h_c, d_c, SIZE*sizeof(float), cudaMemcpyDefault));
4 double max_diff = 0.0; // tolerance-based check: bit-exact float equality host vs. device is not expected
5 for (unsigned int i=0; i<N; ++i) {
6 for (unsigned int j=0; j<N; ++j) {
7 float cij = 0.0f;
8 for (unsigned int k=0; k<N; ++k) // O(N^3) host reference computation
9 cij += h_a[IX(i,k)] * h_b[IX(k,j)];
10 max_diff = MAX(max_diff, abs(cij-h_c[IX(i,j)])); // NOTE(review): plain abs() on a float is the integer abs in C; prefer fabs()/fabsf() — confirm which overload C++ resolves here
11 }
12 }
13 printf("max_diff: %f\n", max_diff);
Notar:
sgemm N BX BY
1 $ ./sgemm 1024 64 64
2 Cuda error: setmm kernel failed in file 'sgemm.cu' in line 54 : invalid configuration argument.
3 $ ./sgemm 1024 32 32
4 max_diff: 0.000092
5 $ ./sgemm 512 32 32
6 max_diff: 0.000046
7 $ ./sgemm 2048 32 32
8 max_diff: 0.000183
9 $ ./sgemm 256 1 1
10 max_diff: 0.000023
ver porque pasa esto.
1 $ nvcc -O3 -arch=sm_20 -I/opt/cudasdk/C/common/inc sgemm.cu -ptx
sgemm
1 .entry _Z5sgemmjPfS_S_(
2 ...
3 ld.param.u32 %r1, [_Z5sgemmjPfS_S__param_0];
4 ld.param.u64 %rl5, [_Z5sgemmjPfS_S__param_1];
5 ld.param.u64 %rl6, [_Z5sgemmjPfS_S__param_2];
6 cvta.to.global.u64 %rl2, %rl6;
7 cvta.to.global.u64 %rl3, %rl5;
8 mov.u32 %r2, %ntid.y;
9 mov.u32 %r3, %ctaid.y;
10 mov.u32 %r4, %tid.y;
11 mad.lo.s32 %r5, %r2, %r3, %r4;
12 mov.u32 %r6, %ntid.x;
13 mov.u32 %r7, %ctaid.x;
14 mov.u32 %r8, %tid.x;
15 mad.lo.s32 %r33, %r6, %r7, %r8;
16 setp.ge.u32 %p1, %r5, %r1;
17 setp.ge.u32 %p2, %r33, %r1;
18 or.pred %p3, %p2, %p1;
19 setp.eq.s32 %p4, %r1, 0;
20 or.pred %p5, %p3, %p4;
21 @%p5 bra BB1_4;
1 ld.param.u32 %r26, [_Z5sgemmjPfS_S__param_0];
2 mad.lo.s32 %r19, %r5, %r26, %r33;
3 ld.param.u64 %rl13, [_Z5sgemmjPfS_S__param_3];
4 cvta.to.global.u64 %rl7, %rl13;
5 mul.wide.u32 %rl8, %r19, 4;
6 add.s64 %rl4, %rl7, %rl8;
7 ld.global.f32 %f6, [%rl4];
8 mul.lo.s32 %r34, %r26, %r5;
9 mov.u32 %r35, 0;
10
11 BB1_2:
12 mul.wide.u32 %rl9, %r34, 4;
13 add.s64 %rl10, %rl3, %rl9;
14 mul.wide.u32 %rl11, %r33, 4;
15 add.s64 %rl12, %rl2, %rl11;
16 ld.global.f32 %f4, [%rl12];
17 ld.global.f32 %f5, [%rl10];
18 fma.rn.f32 %f6, %f5, %f4, %f6;
19 add.s32 %r34, %r34, 1;
20 ld.param.u32 %r25, [_Z5sgemmjPfS_S__param_0];
21 add.s32 %r33, %r33, %r25;
22 add.s32 %r35, %r35, 1;
23 setp.lt.u32 %p6, %r35, %r25;
24 @%p6 bra BB1_2;
25
26 st.global.f32 [%rl4], %f6;
27
28 BB1_4:
29 ret;
--ptxas-options=-v
1 $ nvcc sgemm.cu -O3 -arch=sm_20 --ptxas-options=-v -I/opt/cudasdk/C/common/inc -o sgemm.o
2 ptxas info : Compiling entry function '_Z5sgemmjPfS_S_' for 'sm_20'
3 ptxas info : Function properties for _Z5sgemmjPfS_S_
4 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
5 ptxas info : Used 16 registers, 64 bytes cmem[0], 24 bytes cmem[2]
regperthread
1 $ export CUDA_PROFILE=1
2 $ export CUDA_PROFILE_CONFIG=profile_debug_kernel
3 $ cat profile_debug_kernel
4 gridsize
5 threadblocksize
6 regperthread
7 $ ./sgemm 1024 16 16
8 max_diff: 0.000092
9 $ cat cuda_profile_0.log
10 ...
11 method=[ _Z5sgemmjPfS_S_ ] gputime=[ 34407.359 ] cputime=[ 4.000 ]
12 gridsize=[ 64, 64 ] threadblocksize=[ 16, 16, 1 ] regperthread=[ 16 ]
13 occupancy=[ 1.000 ]
14 ...
Ambas informan 16 registros.
gputime
vs. BX
,BY
gputime
en µs
gputime
vs. BX
,BY
(zoom)gputime
en µs
BX
=16,24,32.gputime
= 33.264 ms para 2×1024³ FLOP: ~60 GFLOPS
>>> ((2*1024**3)/0.033) / (1<<30)
60.6060606060606
1 $ grep -v "^ " sgemm-c2070.dat | sort -n -k 3 | head -16
2 32 5 33264.832
3 32 6 33275.199
4 32 24 33732.734
5 32 4 33780.031
6 ...
7 16 10 35538.559
8 32 3 35789.473
9 32 8 35800.672
a
se lee y la c
se escribe de a warps de 32 hilos accediendo a 128 bytes consecutivos de la memoria.1 block_size ths_per_block threads_per_sm limited_by occupancy gputime
2 8×8 64 512 (8 b) blks_per_sm 512/1536=0.33 53 ms
3 16×16 256 1536 (8 b) warps_per_sm 1536/1536=1.0 38 ms
4 32×32 1024 1024 (1 b) warps_per_sm 1024/1536=0.66 38 ms
5 8×16 128 1024 (8 b) blks_per_sm 1024/1536=0.66 45 ms
6 16×8 128 1024 (8 b) blks_per_sm 1024/1536=0.66 35 ms
7 2×32 64 512 (8 b) blks_per_sm 512/1536=0.33 182 ms
8 32×2 64 512 (8 b) blks_per_sm 512/1536=0.33 49 ms
9 32×5 160 1280 (8 b) blks_per_sm 1280/1536=0.83 33 ms
occupancy
no es proporcional a la performance.Los registros por bloque no nos molestan en este caso, solo tenemos 16 registros en el kernel.
Los hilos por bloque tampoco, la mejor performance es para una muy baja cantidad de hilos por bloque.
c[i][j]
1 __global__ void sgemm_multi(const uint N, const uint MULTI, float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) { // variant: each thread computes MULTI consecutive output columns of row i
2 const uint i = blockIdx.y*blockDim.y + threadIdx.y;
3 const uint j = MULTI*(blockIdx.x*blockDim.x + threadIdx.x); // first of this thread's MULTI columns
4 if (i<N && j<N) // NOTE(review): guard only checks the first column; jj up to j+MULTI-1 can exceed N when N is not a multiple of BX*MULTI — confirm out-of-bounds safety
5 for (uint jj=j; jj<j+MULTI; ++jj)
6 for (uint k=0; k<N; ++k)
7 c[IX(i,jj)] += a[IX(i,k)] * b[IX(k,jj)]; // adjacent threads are now MULTI columns apart, so b/c accesses are no longer fully coalesced
8 }
1 dim3 grid_size_sgemm(DIV_CEIL(N,BX*MULTI), DIV_CEIL(N,BY)); // x-dimension shrinks by MULTI: each thread now covers MULTI columns
2 dim3 block_size_sgemm(BX,BY);
3 sgemm_multi<<<grid_size_sgemm, block_size_sgemm>>>(N, MULTI, d_a, d_b, d_c);
(i,j) <-> { (i,j*MULTI),...,(i,j*MULTI+MULTI-1) }
No hay mejora de performance.
En cambio un manual unrolling mejora un 5% a 10% la performance.
1 __global__ void sgemm(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) { // tiled version: requires N % B == 0 and blockDim == (B,B), so no bounds guard is needed
2 __shared__ float sa[B][B]; // one B x B tile of a (and of b below) per block, reused B times from shared memory
3 __shared__ float sb[B][B];
4
5 const uint ti = threadIdx.y; const uint tj = threadIdx.x;
6 const uint i = blockIdx.y*B + ti; // global row of this thread's output element
7 const uint j = blockIdx.x*B + tj; // global column
8 float cij = 0.0f; // accumulate in a register; single global store at the end
9 for (uint m=0; m<N/B; ++m) { // walk the N/B tile pairs along the k dimension
10 sa[ti][tj] = a[IX(i,m*B+tj)]; // coalesced: adjacent tj read contiguous global elements
11 sb[ti][tj] = b[IX(m*B+ti,j)]; // coalesced likewise
12 __syncthreads(); // barrier: whole tile loaded before anyone reads it (outside divergent flow)
13
14 for (uint k=0; k<B; ++k)
15 cij += sa[ti][k] * sb[k][tj];
16 __syncthreads(); // barrier: tile fully consumed before the next iteration overwrites it
17 }
18 c[IX(i,j)] = cij;
19 }
Cambiamos de una estrategia de const a #define.
Simplifica algunas cosas, pero requiere recompilar al cambiar los parámetros.
1 #ifndef N
2 #define N 1024 // compile-time sizes now: override with -DN=... -DB=...; simpler kernel, but changing them requires recompiling
3 #endif
4
5 #ifndef B
6 #define B 32
7 #endif
8
9 #define SIZE (N*N)
10 assert(N%B==0); // evenly split
11 ...
12
13 dim3 grid_size(N/B, N/B); // exact tiling: no DIV_CEIL and no kernel-side bounds guard needed
14 dim3 block_size(B,B);
15 ...
16 sgemm<<<grid_size, block_size>>>(d_a, d_b, d_c);
Notar que no tenemos el típico if
en el kernel:
1 if (i<N && j<N) {
2 ...
3 }
N=1024
1 Block Shared gputime
2 8×8 0.5 KB 20ms
3 16×16 2 KB 12.1ms
4 32×32 8 KB 12.2ms
La performance trepó a 166 GFLOPS.
Para GF100 hay 48KB de shared por SM.
El peor caso es bloques de 16×16. Tenemos 256 hilos, 8 bloques máximo por SM, necesito 8×2KB = 16KB de shared.
Notar que podría ser otro factor que limita la concurrencia.
Table of Contents | t |
---|---|
Exposé | ESC |
Full screen slides | e |
Presenter View | p |
Source Files | s |
Slide Numbers | n |
Toggle screen blanking | b |
Show/hide slide context | c |
Notes | 2 |
Help | h |