A partir de CUDA 6.0 y CC 3.5. Para prototipar soluciones.
// Unified-memory version of the host driver: cudaMallocManaged returns a
// single pointer valid on both host and device, so no explicit cudaMemcpy
// calls appear anywhere in this listing.
1 checkCudaErrors(cudaMallocManaged(&a, SIZE * sizeof(float)));
2 checkCudaErrors(cudaMallocManaged(&b, SIZE * sizeof(float)));
3 checkCudaErrors(cudaMallocManaged(&c, SIZE * sizeof(float)));
4 assert(a && b && c);
5
// 2D launch, one thread per output element; DIV_CEIL rounds the grid up so
// the whole N×N matrix is covered (DIV_CEIL, BX, BY defined elsewhere).
6 dim3 grid_size(DIV_CEIL(N,BX), DIV_CEIL(N,BY));
7 dim3 block_size(BX,BY);
8 setmm<<<grid_size, block_size>>>(N, a, b, c);
9 getLastCudaError("setmm kernel failed");
10 sgemm<<<grid_size, block_size>>>(N, a, b, c);
11 getLastCudaError("sgemm kernel failed");
// Mandatory before the host touches managed memory again (pre-Pascal
// unified memory); also surfaces asynchronous kernel errors.
// NOTE(review): the return value is unchecked — wrap in checkCudaErrors().
12 cudaDeviceSynchronize();
13
// CPU reference: recompute C = A*B on the host and track the largest
// absolute difference against the GPU result (IX and MAX defined elsewhere).
14 double max_diff = 0.0;
15 for (unsigned int i=0; i<N; ++i) {
16 for (unsigned int j=0; j<N; ++j) {
17 float cij = 0.0f;
18 for (unsigned int k=0; k<N; ++k)
19 cij += a[IX(i,k)] * b[IX(k,j)];
// NOTE(review): abs() on a float relies on the C++ overload; fabsf() would
// state the intent unambiguously (plain C abs() would truncate to int).
20 max_diff = MAX(max_diff, abs(cij-c[IX(i,j)]));
21 }
22 }
23 printf("max_diff: %f\n", max_diff);
// Managed allocations are released with the ordinary cudaFree().
24
25 checkCudaErrors(cudaFree(c));
26 checkCudaErrors(cudaFree(b));
27 checkCudaErrors(cudaFree(a));
1 $ nvprof --print-gpu-summary ./sgemm-unified 1024 16 16
2 ==16422== NVPROF is profiling process 16422, command: ./sgemm-unified 1024 16 16
3 max_diff: 0.000092
4 ==16422== Profiling application: ./sgemm-unified 1024 16 16
5 ==16422== Profiling result:
6 Time(%) Time Calls Avg Min Max Name
7 98.73% 15.982ms 1 15.982ms 15.982ms 15.982ms sgemm(unsigned int, float*, float*, float*)
8 1.27% 206.31us 1 206.31us 206.31us 206.31us setmm(unsigned int, float*, float*, float*)
No aparecen los llamados a cudaMemcpy()
o similares.
¡El walltime es similar al de la versión con copia a mano! (Ojo: la versión
sgemm original no usaba cudaMallocHost).
1 $ time ./sgemm-unified 1024 16 16
2 max_diff: 0.000092
3
4 real 0m9.259s
5 user 0m8.816s
6 sys 0m0.404s
7 $ time ./sgemm 1024 16 16
8 max_diff: 0.000092
9
10 real 0m10.107s
11 user 0m9.748s
12 sys 0m0.312s
c[i][j]
// Granularity experiment: each thread computes MULTI consecutive output
// columns (jj = j .. j+MULTI-1) of row i, accumulating straight into global
// memory — no shared-memory tiling.
// NOTE(review): the guard checks only the first column j; columns up to
// j+MULTI-1 are unguarded, so N must be a multiple of BX*MULTI or the last
// threads write out of bounds.
// NOTE(review): uses += on c, so c must be pre-initialized (presumably by
// setmm, as in the other listings — confirm against the full source).
1 __global__ void sgemm_multi(const uint N, const uint MULTI, float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
2 const uint i = blockIdx.y*blockDim.y + threadIdx.y;
3 const uint j = MULTI*(blockIdx.x*blockDim.x + threadIdx.x);
4 if (i<N && j<N)
5 for (uint jj=j; jj<j+MULTI; ++jj)
6 for (uint k=0; k<N; ++k)
7 c[IX(i,jj)] += a[IX(i,k)] * b[IX(k,jj)];
8 }
// Launch for sgemm_multi: the grid shrinks by MULTI in x because each
// thread now covers MULTI output columns; block shape is unchanged.
1 dim3 grid_size_sgemm(DIV_CEIL(N,BX*MULTI), DIV_CEIL(N,BY));
2 dim3 block_size_sgemm(BX,BY);
3 sgemm_multi<<<grid_size_sgemm, block_size_sgemm>>>(N, MULTI, d_a, d_b, d_c);
(i,j) <-> { (i,j*MULTI),...,(i,j*MULTI+MULTI-1) }
1 $ for i in {1,2,4,8}; do ./sgemm-multi 1024 16 16 $i; done
2 max_diff: 0.000092
3 max_diff: 0.000092
4 max_diff: 0.000092
5 max_diff: 0.000092
1 $ for i in {1,2,4,8}; do nvprof ./sgemm-multi 1024 16 16 $i 2>&1 | grep sgemm_multi; done
2 87.09% 14.618ms 1 14.618ms 14.618ms 14.618ms sgemm_multi(unsigned int, unsigned int, float*, float*, float*)
3 87.29% 14.902ms 1 14.902ms 14.902ms 14.902ms sgemm_multi(unsigned int, unsigned int, float*, float*, float*)
4 88.54% 16.768ms 1 16.768ms 16.768ms 16.768ms sgemm_multi(unsigned int, unsigned int, float*, float*, float*)
5 94.84% 40.197ms 1 40.197ms 40.197ms 40.197ms sgemm_multi(unsigned int, unsigned int, float*, float*, float*)
No hay mejora de performance.
Pero sí en la comunicación: usé cudaMallocHost()
y pasé de 2.5 GiB/s a 6.4 GiB/s.
1 $ nvprof --print-gpu-trace ./sgemm-multi 1024 16 16 1
2 ...
3 400.32ms 654.51us 4.1943MB 6.4084GB/s Tesla K40c (0) 1 7 [CUDA memcpy DtoH]
4 401.01ms 653.35us 4.1943MB 6.4197GB/s Tesla K40c (0) 1 7 [CUDA memcpy DtoH]
5 401.69ms 654.76us 4.1943MB 6.4059GB/s Tesla K40c (0) 1 7 [CUDA memcpy DtoH]
(NVIDIA, NVIDIA CUDA C Programming Guide)
// Classic shared-memory tiled SGEMM: each B×B block cooperatively stages one
// tile of a and one tile of b, then every thread accumulates B partial
// products from the fast tiles before advancing to the next tile pair.
// Preconditions (enforced elsewhere in this deck): blockDim == (B,B) and
// N % B == 0 — which is why there is no i<N / j<N guard here.
1 __global__ void sgemm(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
2 __shared__ float sa[B][B];
3 __shared__ float sb[B][B];
4
5 const uint ti = threadIdx.y; const uint tj = threadIdx.x;
6 const uint i = blockIdx.y*B + ti;
7 const uint j = blockIdx.x*B + tj;
8 float cij = 0.0f;
// Walk the N/B tile pairs along the shared dimension.
9 for (uint m=0; m<N/B; ++m) {
10 sa[ti][tj] = a[IX(i,m*B+tj)];
11 sb[ti][tj] = b[IX(m*B+ti,j)];
// Barrier: the whole tile must be written before anyone reads it.
12 __syncthreads();
13
14 for (uint k=0; k<B; ++k)
15 cij += sa[ti][k] * sb[k][tj];
// Barrier: don't overwrite the tiles while other threads still read them.
16 __syncthreads();
17 }
18 c[IX(i,j)] = cij;
19 }
Cambiamos de una estrategia de const
a #define.
Simplifica algunas cosas, pero requiere recompilar para cambiar los parámetros.
// Compile-time configuration: N (matrix side) and B (tile width) become
// macros so shared arrays can be statically sized and loops fully unrolled.
// Overridable with -DN=... -DB=...; changing them requires recompiling.
1 #ifndef N
2 #define N 1024
3 #endif
4
5 #ifndef B
6 #define B 32
7 #endif
8
9 #define SIZE (N*N)
// Tiles must divide the matrix exactly — this is what lets the kernel omit
// the usual i<N && j<N guard.
10 assert(0 == N%B); // evenly split
11 ...
12
13 dim3 grid_size(N/B, N/B);
14 dim3 block_size(B,B);
15 ...
16 sgemm<<<grid_size, block_size>>>(d_a, d_b, d_c);
Notar que no tenemos el típico if
en el kernel:
1 if (i<N && j<N) {
2 ...
3 }
N=1024
1 Block Shared gputime
2 8×8 0.5 KB 20.0 ms
3 16×16 2 KB 12.1 ms
4 32×32 8 KB 12.2 ms
La performance trepó a 165 GFLOPS.
1 >>> ((2*1024**3)/0.0121) / (1<<30)
2 165.28925619834712
1 Version GFLOPS EFICIENCIA
2 Trivial 60 6%
3 Shared 165 16%
N=1024
1 Block Shared gputime GFLOPS Eff
2 8×8 0.5 KB 8.20 ms 243 5%
3 16×16 2 KB 5.30 ms 377 8%
4 32×32 8 KB 5.18 ms 386 8.5%
Aumenté mucho los GFLOPS, pero la eficiencia bajó dramáticamente.
Típico de "mejoré para Fermi, pero para Kepler no camina".
Hay una base de 2x de speedup de full Fermi a full Kepler.
Pero la eficiencia 0.5x.
Moraleja: Kepler Tuning Guide.
Hwu trick, sección 6.6.
Reutilizar el bloque sa
(Md) para dos o más bloques sb
(Nd).
// Hwu's trick (section 6.6): one sa tile is reused against two sb tiles, so
// each thread produces two outputs (columns j0 and j1) per pass. This halves
// the number of blocks in x and the global loads of a, at the cost of 50%
// more shared memory and extra registers per thread.
// Assumes the grid covers exactly N columns, i.e. N divisible by 2*B.
1 __global__ void sgemm_shared2(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
2 __shared__ float sa[B][B];
3 __shared__ float sb0[B][B];
4 __shared__ float sb1[B][B];
5
6 const uint ti = threadIdx.y;
7 const uint tj = threadIdx.x;
8 const uint i = blockIdx.y*B + ti;
// Each block owns two adjacent B-wide column strips.
9 const uint j0 = (2*blockIdx.x+0)*B + tj;
10 const uint j1 = (2*blockIdx.x+1)*B + tj;
11 float cij0 = 0.0f, cij1 = 0.0f;
12 for (uint m=0; m<N/B; ++m) {
13 sa[ti][tj] = a[IX(i,m*B+tj)];
14 sb0[ti][tj] = b[IX(m*B+ti,j0)];
15 sb1[ti][tj] = b[IX(m*B+ti,j1)];
// All three tiles must be fully staged before the compute phase.
16 __syncthreads();
17
// sa[ti][k] is loaded once and used for both accumulators (the reuse).
18 for (uint k=0; k<B; ++k) {
19 cij0 += sa[ti][k] * sb0[k][tj];
20 cij1 += sa[ti][k] * sb1[k][tj];
21 }
// Protect the tiles from being overwritten on the next iteration.
22 __syncthreads();
23 }
24 c[IX(i,j0)] = cij0;
25 c[IX(i,j1)] = cij1;
26 }
Se lanzan grid_size.x /= 2
bloques.
Para N=1024, B=16
corriendo en K40 con nvcc-6.0
.
Los otros tamaños de bloque posible (8 y 32) dan peores resultados.
1 Gránulo Tiempo GFLOPS
2 1 5.36ms 373
3 2 4.36ms 458
4 4 3.92ms 510
1 Gránulo Regs ShMem
2 1 25 2048
3 2 32 3072
4 4 37 5120
Mejoramos pero estamos aun lejos de una eficiencia decente.
Menos hilos, pero más bloques por SMX.
Vasily Volkov, Better performance at lower occupancy, GTC, 2010.
La primera parte es igual al código de shared
.
// Prologue of the register-granularity kernel (Volkov-style: fewer threads,
// more work per thread). Identical to the tiled shared version up to the
// index setup; the listing continues on the next slide.
1 __global__ void sgemm_shared(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
2 __shared__ float sa[B][B];
3 __shared__ float sb[B][B];
4
5 const uint ti = threadIdx.y;
6 const uint tj = threadIdx.x;
7 const uint i = blockIdx.y*B + ti;
8 const uint j = blockIdx.x*B + tj;
Idea: hacer más trabajo por hilo, aumentar la granularidad, pero ...
block_size.y /= 2
// Grain-2 body (continuation of the previous slide's prologue): launched
// with blockDim.y halved, so each thread stages two rows of each tile
// (ti and ti+B/2) and keeps two accumulators in registers — trading
// thread-level parallelism for instruction-level parallelism.
1 float cij[2] = {0.0f, 0.0f};
2 for (uint m=0; m<N/B; ++m) {
3 sa[ti][tj] = a[IX(i,m*B+tj)];
4 sb[ti][tj] = b[IX(m*B+ti,j)];
// Second half of each tile, filled by the same (halved) set of threads.
5 sa[ti+B/2][tj] = a[IX(i+B/2,m*B+tj)];
6 sb[ti+B/2][tj] = b[IX(m*B+ti+B/2,j)];
7 __syncthreads();
8
// Two independent FMA chains per iteration: more ILP per thread.
9 for (uint k=0; k<B; ++k) {
10 cij[0] += sa[ti][k] * sb[k][tj];
11 cij[1] += sa[ti+B/2][k] * sb[k][tj];
12 }
13 __syncthreads();
14 }
// Each thread writes its two output rows.
15 c[IX(i,j)] = cij[0];
16 c[IX(i+B/2,j)] = cij[1];
17 }
Tradeoff entre TLP (thread-level parallelism) e ILP (instruction-level parallelism).
Para N=1024, B=32
en una K40 compilado con nvcc-6.0
.
1 Grano Regs ShMem Time(ms) GFLOPS
2 1 21 8192 5.18 386
3 2 31 8192 3.46 578
4 4 42 8192 2.73 732
5 8 49 8192 2.53 790
6 16 40 8192 22.89 87
Volvimos a tener un ~17% de eficiencia.
Table of Contents | t |
---|---|
Exposé | ESC |
Full screen slides | e |
Presenter View | p |
Source Files | s |
Slide Numbers | n |
Toggle screen blanking | b |
Show/hide slide context | c |
Notes | 2 |
Help | h |