SGEMM cont'd:
Unified memory is available since CUDA 6.0 and Compute Capability 3.5. Useful for prototyping solutions.
checkCudaErrors(cudaMallocManaged(&a, SIZE * sizeof(float)));
checkCudaErrors(cudaMallocManaged(&b, SIZE * sizeof(float)));
checkCudaErrors(cudaMallocManaged(&c, SIZE * sizeof(float)));
assert(a && b && c);

dim3 grid_size(DIV_CEIL(N,BX), DIV_CEIL(N,BY));
dim3 block_size(BX,BY);
setmm<<<grid_size, block_size>>>(N, a, b, c);
getLastCudaError("setmm kernel failed");
sgemm<<<grid_size, block_size>>>(N, a, b, c);
getLastCudaError("sgemm kernel failed");
cudaDeviceSynchronize();

// CPU verification: the managed buffers are dereferenced directly on the host
double max_diff = 0.0;
for (size_t i=0; i<N; ++i) {
    for (size_t j=0; j<N; ++j) {
        float cij = 0.0f;
        for (size_t k=0; k<N; ++k)
            cij += a[IX(i,k)] * b[IX(k,j)];
        max_diff = MAX(max_diff, fabs(cij - c[IX(i,j)]));
    }
}
printf("max_diff: %f\n", max_diff);

checkCudaErrors(cudaFree(c));
checkCudaErrors(cudaFree(b));
checkCudaErrors(cudaFree(a));
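The helper macros (IX, DIV_CEIL, MAX) are assumed throughout these listings but never shown; a plausible minimal sketch, assuming row-major N×N storage (these definitions are not from the original source):

#define IX(i, j)       ((i)*N + (j))            /* row-major index into an N×N matrix */
#define DIV_CEIL(a, b) (((a) + (b) - 1) / (b))  /* ceiling of the integer division */
#define MAX(a, b)      ((a) > (b) ? (a) : (b))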
$ nvprof --print-gpu-summary ./sgemm-unified 1024 16 16
==17649== NVPROF is profiling process 17649, command: ./sgemm-unified 1024 16 16
max_diff: 0.000671
==17649== Profiling application: ./sgemm-unified 1024 16 16
==17649== Profiling result:
Time(%)      Time  Calls       Avg       Min       Max  Name
 99.24%  146.81ms      1  146.81ms  146.81ms  146.81ms  sgemm(unsigned int, float*, float*, float*)
  0.76%  1.1184ms      1  1.1184ms  1.1184ms  1.1184ms  setmm(unsigned int, float*, float*, float*)
No calls to cudaMemcpy() or the like appear.
The wall time is similar to the version with manual copies!
$ time ./sgemm-unified 1024 16 16

real    0m0.454s
user    0m0.136s
sys     0m0.300s
$ time ./sgemm 1024 16 16

real    0m0.292s
user    0m0.012s
sys     0m0.280s
More work per thread: each thread now computes MULTI consecutive elements of a row of c, i.e. c[i][j], ..., c[i][j+MULTI-1].
__global__ void sgemm_multi(const uint N, const uint MULTI,
                            float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
    const uint i = blockIdx.y*blockDim.y + threadIdx.y;
    const uint j = MULTI*(blockIdx.x*blockDim.x + threadIdx.x);
    if (i<N && j<N)
        for (uint jj=j; jj<j+MULTI; ++jj)
            for (uint k=0; k<N; ++k)
                c[IX(i,jj)] += a[IX(i,k)] * b[IX(k,jj)];
}
dim3 grid_size_sgemm(DIV_CEIL(N,BX*MULTI), DIV_CEIL(N,BY));
dim3 block_size_sgemm(BX,BY);
sgemm_multi<<<grid_size_sgemm, block_size_sgemm>>>(N, MULTI, d_a, d_b, d_c);
Thread mapping: (i,j) <-> { (i,j*MULTI), ..., (i,j*MULTI+MULTI-1) }
$ for i in {1,2,4,8}; do ./sgemm-multi 1024 16 16 $i; done
max_diff: 0.000092
max_diff: 0.000092
max_diff: 0.000092
max_diff: 0.000092
$ for i in {1,2,4,8}; do nvprof ./sgemm-multi 1024 16 16 $i 2>&1 | grep sgemm_multi; done
6.8601ms  1  6.8601ms  6.8601ms  6.8601ms  sgemm_multi(unsigned int, unsigned int, float*, float*, float*)
7.0656ms  1  7.0656ms  7.0656ms  7.0656ms  sgemm_multi(unsigned int, unsigned int, float*, float*, float*)
16.195ms  1  16.195ms  16.195ms  16.195ms  sgemm_multi(unsigned int, unsigned int, float*, float*, float*)
19.135ms  1  19.135ms  19.135ms  19.135ms  sgemm_multi(unsigned int, unsigned int, float*, float*, float*)
There is no kernel performance improvement.
But there is one in communication: switching to cudaMallocHost() (pinned host memory) took the transfers from 2.5 GiB/s to 12 GiB/s.
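A minimal sketch of that change (the host buffer name h_a and the surrounding copies follow the manual-copy version and are illustrative, not taken from the original listing):

float *h_a = NULL;
checkCudaErrors(cudaMallocHost((void **)&h_a, SIZE * sizeof(float)));  /* page-locked (pinned) host buffer */
/* ... fill h_a, then copy as before, now at full PCIe bandwidth ... */
checkCudaErrors(cudaMemcpy(d_a, h_a, SIZE * sizeof(float), cudaMemcpyHostToDevice));
/* ... */
checkCudaErrors(cudaFreeHost(h_a));  /* pinned memory is released with cudaFreeHost */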
$ nvprof --print-gpu-trace ./sgemm-multi 1024 16 16 1
...
402.54ms  327.01us  4.0000MB  11.945GB/s  GeForce GTX TIT  [CUDA memcpy DtoH]
402.89ms  323.52us  4.0000MB  12.074GB/s  GeForce GTX TIT  [CUDA memcpy DtoH]
403.22ms  323.52us  4.0000MB  12.074GB/s  GeForce GTX TIT  [CUDA memcpy DtoH]
(NVIDIA, CUDA C Programming Guide)
__global__ void sgemm(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
    __shared__ float sa[B][B];  // tile of a
    __shared__ float sb[B][B];  // tile of b

    const uint ti = threadIdx.y; const uint tj = threadIdx.x;
    const uint i = blockIdx.y*B + ti;
    const uint j = blockIdx.x*B + tj;
    float cij = 0.0f;
    for (uint m=0; m<N/B; ++m) {
        sa[ti][tj] = a[IX(i,m*B+tj)];  // each thread loads one element of each tile
        sb[ti][tj] = b[IX(m*B+ti,j)];
        __syncthreads();               // wait until both tiles are fully loaded

        for (uint k=0; k<B; ++k)
            cij += sa[ti][k] * sb[k][tj];
        __syncthreads();               // wait before the tiles are overwritten
    }
    c[IX(i,j)] = cij;
}
We switch from a strategy of runtime const parameters to compile-time #define constants.
It simplifies a few things, but requires recompiling for each configuration.
#ifndef N
#define N 1024
#endif

#ifndef B
#define B 32
#endif

#define SIZE (N*N)
assert(0 == N%B); // evenly split
...

dim3 grid_size(N/B, N/B);
dim3 block_size(B,B);
...
sgemm<<<grid_size, block_size>>>(d_a, d_b, d_c);
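Each configuration is now chosen at compile time; a sketch of how that is typically driven (the source file name and extra flags are illustrative, not from the original):

$ nvcc -O2 -DN=2048 -DB=16 -o sgemm-shared sgemm-shared.cu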
Note that the kernel does not have the typical bounds check:

if (i<N && j<N) {
    ...
}

Since N is asserted to be a multiple of B, every thread maps to a valid element.
N=1024, C2070, peak 1.030 TFLOPS single precision.
Block   Shared   gputime   GFLOPS   Eff
8×8     0.5 KB   20.0 ms   100      10%
16×16   2 KB     12.1 ms   165      16%
32×32   8 KB     12.2 ms   163      16%
Performance climbed to 165 GFLOPS.
>>> ((2*1024**3)/0.0121) / (1<<30)
165.28925619834712
Version   GFLOPS   Efficiency
Trivial   60       6%
Shared    165      16%
N=1024, K40, peak 4.291 TFLOPS single precision.
Block   Shared   gputime   GFLOPS   Eff
8×8     0.5 KB   8.20 ms   243      5%
16×16   2 KB     5.30 ms   377      8%
32×32   8 KB     5.18 ms   386      8.5%
GFLOPS went up a lot, but efficiency dropped dramatically.
A typical case of "I tuned it for Fermi, but on Kepler it doesn't fly".
There is a baseline 2x speedup from a full Fermi to a full Kepler,
yet the efficiency is roughly 0.5x.
Moral: read the Kepler Tuning Guide.
N=1024, GTX Titan X, peak 6.144 TFLOPS single precision.
Block   Shared   gputime   GFLOPS   Eff
8×8     0.5 KB   3.07 ms   651      10.5%
16×16   2 KB     2.65 ms   754      12%
32×32   8 KB     2.67 ms   749      12%
Efficiency improved.
Hwu's trick (Section 6.6): reuse the sa tile (Md) for two or more sb tiles (Nd).
__global__ void sgemm_shared2(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
    __shared__ float sa[B][B];   // one tile of a, reused for both output tiles
    __shared__ float sb0[B][B];  // two tiles of b
    __shared__ float sb1[B][B];

    const uint ti = threadIdx.y;
    const uint tj = threadIdx.x;
    const uint i = blockIdx.y*B + ti;
    const uint j0 = (2*blockIdx.x+0)*B + tj;
    const uint j1 = (2*blockIdx.x+1)*B + tj;
    float cij0 = 0.0f, cij1 = 0.0f;
    for (uint m=0; m<N/B; ++m) {
        sa[ti][tj] = a[IX(i,m*B+tj)];
        sb0[ti][tj] = b[IX(m*B+ti,j0)];
        sb1[ti][tj] = b[IX(m*B+ti,j1)];
        __syncthreads();

        for (uint k=0; k<B; ++k) {
            cij0 += sa[ti][k] * sb0[k][tj];
            cij1 += sa[ti][k] * sb1[k][tj];
        }
        __syncthreads();
    }
    c[IX(i,j0)] = cij0;
    c[IX(i,j1)] = cij1;
}
Half as many blocks are launched along x: grid_size.x /= 2.
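A minimal launch sketch for this variant (assumed host code, consistent with the #define-based setup above but not shown in the original):

dim3 grid_size(N/(2*B), N/B);  // each block now produces two B×B tiles of c
dim3 block_size(B, B);
sgemm_shared2<<<grid_size, block_size>>>(d_a, d_b, d_c);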
For N=1024, B=16, running on a K40, compiled with nvcc-6.0.
The other possible block sizes (8 and 32) give worse results.
Granularity   Time     GFLOPS
1             5.36ms   373
2             4.36ms   458
4             3.92ms   510
Granularity   Regs   ShMem
1             25     2048
2             32     3072
4             37     5120
We improved, but we are still far from a decent efficiency.
For N=1024, B=16, running on a GTX Titan X, compiled with nvcc-7.5.
The other possible block sizes (8 and 32) give worse results.
Granularity   Time     GFLOPS
1             2.65ms   754
2             2.10ms   952
4             1.89ms   1058
Granularity   Regs   ShMem
1             23     2048
2             32     3072
4             32     5120
We broke the TFLOP barrier!
Fewer threads, but more blocks per SMX.
Vasily Volkov, Better performance at lower occupancy, GTC, 2010.
The first part is the same as the shared-memory code.
__global__ void sgemm_shared(float * __restrict__ a, float * __restrict__ b, float * __restrict__ c) {
    __shared__ float sa[B][B];
    __shared__ float sb[B][B];

    const uint ti = threadIdx.y;
    const uint tj = threadIdx.x;
    const uint i = blockIdx.y*B + ti;
    const uint j = blockIdx.x*B + tj;
Idea: do more work per thread, i.e. increase the granularity, but launch half as many threads along y: block_size.y /= 2.
    float cij[2] = {0.0f, 0.0f};
    for (uint m=0; m<N/B; ++m) {
        sa[ti][tj] = a[IX(i,m*B+tj)];          // each thread now loads two elements per tile
        sb[ti][tj] = b[IX(m*B+ti,j)];
        sa[ti+B/2][tj] = a[IX(i+B/2,m*B+tj)];
        sb[ti+B/2][tj] = b[IX(m*B+ti+B/2,j)];
        __syncthreads();

        for (uint k=0; k<B; ++k) {
            cij[0] += sa[ti][k] * sb[k][tj];
            cij[1] += sa[ti+B/2][k] * sb[k][tj];
        }
        __syncthreads();
    }
    c[IX(i,j)] = cij[0];
    c[IX(i+B/2,j)] = cij[1];
}
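For reference, a plausible launch for this granularity-2 variant (assumed host code, not shown in the original):

dim3 grid_size(N/B, N/B);   // same grid as before
dim3 block_size(B, B/2);    // half the threads along y; each thread computes two elements of c
sgemm_shared<<<grid_size, block_size>>>(d_a, d_b, d_c);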
Tradeoff between TLP (thread-level parallelism) and ILP (instruction-level parallelism).
For N=1024, B=32, on a K40, compiled with nvcc-6.0.
Granularity   Regs   ShMem   Time(ms)   GFLOPS
1             21     8192    5.18       386
2             31     8192    3.46       578
4             42     8192    2.73       732
8             49     8192    2.53       790
16            40     8192    22.89      87
We are back to ~17% efficiency.
For N=1024, B=32, on a GTX Titan X, compiled with nvcc-7.5.
Granularity   Regs   ShMem   Time(ms)   GFLOPS
1             21     8192    2.61       766
2             31     8192    1.67       1197
4             42     8192    1.37       1459
8             49     8192    1.17       1709
16            40     8192    1.79       1117
We raised the efficiency to 28%.
Wow!
...
#include <cublas_v2.h>
...
float beta = 0.0f;
cublasHandle_t handle;
cublasStatus_t status;
status = cublasCreate(&handle);
assert(status == CUBLAS_STATUS_SUCCESS);
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_a, N, d_b, N, &beta, d_c, N);
assert(status == CUBLAS_STATUS_SUCCESS);

checkCudaErrors(cudaMemcpy(h_a, d_a, SIZE*sizeof(float), cudaMemcpyDefault));
checkCudaErrors(cudaMemcpy(h_b, d_b, SIZE*sizeof(float), cudaMemcpyDefault));
checkCudaErrors(cudaMemcpy(h_c, d_c, SIZE*sizeof(float), cudaMemcpyDefault));
status = cublasDestroy(handle);
assert(status == CUBLAS_STATUS_SUCCESS);
...
We compile with:
$ nvcc blablebli -lcublas
$ ./sgemm-cublas 1024
...
26.04%  530.28us  1  530.28us  530.28us  530.28us  maxwell_sgemm_128x64_nn
3373 GFLOPS.
55% efficiency.
Moral: use the library!
Note that for N={2048, 4096} the efficiency climbs to 79%.
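The kernel time above comes from nvprof; a minimal sketch of measuring it in-program with CUDA events (assuming the handle, alpha/beta, and device buffers from the listing above; this timing code is not part of the original):

cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaEventRecord(start, 0));
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_a, N, d_b, N, &beta, d_c, N);
assert(status == CUBLAS_STATUS_SUCCESS);
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
float ms = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&ms, start, stop));
printf("GFLOPS: %f\n", (2.0 * N * N * (double)N / (ms * 1e-3)) / (1 << 30));  // same 2*N^3 operation count as above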
Which is more efficient for SGEMM, a CPU or a GPU?
Compare against what was achieved on the CPU.