Nicolás Wolovick, 20140617
OpenMP 4.0 and OpenACC 2.0 are converging.
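As a rough illustration of that convergence (a sketch of ours, not from the slides): the OpenMP 4.0 target/teams directives play roughly the same role as OpenACC's parallel loop. The saxpy functions below are made-up examples, not code from the lecture.

/* saxpy offloaded with OpenACC 2.0 */
void saxpy_acc(int n, float a, const float *x, float *y) {
    #pragma acc parallel loop copyin(x[0:n]) copy(y[0:n])
    for (int i = 0; i < n; ++i)
        y[i] += a * x[i];
}

/* the same loop with the OpenMP 4.0 accelerator directives */
void saxpy_omp4(int n, float a, const float *x, float *y) {
    #pragma omp target teams distribute parallel for \
            map(to: x[0:n]) map(tofrom: y[0:n])
    for (int i = 0; i < n; ++i)
        y[i] += a * x[i];
}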
OpenACC   CUDA
gang      block
worker    warp
vector    thread
These levels of parallelism are expressed inside a compute region (parallel or kernels).
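A sketch of ours (not in the slides) of how the three levels can be requested explicitly inside a compute region; the function, array layout and num_gangs/num_workers/vector_length values are only illustrative.

/* gang -> block, worker -> warp, vector -> thread, as in the table above.
   n, a, b, c are illustrative; the sizes below are not tuned. */
void add_matrices(int n, const float *restrict a, const float *restrict b,
                  float *restrict c) {
    #pragma acc parallel num_gangs(n) num_workers(4) vector_length(128) \
                copyin(a[0:n*n], b[0:n*n]) copyout(c[0:n*n])
    {
        #pragma acc loop gang
        for (int i = 0; i < n; ++i) {
            #pragma acc loop worker vector
            for (int j = 0; j < n; ++j)
                c[i*n+j] = a[i*n+j] + b[i*n+j];
        }
    }
}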
sgemm

#include <stdio.h>
#include <omp.h>

#define N 1024
float a[N][N], b[N][N], c[N][N];

int main(void) {
    unsigned int i = 0, j = 0, k = 0;
    double start = 0.0;

    start = omp_get_wtime();
    for (i=0; i<N; ++i)
        for (j=0; j<N; ++j)
            for (k=0; k<N; ++k)
                c[i][j] += a[i][k]*b[k][j];
    printf("%f", ((long)2*N*N*N)/((1<<30)*(omp_get_wtime()-start)));

    return 0;
}
$ pgcc -fast -Minfo sgemm.c && ./a.out
main:
    16, Loop interchange produces reordered loop nest: 16,18,17
    17, Generated an alternate version of the loop
        Generated vector sse code for the loop
        Generated 2 prefetch instructions for the loop
2.874675
start = omp_get_wtime();
#pragma omp parallel for shared(a,b,c,start) private(i,j,k)
for (i=0; i<N; ++i)
    for (j=0; j<N; ++j)
        for (k=0; k<N; ++k)
            c[i][j] += a[i][k]*b[k][j];
printf("%f", ((long)2*N*N*N)/((1<<30)*(omp_get_wtime()-start)));
$ pgcc -fast -mp -Minfo sgemm_openmp.c && OMP_NUM_THREADS=4 ./a.out
main:
17, Parallel region activated
Parallel loop activated with static block schedule
Loop interchange produces reordered loop nest: 17,19,18
18, Generated an alternate version of the loop
Generated vector sse code for the loop
Generated 2 prefetch instructions for the loop
21, Barrier
Parallel region terminated
28.11827
If we switch to the ikj loop order, we get 5.76 GFLOPS on the CPU, but no improvement in the multicore version.
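For reference, the ikj ordering (same arrays and indices as the listing above) is just a permutation of the nest; the innermost loop then walks c[i][j] and b[k][j] contiguously, which is what lets the compiler vectorize it.

/* ikj ordering: the innermost loop is over j, so the accesses to
   c[i][j] and b[k][j] are contiguous and vectorizable. */
for (i=0; i<N; ++i)
    for (k=0; k<N; ++k)
        for (j=0; j<N; ++j)
            c[i][j] += a[i][k]*b[k][j];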
start = omp_get_wtime();
#pragma acc kernels
for (i=0; i<N; ++i)
    for (j=0; j<N; ++j)
        for (k=0; k<N; ++k)
            c[i][j] += a[i][k]*b[k][j];
printf("%f", ((long)2*N*N*N)/((1<<30)*(omp_get_wtime()-start)));
$ pgcc -acc -ta=nvidia -Minfo sgemm_openacc.c && ./a.out
main:
    18, Generating copy(c[0:][0:])
        Generating copyin(a[0:][0:])
        Generating copyin(b[0:][0:])
        Generating compute capability 1.0 binary
        Generating compute capability 2.0 binary
    19, Loop is parallelizable
    20, Loop is parallelizable
    21, Complex loop carried dependence of 'c' prevents parallelization
        Loop carried dependence of 'c' prevents parallelization
        Loop carried backward dependence of 'c' prevents vectorization
        Inner sequential loop scheduled on accelerator
        Accelerator kernel generated
        19, #pragma acc loop gang /* blockIdx.y */
        20, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
        21, CC 1.0 : 14 registers; 40 shared, 4 constant, 0 local memory bytes
            CC 2.0 : 24 registers; 0 shared, 56 constant, 0 local memory bytes
8.827215
Let's look at the details with a special timing option.
$ pgcc -acc -ta=nvidia,time sgemm_openacc.c && ./a.out

Accelerator Kernel Timing data
/home/nicolasw/teoricos/Clase22_20140617/sgemm/sgemm_openacc.c
  main
    18: region entered 1 time
        time(us): total=230,057 init=194,436 region=35,621
                  kernels=17,453 data=6,963
        w/o init: total=35,621 max=35,621 min=35,621 avg=35,621
        21: kernel launched 1 times
            grid: [8x1024]  block: [128]
            time(us): total=17,453 max=17,453 min=17,453 avg=17,453
8.692020
Entering the region takes a long time. Let's take a look with nvprof.
$ nvprof ./a.out
==25564== NVPROF is profiling process 25564, command: ./a.out
5.428648==25564== Profiling application: ./a.out
==25564== Profiling result:
Time(%)      Time  Calls       Avg       Min       Max  Name
 83.96%  17.513ms      1  17.513ms  17.513ms  17.513ms  main_21_gpu
 10.08%  2.1025ms     12  175.20us  173.83us  179.11us  [CUDA memcpy HtoD]
  5.96%  1.2441ms      1  1.2441ms  1.2441ms  1.2441ms  [CUDA memcpy DtoH]
With create we tell it: "allocate the memory, but don't copy anything".
start = omp_get_wtime();
#pragma acc data create(a,b,c)
{
#pragma acc kernels
for (i=0; i<N; ++i)
    for (j=0; j<N; ++j)
        for (k=0; k<N; ++k)
            c[i][j] += a[i][k]*b[k][j];
}
printf("%f", ((long)2*N*N*N)/((1<<30)*(omp_get_wtime()-start)));
Does it work? Yes.
$ pgcc -acc -ta=nvidia sgemm_openacc.c && nvprof ./a.out
==30089== NVPROF is profiling process 30089, command: ./a.out
5.601192==30089== Profiling application: ./a.out
==30089== Profiling result:
Time(%)      Time  Calls       Avg       Min       Max  Name
100.00%  17.691ms      1  17.691ms  17.691ms  17.691ms  main_21_gpu
No more copies!
$ pgcc -acc -ta=nvidia,time sgemm_openacc.c && nvprof ./a.out
==28882== NVPROF is profiling process 28882, command: ./a.out
5.551359
Accelerator Kernel Timing data
/home/nicolasw/teoricos/Clase22_20140617/sgemm/sgemm_openacc.c
==28882== main
    18: region entered 1 time
Profiling application: ./a.out
        time(us): total=28,791
                  kernels=17,634
        21: kernel launched 1 times
            grid: [8x1024]  block: [128]
            time(us): total=17,634 max=17,634 min=17,634 avg=17,634
/home/nicolasw/teoricos/Clase22_20140617/sgemm/sgemm_openacc.c
  main
    16: region entered 1 time
        time(us): total=360,232 init=330,605 region=29,627
        w/o init: total=29,627 max=29,627 min=29,627 avg=29,627
==28882== Profiling result:
Time(%)      Time  Calls       Avg       Min       Max  Name
100.00%  17.602ms      1  17.602ms  17.602ms  17.602ms  main_21_gpu

==28882== API calls:
Time(%)      Time  Calls       Avg       Min       Max  Name
 84.07%  156.02ms     1  156.02ms  156.02ms  156.02ms  cuCtxCreate
Context creation and the rest of the setup still fall inside the timed region, so nothing improves there.
Note that the kernel takes about 17 ms, while the best we had achieved by exploring the 16x16 block size was 16 ms.
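If we wanted to push the OpenACC schedule toward that hand-tuned shape, OpenACC 2.0 offers the tile clause. A sketch of ours (not tried in the lecture; compiler support for tile varied at the time):

/* Sketch: ask for a 16x16 tiling of the two outer loops,
   analogous to the 16x16 CUDA block explored by hand. */
#pragma acc kernels
#pragma acc loop tile(16,16)
for (i=0; i<N; ++i)
    for (j=0; j<N; ++j)
        for (k=0; k<N; ++k)
            c[i][j] += a[i][k]*b[k][j];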
laplace2d, init

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <omp.h>

#define N 4096
#define ITER_MAX 1000

float A[N][N], Anew[N][N];

int main(int argc, char **argv)
{
    const float tol = 1.0e-5f;
    float error = 1.0f;

    memset(A, 0, N*N*sizeof(float));

    // set boundary conditions
    for (int i=0; i<N; ++i) {
        A[0][i] = Anew[0][i] = 0.0f;
        A[N-1][i] = Anew[N-1][i] = 0.0f;
        float y0 = sinf(M_PI * i / (N-1));
        A[i][0] = Anew[i][0] = y0;
        A[i][N-1] = Anew[i][N-1] = y0*expf(-M_PI);
    }
laplace2d, loop

    printf("Jacobi relaxation computation: %d x %d mesh\n", N, N);
    double start = omp_get_wtime();
    int iter = 0;
    #pragma acc data copy(A, Anew)
    while (tol<error && iter<ITER_MAX) {
        error = 0.f;
        #pragma omp parallel for shared(Anew, A)
        #pragma acc kernels
        for(int j=1; j<N-1; ++j)
            for(int i=1; i<N-1; ++i) {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }

        #pragma omp parallel for shared(Anew, A)
        #pragma acc kernels
        for(int j = 1; j<N-1; ++j)
            for(int i = 1; i<N-1; ++i)
                A[j][i] = Anew[j][i];

        if(iter%100==0) printf("%5d, %0.6f\n", iter, error);

        ++iter;
    }

    printf("Total time: %f s\n", omp_get_wtime()-start);
}
$ pgcc -fast -Minfo laplace2d.c && ./a.out
main:
    34, Loop not vectorized: data dependency
    51, Generated an alternate version of the loop
        Generated vector sse code for the loop
        Generated 3 prefetch instructions for the loop
    60, Memory copy idiom, loop replaced by call to __c_mcopy4
Jacobi relaxation computation: 4096 x 4096 mesh
    0, 0.250000
  100, 0.002397
  200, 0.001204
  300, 0.000804
  400, 0.000603
  500, 0.000483
  600, 0.000403
  700, 0.000345
  800, 0.000302
  900, 0.000269
Total time: 32.392654 s
The code is interesting: a single source is both OpenMP and OpenACC!
$ pgcc -fast -mp -Minfo laplace2d.c && OMP_NUM_THREADS=4 ./a.out
main:
    34, Loop not vectorized: data dependency
    50, Parallel region activated
        Parallel loop activated with static block schedule
    51, Generated an alternate version of the loop
        Generated vector sse code for the loop
        Generated 3 prefetch instructions for the loop
    57, Barrier
        Parallel region terminated
    59, Parallel region activated
        Parallel loop activated with static block schedule
    60, Memory copy idiom, loop replaced by call to __c_mcopy4
    63, Barrier
        Parallel region terminated
Jacobi relaxation computation: 4096 x 4096 mesh
    0, 0.249956
  100, 0.002397
  200, 0.001201
  300, 0.000804
  400, 0.000603
  500, 0.000483
  600, 0.000403
  700, 0.000344
  800, 0.000302
  900, 0.000268
Total time: 30.492380 s
OMP_NUM_THREADS   Total time (s)
1                 32.11
2                 29.29
4                 30.48
This scales very badly, considerably worse than what Mark Harris reports.
There must be false-sharing problems with the error variable.
We did not put much effort into fixing it.
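A plausible fix (which we did not try) is to make the max an explicit OpenMP reduction instead of having every thread update a shared error; reduction(max:...) exists since OpenMP 3.1. Same arrays and loop as the listing above:

/* Same stencil loop, but error is accumulated per thread and
   combined at the end by the OpenMP runtime. */
#pragma omp parallel for shared(Anew, A) reduction(max:error)
#pragma acc kernels
for(int j=1; j<N-1; ++j)
    for(int i=1; i<N-1; ++i) {
        Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                             + A[j-1][i] + A[j+1][i]);
        error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
    }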
Moving to the accelerator only requires changing the compilation switch.
$ pgcc -fast -acc -Minfo laplace2d.c && ./a.out
main:
    34, Loop not vectorized: data dependency
    46, Loop not vectorized/parallelized: potential early exits
    49, Generating copyin(A[0:][0:])
        Generating copyout(Anew[1:4094][1:4094])
        Generating compute capability 1.0 binary
        Generating compute capability 2.0 binary
    50, Loop is parallelizable
    51, Loop is parallelizable
        Accelerator kernel generated
        50, #pragma acc loop gang /* blockIdx.y */
        51, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
            CC 1.0 : 13 registers; 48 shared, 36 constant, 0 local memory bytes
            CC 2.0 : 17 registers; 0 shared, 64 constant, 0 local memory bytes
    54, Max reduction generated for error
    58, Generating copyout(A[1:4094][1:4094])
        Generating copyin(Anew[1:4094][1:4094])
        Generating compute capability 1.0 binary
        Generating compute capability 2.0 binary
    59, Loop is parallelizable
    60, Loop is parallelizable
        Accelerator kernel generated
        59, #pragma acc loop gang /* blockIdx.y */
        60, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
            CC 1.0 : 9 registers; 32 shared, 8 constant, 0 local memory bytes
            CC 2.0 : 12 registers; 0 shared, 48 constant, 0 local memory bytes
Jacobi relaxation computation: 4096 x 4096 mesh
    0, 0.250000
  100, 0.002397
  200, 0.001204
  300, 0.000804
  400, 0.000603
  500, 0.000483
  600, 0.000403
  700, 0.000345
  800, 0.000302
  900, 0.000269
Total time: 85.691350 s
We use -ta=nvidia,time at compile time, or nvprof.
$ nvprof ./a.out
Jacobi relaxation computation: 4096 x 4096 mesh
==10090== NVPROF is profiling process 10090, command: ./a.out
    0, 0.250000
  100, 0.002397
  200, 0.001204
  300, 0.000804
  400, 0.000603
  500, 0.000483
  600, 0.000403
  700, 0.000345
  800, 0.000302
  900, 0.000269
Total time: 97.484813 s
==10090== Profiling application: ./a.out
==10090== Profiling result:
Time(%)      Time    Calls       Avg       Min       Max  Name
 53.09%  37.4887s  4159000  9.0130us  2.4490us  291.36us  [CUDA memcpy HtoD]
 42.58%  30.0665s     3000  10.022ms  2.3040us  30.313ms  [CUDA memcpy DtoH]
  2.67%  1.88225s     1000  1.8823ms  1.8806ms  1.8837ms  main_51_gpu
  1.32%  932.87ms     1000  932.87us  930.89us  935.21us  main_60_gpu
  0.34%  243.13ms     1000  243.13us  242.24us  244.32us  main_54_gpu_red

==10090== API calls:
Time(%)      Time    Calls       Avg       Min       Max  Name
 47.32%  31.4789s     2000  15.739ms  14.886ms  31.013ms  cuMemcpy2DUnaligned
 30.21%  20.1015s  4159000  4.8330us  3.7080us  21.260ms  cuMemcpyHtoDAsync
 13.34%  8.87601s    64000  138.69us  2.1800us  1.0368ms  cuEventSynchronize
  3.21%  2.13362s     1000  2.1336ms  2.0004ms  2.3621ms  cuMemcpyDtoHAsync
  2.55%  1.69452s     4000  423.63us  2.6570us  1.9727ms  cuCtxSynchronize
1 printf("Jacobi relaxation computation: %d x %d mesh\n", N, N);
2 double start = omp_get_wtime();
3 int iter = 0;
4 #pragma acc data copy(A, Anew)
5 while (tol<error && iter<ITER_MAX) {
6 ...
7 ...
8 if(iter%100==0) printf("%5d, %0.6f\n", iter, error);
9
10 ++iter;
11 }
12
13 printf("Total time: %f s\n", omp_get_wtime()-start);
14 }
The data copy(A, Anew) clause puts the data on the accelerator when the region is entered and brings it back when the region exits.
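For reference (a sketch of ours, not in the original listing): the same clause can also be written with explicit subarray shapes, which is how we would move only part of an array.

/* Equivalent explicit form: copy the full N x N arrays.
   The [start:length] shape syntax also allows moving just a sub-block. */
#pragma acc data copy(A[0:N][0:N], Anew[0:N][0:N])
{
    /* ... the while loop with the two kernels regions ... */
}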
$ pgcc -fast -acc laplace2d.c && ./a.out
Jacobi relaxation computation: 4096 x 4096 mesh
    0, 0.250000
  100, 0.002397
  200, 0.001204
  300, 0.000804
  400, 0.000603
  500, 0.000483
  600, 0.000403
  700, 0.000345
  800, 0.000302
  900, 0.000269
Total time: 3.523407 s
Looking at it under the nvprof magnifying glass.
Total time: 3.795374 s
==25974== Profiling application: ./a.out
==25974== Profiling result:
Time(%)      Time  Calls       Avg       Min       Max  Name
 60.13%  1.88402s   1000  1.8840ms  1.8827ms  1.8851ms  main_51_gpu
 29.78%  933.10ms   1000  933.09us  930.98us  935.08us  main_60_gpu
  7.79%  244.00ms   1000  244.00us  243.01us  245.76us  main_54_gpu_red
  1.37%  42.868ms   1002  42.782us  2.6560us  25.338ms  [CUDA memcpy DtoH]
  0.93%  29.086ms   1128  25.785us  4.2240us  224.83us  [CUDA memcpy HtoD]

==25974== API calls:
Time(%)      Time  Calls       Avg       Min       Max  Name
 59.75%  2.13858s   1000  2.1386ms  2.0301ms  2.4079ms  cuMemcpyDtoHAsync
 26.60%  951.99ms   4002  237.88us  1.2860us  1.0457ms  cuCtxSynchronize
  4.24%  151.83ms      1  151.83ms  151.83ms  151.83ms  cuCtxCreate
sgemm
We obtained a kernel that performs almost as well as the best CUDA kernel that does not use shared memory.
laplace2d
After a few modifications we get 30 s / 3.5 s = 8.57x.
With minimal effort
(apart from the US$759 it costs to have the compiler version frozen for one year).
gcc
For OpenMP 4.0, #pragma omp simd is supported as of gcc-4.9.
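A minimal example of ours (not from the slides) of what #pragma omp simd looks like; it should build with gcc-4.9 and -fopenmp:

/* Ask the compiler to vectorize the loop explicitly. */
void scale(int n, float a, float *x) {
    #pragma omp simd
    for (int i = 0; i < n; ++i)
        x[i] *= a;
}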