Nicolás Wolovick, 2014-04-29
sections
The sections construct is compiled into a switch/case.
#pragma omp parallel sections num_threads(4)
{
    #pragma omp section
    {
        printf("1st lexical section, tid %d\n", omp_get_thread_num());
    }
    #pragma omp section
    {
        printf("2nd lexical section, tid %d\n", omp_get_thread_num());
    }
}
A first approximation: GCC 4.6.3 GNU OpenMP Manual, Implementing Sections Construct.
sections, gcc expansion: gcc -fopenmp -fdump-tree-ompexp parallel-sections.c
Thanks, Diego Novillo!
main() {
    ...
    GOMP_parallel_sections_start (main._omp_fn.0, 0B, 4, 2);
    main._omp_fn.0 (0B);
    GOMP_parallel_end ();
    ...
}
...
main._omp_fn.0 (void * .omp_data_i) {
    for (i = GOMP_sections_start(2); i != 0; i = GOMP_sections_next()) {
        switch (i) {
        case 2:
            printf("2nd lexical section, tid %d\n", omp_get_thread_num());
            break;
        case 1:
            printf("1st lexical section, tid %d\n", omp_get_thread_num());
            break;
        case 0:
            GOMP_sections_end_nowait();
            break;
        }
    }
}
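The GOMP_sections_start/GOMP_sections_next pair can be pictured as a shared countdown handed out under a lock. A minimal sketch of that idea (the my_* names are hypothetical and the state handling is simplified; real libgomp keeps this in a per-team work-share structure):

#include <pthread.h>

static unsigned int remaining;          /* section ids not yet handed out */
static int started = 0;
static pthread_mutex_t ws_lock = PTHREAD_MUTEX_INITIALIZER;

/* First call in the work-share: arm the counter, then take an id. */
unsigned int my_sections_start(unsigned int count)
{
    unsigned int id;
    pthread_mutex_lock(&ws_lock);
    if (!started) { remaining = count; started = 1; }
    id = remaining > 0 ? remaining-- : 0;   /* 0 means "no sections left" */
    pthread_mutex_unlock(&ws_lock);
    return id;
}

/* Threads keep calling this until it returns 0. */
unsigned int my_sections_next(void)
{
    unsigned int id;
    pthread_mutex_lock(&ws_lock);
    id = remaining > 0 ? remaining-- : 0;
    pthread_mutex_unlock(&ws_lock);
    return id;
}

Note the ids come out in decreasing order, which matches the dump dispatching case 2 before case 1.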
single
#pragma omp parallel
{
    printf("Parallel section, tid %d\n", omp_get_thread_num());
    #pragma omp single
    {
        printf("Single section, tid %d\n", omp_get_thread_num());
    }
}
We obtain an early gcc transformation: -fdump-tree-omplower.
single, gcc expansion: gcc -fopenmp -fdump-tree-omplower parallel-single.c
#pragma omp parallel [child fn: main._omp_fn.0 (???)]
{
    printf("Parallel section, tid %d\n", omp_get_thread_num());
    _Bool D.2133;

    #pragma omp single
    D.2133 = GOMP_single_start();
    if (D.2133 == 1) goto <D.2131>; else goto <D.2132>;
    <D.2131>:
    printf("Single section, tid %d\n", omp_get_thread_num());
    <D.2132>:
    #pragma omp return
}
#pragma omp return
}
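GOMP_single_start() returns true in exactly one thread of the team and false in the rest. A plausible sketch using an atomic compare-and-swap on a flag (an assumption for illustration; libgomp actually advances a per-team counter so consecutive single regions can reuse the state):

#include <stdbool.h>

/* Sketch: exactly one thread wins the 0 -> 1 transition and returns
 * true; the rest return false and skip the single block. */
static int single_taken = 0;

bool my_single_start(void)
{
    int expected = 0;
    return __atomic_compare_exchange_n(&single_taken, &expected, 1,
                                       false, __ATOMIC_ACQ_REL,
                                       __ATOMIC_ACQUIRE);
}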
unsigned int i = 0;
#pragma omp parallel for
for (i = 0; i < N; ++i) {
    a[i] *= a[i];
}
GCC 4.6.3 GNU OpenMP Manual, Implementing the for Construct
gcc -fopenmp -fdump-tree-ompexp parallel-for.c
main._omp_fn.0 (void * .omp_data_i)
{
    unsigned int chunk_size = DIV_CEIL(1048576, omp_get_num_threads());
    unsigned int tid = omp_get_thread_num();
    lower = tid * chunk_size;
    upper = MIN((tid+1)*chunk_size, 1048576);
    for (i = lower; i < upper; ++i) {
        a[i] *= a[i];
    }

    return;
}
I had to reconstruct the code because -fdump-tree-omplower showed nothing and -fdump-tree-ompexp already had everything unrolled into 3-operand instructions.
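To see the arithmetic concretely, here is a small standalone sketch (my own, not compiler output) that reproduces the DIV_CEIL/MIN split above for N = 1048576 and 4 threads:

#include <stdio.h>

#define N 1048576u

/* Static split: thread tid out of nthreads gets [lower, upper)
 * with chunk = ceil(N / nthreads). */
int main(void)
{
    unsigned int nthreads = 4;
    unsigned int chunk = (N + nthreads - 1) / nthreads;   /* DIV_CEIL */
    for (unsigned int tid = 0; tid < nthreads; ++tid) {
        unsigned int lower = tid * chunk;
        unsigned int upper = (tid+1)*chunk < N ? (tid+1)*chunk : N;   /* MIN */
        printf("tid %u: [%u, %u)\n", tid, lower, upper);
    }
    return 0;
}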
schedule(dynamic, 32)
main._omp_fn.0 (void * .omp_data_i)
{
    unsigned int start, end;
    while (GOMP_loop_dynamic_next(&start, &end)) {
        for (i = start; i < end; ++i) {
            a[i] *= a[i];
        }
    }
    return;
}
Again reconstructed from -fdump-tree-ompexp.
Note that this is somewhat simpler than a thread pool.
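In that spirit, GOMP_loop_dynamic_next can be pictured as a fetch-and-add on a shared index. A minimal sketch under that assumption (my_loop_dynamic_next is a hypothetical stand-in, not libgomp's code):

#include <stdbool.h>

#define N     1048576u
#define CHUNK 32u

static unsigned int next_index = 0;   /* shared across the team */

/* Hand out the next chunk [start, end); false once the iteration
 * space is exhausted. */
bool my_loop_dynamic_next(unsigned int *start, unsigned int *end)
{
    unsigned int s = __atomic_fetch_add(&next_index, CHUNK, __ATOMIC_RELAXED);
    if (s >= N)
        return false;
    *start = s;
    *end = s + CHUNK < N ? s + CHUNK : N;
    return true;
}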
for with reduction
...
unsigned int i = 0;
float sum = 0.0f;
#pragma omp parallel for reduction(+:sum)
for (i = 0; i < N; i++) {
    sum += a[i];
}
...
This time we use an earlier pass (-fdump-tree-omplower) to see how it transforms.
reduction, gcc expansion: gcc -fopenmp -fdump-tree-omplower parallel-reduction.c
i = 0;
sum = 0.0;
{
    .omp_data_o.1.sum = sum;
    {
        #pragma omp parallel reduction(+:sum) [child fn: main._omp_fn.0 (.omp_data_o.1)]
        {
            .omp_data_i = &.omp_data_o.1;
            sum = 0.0;
            {
                unsigned int i;

                #pragma omp for nowait private(i)
                for (i = 0; i <= 1048575; i = i + 1)
                    sum += a[i];
                #pragma omp continue (i, i)
                #pragma omp return(nowait)
            }
            D.2136 = &.omp_data_i->sum;
            #pragma omp atomic_load
            D.2135 = *D.2136;
            D.2137 = D.2135 + sum;
            #pragma omp atomic_store (D.2137)
            #pragma omp return
        }
    }
    sum = .omp_data_o.1.sum;
}
The pair
#pragma omp atomic_load
D.2135 = *D.2136;
D.2137 = D.2135 + sum;
#pragma omp atomic_store (D.2137)
is implemented with load-linked/store-conditional, an optimistic-concurrency primitive usually found in RISC microprocessors.
On x86, compare&exchange is used: LOCK CMPXCHG.
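For illustration, a sketch of such an atomic float addition with GCC's __atomic builtins (my own reconstruction, not libgomp's code; the float travels as a 32-bit integer, and the compare-exchange builtin compiles to LOCK CMPXCHG on x86, or an LL/SC loop on many RISC cores):

#include <stdint.h>
#include <string.h>

/* Add val to *target atomically via a compare-exchange retry loop. */
void atomic_add_float(float *target, float val)
{
    uint32_t old_bits, new_bits;
    float old_val, new_val;
    do {
        __atomic_load((uint32_t *)target, &old_bits, __ATOMIC_ACQUIRE);
        memcpy(&old_val, &old_bits, sizeof old_val);   /* bits -> float */
        new_val = old_val + val;
        memcpy(&new_bits, &new_val, sizeof new_bits);  /* float -> bits */
    } while (!__atomic_compare_exchange_n((uint32_t *)target, &old_bits,
                                          new_bits, false,
                                          __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE));
}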
(Using OpenMP, Chapter 5)
nowait
ordered
master vs. single
taskset
Distributing memory across processors: numactl

One may be worried about the creation of new threads within the inner loop. Worry not, the libgomp in GCC is smart enough to actually only create the threads once. Once the team has done its work, the threads are returned into a "dock", waiting for new work to do. In other words, the number of times the clone system call is executed is exactly equal to the maximum number of concurrent threads. The parallel directive is not the same as a combination of pthread_create and pthread_join. There will be lots of locking/unlocking due to the implied barriers, though. I don't know if that can be reasonably avoided or whether it even should.
(Joel Yliluoma, Guide into OpenMP)
So the example from Using OpenMP..., Fig. 5.24 is not that serious after all.
In any case, it can improve thread locality, and with it the reuse of the local caches.
#pragma omp parallel
{
    printf("1st parallel, tid %d\n", omp_get_thread_num());
}

printf("In the middle, tid %d\n", omp_get_thread_num());

#pragma omp parallel
{
    printf("2nd parallel, tid %d\n", omp_get_thread_num());
}
Let's see how many clone calls it makes:
$ OMP_NUM_THREADS=4 strace ./a.out 2>&1 | grep clone
clone(child_stack=0x7f930eed2f70, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7f930eed39d0, tls=0x7f930eed3700, child_tidptr=0x7f930eed39d0) = 6318
clone(child_stack=0x7f930e6d1f70, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7f930e6d29d0, tls=0x7f930e6d2700, child_tidptr=0x7f930e6d29d0) = 6319
clone(child_stack=0x7f930ded0f70, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7f930ded19d0, tls=0x7f930ded1700, child_tidptr=0x7f930ded19d0) = 6320
Indeed, GOMP_parallel_start() and GOMP_parallel_end() leave the threads "docked" for reuse.
The remaining cost, then, is in the barriers and/or in the migration of threads to other CPUs.
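The migration part can be curbed by pinning the team; for instance, libgomp honors the GOMP_CPU_AFFINITY environment variable (OpenMP 3.1 later adds OMP_PROC_BIND). Worth benchmarking rather than taking on faith, e.g.:

$ GOMP_CPU_AFFINITY="0 1 2 3" OMP_NUM_THREADS=4 ./a.out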
Plots from: Reid, Bull, OpenMP Microbenchmarks Version 2.0, 2004. For a Sun Fire 15K.
[Figures: overhead measurements on 8 processors (two plots); array of 729 elements.]
Joseph Harkness, Extending the EPCC OpenMP Microbenchmarks for OpenMP 3.0, University of Edinburgh, 2010.
sgemv (Using OpenMP, p.162)
Parallel sgemv:

#include <stdio.h>
#include <omp.h>

#ifndef N
#define N 1024
#endif

float a[N][N], b[N], c[N];

int main(void)
{
    unsigned int i = 0, j = 0;
    double start = 0.0;

    start = omp_get_wtime();
    #pragma omp parallel for default(none) shared(start,a,b,c) private(i,j)
    for (i = 0; i < N; ++i)
        for (j = 0; j < N; ++j)
            c[i] += a[i][j]*b[j];
    printf("%f", ((long)N*N*3*sizeof(float))/((1<<30)*(omp_get_wtime()-start)));

    return 0;
}
Compilation and execution:

gcc -O3 -fopenmp $PROG.c -o $PROG -DN="$n"
OMP_NUM_THREADS=$t taskset 0x0000000F numactl --interleave=all ./$PROG

(taskset 0x0000000F pins the run to CPUs 0-3; numactl --interleave=all spreads its pages over all NUMA nodes.)
[Figures: performance and efficiency on mini: Intel Core i7-950@3.07GHz (4 cores, 8 threads), 16GB DDR3 1066MHz; and on ganesh: 4x AMD Opteron 8212@2.0GHz (2 cores each), 4*8GB.]

This somewhat surprising result is of course specific to the algorithm, implementation, system, and software used and would not be possible if the code were not so amenable to compiler analysis. The lesson to be learned from this study is that, for important program regions, both experimentation and analysis are needed. We hope that the insights given here are sufficient for a programmer to get started on this process.
(Using OpenMP, p.190)
(M. Süß, C. Leopold, Common Mistakes in OpenMP and How To Avoid Them, 2006.)
Typo in the directive name, silently ignored by the compiler: #pragma opm parallel for.
Forgetting to declare variables private.
Forgetting the braces { ... }: a construct applies only to the statement that follows, as below.

#pragma omp parallel
#pragma omp atomic
sum += a[omp_get_num_threads()];
++a[omp_get_num_threads()];
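A corrected sketch (assuming omp_get_num_threads() was the intent): braces put both statements inside the region, and each shared update gets its own atomic, since the directive protects only the next statement:

#pragma omp parallel
{
    /* Both updates touch shared data, so each needs its own atomic. */
    #pragma omp atomic
    sum += a[omp_get_num_threads()];
    #pragma omp atomic
    ++a[omp_get_num_threads()];
}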
After the week and a half without classes (May 1st, 3EAGPGPU).