Nicolás Wolovick 20200415
¿Estrategias de paralelismo de grano fino? aka
¿Cómo usamos vectores para sumar?
[1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f]
[1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f]
vdpps
parece overkill, debe tener una latencia grande (sí, la tiene).
Me quedo con vhaddps
. Idea: haddps(haddps(v,v), haddps(v,v))
1 v = [v3, v2, v1, v0]
2
3 hadd(v,v)
4
5 r = [v2+v3, v0+v1, v2+v3, v0+v1]
6
7 hadd(r,r)
8
9 [v2+v3+v0+v1, v2+v3+v0+v1, v2+v3+v0+v1, v2+v3+v0+v1]
Tengo dos mitades de 128b con el sum broadcast en todos sus lanes.
reducesum_vhadd
/*
 * Sum-reduce N floats (N a multiple of 8, a 32-byte aligned) using the
 * horizontal-add strategy: hadd(hadd(v,v), hadd(v,v)) broadcasts the sum
 * of each 128-bit half across all lanes of that half; the two halves are
 * then added and lane 0 extracted.
 * NOTE(review): slide shows this is ~2x slower than plain vertical adds
 * on Haswell because vhaddps has high latency.
 */
static float reducesum_vhadd(const float *a, unsigned int N) {
    float s = 0.0f;
    for (unsigned int i = 0; i < N; i += 8) {
        __m256 v = _mm256_load_ps(&a[i]);  /* requires 32-byte alignment */

        /* After the double hadd, every lane of each 128-bit half holds
         * the sum of that half's four elements. */
        __m256 psum = _mm256_hadd_ps(_mm256_hadd_ps(v, v),
                                     _mm256_hadd_ps(v, v));

        /* Add low and high 128-bit halves, extract lane 0 as a scalar. */
        s += _mm_cvtss_f32(_mm_add_ps(_mm256_extractf128_ps(psum, 0),
                                      _mm256_extractf128_ps(psum, 1)));
    }
    return s;
}
/*
 * Sum-reduce N floats (N a multiple of 8, a 32-byte aligned) by keeping
 * 8 independent vertical partial sums and folding them once at the end.
 * The slide's measurements show this beats the hadd-per-iteration version.
 * Returns the truncated-to-int sum, as in the original slide code.
 */
static int reducesum_vadd(const float *a, unsigned int N) {
    __m256 vsum = {0.0f};  /* vector-literal init zeroes all 8 lanes */
    float s = 0.0f;
    for (unsigned int i = 0; i < N; i += 8) {
        __m256 v = _mm256_load_ps(&a[i]);
        vsum = _mm256_add_ps(vsum, v);  /* 8 independent accumulators */
    }
    /* { vsum[j] = \sum{a[i] : 0<=i<N, i%8 == j}, 0<=j<8 } */
    for (unsigned int i = 0; i < 8; ++i)
        s += vsum[i];  /* GCC/Clang idiom: subscript access to vector lanes */
    return (int)s;
}
{gcc-10, clang-9}
x -O{1,2,3}
x {\emptyset, -ftree-vectorize
} x -march={haswell,knl}
clang
tiene un mejor modelo de operaciones vectoriales que gcc.
-ftree-vectorize
de un código SSE con -march=haswell
y que genere AVX2.
1 for (unsigned int i=0; i<N; ++i) {
2 s += a[i];
3 }
--ffast-math
, no puede vectorizar, pero genera código vectorial ¯\_(ツ)_/¯
vaddss con muchos shuffles al medio.
--ffast-math hace peine.
-march= genera código SIMD de 128, 256 y 512 bits.
Con N=(1<<30)
en un Haswell E5-2620v3:
vhadd: 0.32
hadd: 0.17
add con autovect: 0.17
Resumen: vhadd es una porquería.
Scatter/Gathers internos.
unpack
shuffle
/permute
blend
Transpose an 8x8 float using AVX/AVX2, Stack Overflow, 2015.
/*
 * Transpose an 8x8 float matrix with AVX in three stages
 * (unpack -> shuffle -> 128-bit permute), after the classic
 * "Transpose an 8x8 float using AVX/AVX2" Stack Overflow answer (2015).
 * mat and matT must be 32-byte aligned and must not overlap.
 */
void tran(float* mat, float* matT) {
    __m256 row[8], mix[8];

    /* Load the eight source rows. */
    for (int i = 0; i < 8; ++i)
        row[i] = _mm256_load_ps(&mat[i * 8]);

    /* Stage 1: interleave adjacent row pairs within each 128-bit half. */
    for (int i = 0; i < 4; ++i) {
        mix[2 * i]     = _mm256_unpacklo_ps(row[2 * i], row[2 * i + 1]);
        mix[2 * i + 1] = _mm256_unpackhi_ps(row[2 * i], row[2 * i + 1]);
    }

    /* Stage 2: shuffle pairs (mix0,mix2), (mix1,mix3), (mix4,mix6),
     * (mix5,mix7) into 4-element column groups. */
    for (int i = 0; i < 4; ++i) {
        int a = (i < 2) ? i : i + 2;  /* selects mix 0,1,4,5 */
        row[2 * i]     = _mm256_shuffle_ps(mix[a], mix[a + 2],
                                           _MM_SHUFFLE(1, 0, 1, 0));
        row[2 * i + 1] = _mm256_shuffle_ps(mix[a], mix[a + 2],
                                           _MM_SHUFFLE(3, 2, 3, 2));
    }

    /* Stage 3: recombine 128-bit halves across the two register groups:
     * 0x20 takes both low halves, 0x31 takes both high halves. */
    for (int i = 0; i < 4; ++i) {
        mix[i]     = _mm256_permute2f128_ps(row[i], row[i + 4], 0x20);
        mix[i + 4] = _mm256_permute2f128_ps(row[i], row[i + 4], 0x31);
    }

    /* Store the eight transposed rows. */
    for (int i = 0; i < 8; ++i)
        _mm256_store_ps(&matT[i * 8], mix[i]);
}
Para L=2^14
, transponer
Trivial:
gcc-10 -O1 -march=haswell mtxtransp1.c && perf stat -d -r 4 ./a.out
3.82s
Blocking manual de 1 nivel: memoria + cache
gcc-10 -O1 -march=haswell mtxtransp1.c && perf stat -d -r 4 ./a.out
1.52s
Blocking manual de 2 niveles: memoria + cache + registros AVX
gcc-10 -O1 -march=haswell transpose_mem_cache_avx.c && perf stat -d -r 4 ./a.out
0.51s
Ejemplo muy claro de locality is performance.
Stan Melax, 3D Vector Normalization Using 256-Bit Intel® Advanced Vector Extensions (Intel® AVX), 2010.
gcc -O1 -ftree-vectorize
#pragma acc parallel loop
#pragma omp simd
Table of Contents | t |
---|---|
Exposé | ESC |
Full screen slides | e |
Presenter View | p |
Source Files | s |
Slide Numbers | n |
Toggle screen blanking | b |
Show/hide slide context | c |
Notes | 2 |
Help | h |