Nicolás Wolovick 20180417
Single Instruction Multiple Data
Nos vamos a concentrar en una versión particular.
SSSE3
c = a*b
#define N (1<<25)
float a[N], b[N], c[N];

/* Elementwise product c = a*b, the operation the slide announces.
 * NOTE(review): the original listing computed a[i] = b[i]*c[i], which
 * contradicts both the "c = a*b" headline and the hand-vectorized
 * version shown later (c[i] = a[i]*b[i]); fixed here. */
int main(void) {
    for (unsigned int i = 0; i < N; ++i) {
        c[i] = a[i] * b[i];
    }
    return 0;
}
Compilamos y ejecutamos.
1 $ gcc-8 -O1 multmap.c && perf stat -e cycles,instructions,cache-references,cache-misses -r 16 ./a.out
2
3 Performance counter stats for './a.out' (16 runs):
4
5 299,568,090 cycles ( +- 1.65% ) (49.31%)
6 205,716,053 instructions # 0.69 insn per cycle ( +- 0.12% ) (74.60%)
7 4,496,172 cache-references ( +- 0.35% ) (75.37%)
8 311,783 cache-misses # 6.934 % of all cache refs ( +- 4.00% ) (75.31%)
9
10 0.116081396 seconds time elapsed ( +- 1.62% )
Tiene un sencillo paralelismo de grano fino. ¡Que lo descubra el compilador!
1 $ gcc-8 -O1 -ftree-vectorize -fopt-info-vec multmap.c && perf stat -e cycles,instructions,cache-references,cache-misses -r 16 ./a.out
2 multmap.c:6:2: note: loop vectorized
3
4 Performance counter stats for './a.out' (16 runs):
5
6 333,899,596 cycles ( +- 0.52% ) (49.62%)
7 117,847,864 instructions # 0.35 insn per cycle ( +- 0.55% ) (74.90%)
8 3,859,603 cache-references ( +- 0.38% ) (75.08%)
9 470,081 cache-misses # 12.180 % of all cache refs ( +- 0.92% ) (75.30%)
10
11 0.127995673 seconds time elapsed ( +- 0.37% )
-fopt-info-vec
para ver si pudo vectorizar.
-fopt-info-vec-missed
para ver qué NO pudo vectorizar.
-O3
incluye vectorización, pero también muchas más cosas que dificultan la lectura del assembler.
-ftree-vectorizer-verbose=2
para gcc-4.8 y menores.
1 N SIMD cycles instr walltime
2 (1<<25) no 370M 292M 0.14s
3 (1<<25) SI 355M 142M 0.13s
4 (1<<26) no 740M 583M 0.28s
5 (1<<26) SI 695M 280M 0.26s
6 (1<<27) no 1463M 1171M 0.56s
7 (1<<27) SI 1383M 566M 0.52s
Código absolutamente memory-bound con intensidad aritmética de 1 FLOP/ 8 bytes.
Aun así, mejora un poquitín leer ancho.
Mostafa Hagog, Looking for 4x speedups? SSE™ to the rescue!, Intel, 2006.
gcc-8 -S -O1 multmap.c
1 .L2:
2 movss (%rcx,%rax), %xmm0
3 mulss (%rdx,%rax), %xmm0
4 movss %xmm0, (%rsi,%rax)
5 addq $4, %rax
6 cmpq $134217728, %rax
7 jne .L2
gcc-8 -S -O1 -ftree-vectorize multmap.c
1 .L2:
2 movaps (%rcx,%rax), %xmm0
3 mulps (%rdx,%rax), %xmm0
4 movaps %xmm0, (%rsi,%rax)
5 addq $16, %rax
6 cmpq $134217728, %rax
7 jne .L2
Notar:
%rax
va 4 veces más rápido (repite lazo 4x menos). ps
: packed single. xmm{0..15}
__m128
: 4 flotantes.__m128d
: 2 dobles.__m128i
: 16 bytes, 8 shorts, 4 ints, 2 longs. 1 #include <mmintrin.h> // MMX
2 #include <xmmintrin.h> // SSE
3 #include <emmintrin.h> // SSE2
4 #include <pmmintrin.h> // SSE3 (Prescott)
5 #include <tmmintrin.h> // SSSE3 (Tejas)
6 #include <smmintrin.h> // SSE4.1
7 #include <nmmintrin.h> // SSE4.2 (Nehalem)
8 #include <ammintrin.h> // SSE4A
9 #include <wmmintrin.h> // AES (Westmere)
10 #include <immintrin.h> // AVX, AVX2
11 #include <zmmintrin.h> // AVX512
ó, como se hace de manera moderna
1 #include <x86intrin.h> // el que sea necesario
Y si estás en ARM
1 #include <arm_neon.h>
Copiar un valor a las 4 componentes.
1 __m128 _mm_set1_ps (float a)
Establece las 4 componentes.
1 __m128 _mm_setr_ps (float e3, float e2, float e1, float e0)
#include <xmmintrin.h>
#include <stdio.h>

__m128 ta; /* static storage: zero-initialized, so ta starts as {0,0,0,0} */

/* Demonstrates that a __m128 can be subscripted like a float[4]
 * (GCC/Clang vector-extension indexing). */
int main(void) { /* was `void main` — non-standard; main must return int */
    __m128 tb = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    ta = _mm_add_ps(ta, tb);
    printf("%f %f %f %f\n", ta[0], ta[1], ta[2], ta[3]);
    return 0;
}
Ejecutamos
1 $ gcc-8 -O1 store_load.c && ./a.out
2 1.000000 2.000000 3.000000 4.000000
Notar: __m128 v
se puede acceder como si fuera float v[4]
.
1 __m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr) // movlps
2 void _mm_store_ps (float* mem_addr, __m128 a) // movaps
Más
1 __m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr) // movhps
2 __m128d _mm_loadh_pd (__m128d a, double const* mem_addr) // movhpd
3 void _mm_store_pd (double* mem_addr, __m128d a) // movapd
Este intrinsic se mapea a una instrucción movaps
.
must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Para solucionar esto __m128 _mm_loadu_ps (float const* mem_addr)
does not need to be aligned on any particular boundary.
_mm_load_ps
_mm_add_ps
En este caso tenemos dos: movss + shufps
.
1 __m128 _mm_movelh_ps (__m128 a, __m128 b) // movlhps
1 __m128 _mm_unpacklo_ps (__m128 a, __m128 b) // unpcklps
También está _mm_movehl_ps
(movhlps
).
1 __m128 _mm_shuffle_ps (__m128 a, __m128 b, unsigned int imm) // shufps
unpcklps
, unpckhps
con shufps
1 __m128 _mm_add_ps (__m128 a, __m128 b) // addps
2 __m128 _mm_sub_ps (__m128 a, __m128 b) // subps
3 __m128 _mm_mul_ps (__m128 a, __m128 b) // mulps
4 __m128 _mm_min_ps (__m128 a, __m128 b) // minps
5 __m128 _mm_max_ps (__m128 a, __m128 b) // maxps
Comparación add
vs. mul
1 __m128 _mm_div_ps (__m128 a, __m128 b) // divps
2 __m128 _mm_rcp_ps (__m128 a) // rcpps
3 __m128 _mm_sqrt_ps (__m128 a) // sqrtps
4 __m128 _mm_rsqrt_ps (__m128 a) // rsqrtps
¿Cuán caras son?
div
y rcp
en diferentes arquitecturasLa latencia y el throughput cambian con el tiempo, cuac!
1 __m128 _mm_cmpeq_ps (__m128 a, __m128 b) // cmpps
2 __m128 _mm_cmpneq_ps (__m128 a, __m128 b) // cmpps
3 __m128 _mm_cmplt_ps (__m128 a, __m128 b) // cmpps
4 __m128 _mm_cmpnlt_ps (__m128 a, __m128 b) // cmpps
1 FOR j := 0 to 3
2 i := j*32
3 dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xffffffff : 0
4 ENDFOR
1 __m128 _mm_and_ps (__m128 a, __m128 b) // andps
2 __m128 _mm_or_ps (__m128 a, __m128 b) // orps
3 __m128 _mm_xor_ps (__m128 a, __m128 b) // xorps
4 __m128 _mm_andnot_ps (__m128 a, __m128 b) // andnps
1 int _mm_cvtss_si32 (__m128 a) // cvtss2si
2 __m128 _mm_cvtsi32_ss (__m128 a, int b) // cvtsi2ss
3 __m64 _mm_cvtps_pi32 (__m128 a) // cvtps2pi
4 __m128 _mm_cvtpi32_ps (__m128 a, __m64 b) // cvtpi2ps
Y otras cosas sencillas como tomar el primer elemento y devolver un float
.
1 float _mm_cvtss_f32 (__m128 a) // movss
1 void _mm_prefetch (char const* p, int i) // prefetchnta, prefetcht0, prefetcht1, prefetcht2
1 Hint value Instruction Semantics
2 _MM_HINT_T0 prefetcht0 L1+L2+L3
3 _MM_HINT_T1 prefetcht1 L2+L3
4 _MM_HINT_T2 prefetcht2 L3
5 _MM_HINT_NTA prefetchnta Non temporal data
These non-temporal write operations do not read a cache line and then modify it; instead, the new content is directly written to memory.
Ver también _mm_stream_ps()
.
1 void _mm_lfence (void) // lfence
2 void _mm_sfence (void) // sfence
3 void _mm_mfence (void) // mfence
1 __m128 _mm_addsub_ps (__m128 a, __m128 b)
Input: { A0, A1, A2, A3 }, { B0, B1, B2, B3 }
Output: { A0 - B0, A1 + B1, A2 - B2, A3 + B3 }
1 __m128 _mm_hsub_ps (__m128 a, __m128 b)
Input: { A0, A1, A2, A3 }, { B0, B1, B2, B3 }
Output: { A0 - A1, A2 - A3, B0 - B1, B2 - B3 }
1 __m128 _mm_hadd_ps (__m128 a, __m128 b)
Input: { A0, A1, A2, A3 }, { B0, B1, B2, B3 }
Output: { A0 + A1, A2 + A3, B0 + B1, B2 + B3 }
¿Cómo sumo las 4 componentes de un __m128
?
1 _MM_TRANSPOSE4_PS (__m128 row0, __m128 row1, __m128 row2, __m128 row3)
Necesita 8 instrucciones y 4 registros.
_mm_shuffle_ps
1 _MM_SHUFFLE(z, y, x, w)
2 // expands to the following value (z<<6) | (y<<4) | (x<<2) | w
Broadcast el elemento 1 del vector, aka _mm_load_ps1
.
1 _mm_shuffle_ps(v, v, _MM_SHUFFLE(1,1,1,1))
1 __m128 _mm_blend_ps (__m128 a, __m128 b, const int imm) // blendps
if
sencillo.1 for (i=0; i<N; ++i) {
2 if (a[i]<b[i])
3 c[i]=a[i]*b[i];
4 else
5 c[i]=a[i];
6 }
blend
1 for (i=0; i<N; i+=4){
2 A = _mm_load_ps(&a[i]);
3 B = _mm_load_ps(&b[i]);
4 C = _mm_mul_ps (A, B);
5 mask = _mm_cmplt_ps (A, B);
6 C = _mm_blend_ps (C, A, mask);
7 _mm_store_ps (&c[i], C);
8 }
1 __m128 _mm_dp_ps (__m128 a, __m128 b, const int imm) // dpps
¿Cómo se hace un producto punto en SSE3?
.notes, lo último estaba en http://software.intel.com/en-us/articles/how-to-implement-a-horizontal-addsubtract-with-streaming-simd-extensions-3-instructions)
1 DP(a[127:0], b[127:0], imm8[7:0])
2 FOR j := 0 to 3
3 i := j*32
4 IF imm8[(4+j)%8]
5 temp[i+31:i] := a[i+31:i] * b[i+31:i]
6 ELSE
7 temp[i+31:i] := 0
8 FI
9 ENDFOR
10 sum[31:0] := (temp[127:96] + temp[95:64])
11 + (temp[63:32] + temp[31:0])
12 FOR j := 0 to 3
13 i := j*32
14 IF imm8[j%8]
15 tmpdst[i+31:i] := sum[31:0]
16 ELSE
17 tmpdst[i+31:i] := 0
18 FI
19 ENDFOR
20 RETURN tmpdst[127:0]
21 dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
1 __m128 _mm_insert_ps (__m128 a, __m128 b, const int imm) // insertps
2 int _mm_extract_ps (__m128 a, const int imm) // extractps
Escribe/lee elementos individualmente.
Anterior a SSE4.1, muchas instrucciones de shuffling.
_mm256_i32gather_epi32
1 FOR j := 0 to 7
2 i := j*32
3 dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
4 ENDFOR
5 dst[MAX:256] := 0
_mm256_i32scatter_ps
1 FOR j := 0 to 7
2 i := j*32
3 MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
4 ENDFOR
multmap
a manoEl código armado a mano con intrinsics de C
.
#include <xmmintrin.h>
#define N (1<<25)
float a[N], b[N], c[N];

/* Hand-vectorized c = a*b: processes 4 floats per iteration with SSE.
 * Uses the aligned load/store (movaps); assumes the global arrays are
 * 16-byte aligned — true in practice for large .bss objects, but movaps
 * faults on a misaligned address. */
int main(void) { /* was `void main` — non-standard; main must return int */
    for (unsigned int i = 0; i < N; i += 4) {
        __m128 ta = _mm_load_ps(&a[i]);
        __m128 tb = _mm_load_ps(&b[i]);
        __m128 tc = _mm_mul_ps(ta, tb);
        _mm_store_ps(&c[i], tc);
    }
    return 0;
}
Podríamos haber hecho un one-liner con los operadores prefijos en vez de infijos.
1 for (unsigned int i=0; i<N; i+=4)
2 _mm_store_ps(&c[i], _mm_mul_ps(_mm_load_ps(&a[i]), _mm_load_ps(&b[i])));
Pero quiero ser como René Lavand:
"no se puede hacer más lento"
1 $ gcc-8 -O1 multmap_quadload.c && perf stat -e cycles,instructions,cache-references,cache-misses -r 3 ./a.out
2
3 Performance counter stats for './a.out' (3 runs):
4
5 345,358,935 cycles ( +- 1.46% ) (49.10%)
6 140,062,939 instructions # 0.41 insn per cycle ( +- 0.34% ) (75.33%)
7 3,818,735 cache-references ( +- 2.37% ) (75.96%)
8 396,785 cache-misses # 10.390 % of all cache refs ( +- 1.96% ) (74.94%)
9
10 0.135106507 seconds time elapsed ( +- 2.02% )
Funciona igual que el vectorizador.
1 .L2:
2 movaps (%rcx,%rax), %xmm0
3 mulps (%rdx,%rax), %xmm0
4 movaps %xmm0, (%rsi,%rax)
5 addq $16, %rax
6 cmpq $134217728, %rax
7 jne .L2
¿Se puede auto-vectorizar un código con intrinsics a vectores más anchos?
NO :(
Table of Contents | t |
---|---|
Exposé | ESC |
Full screen slides | e |
Presenter View | p |
Source Files | s |
Slide Numbers | n |
Toggle screen blanking | b |
Show/hide slide context | c |
Notes | 2 |
Help | h |