Nicolás Wolovick, $Date: 2012-04-12 19:49:47 -0300 (Thu, 12 Apr 2012) $, $Revision: 3396 $
cs = map (uncurry (*)) (zip as bs)
1 int main(void) {
2 unsigned int i=0;
3 for (i=0; i<N; ++i) {
4 a[i] = b[i]*c[i];
5 }
6 return 0;
7 }
gcc -O2
1 Performance counter stats for './multmap' (4 runs):
2
3 100,830,044 instructions:u # 1.18 insns per cycle ( +- 0.00% )
4 85,465,952 cycles:u # 0.000 GHz ( +- 0.01% )
5 110.614890 cpu-clock:u ( +- 1.93% )
6
7 0.111786299 seconds time elapsed ( +- 2.30% )
gcc -O2 -S
1 .L2:
2 movss b(%rax), %xmm0
3 mulss c(%rax), %xmm0
4 movss %xmm0, a(%rax)
5 addq $4, %rax
6 cmpq $67108864, %rax
7 jne .L2
1 int main(void) {
2 unsigned int i=0;
3 for (i=0; i<N; ++i) {
4 a[i] = b[i]*c[i];
5 }
6 return 0;
7 }
gcc -O3
1 Performance counter stats for './multmap' (4 runs):
2
3 25,332,566 instructions:u # 0.45 insns per cycle ( +- 0.00% )
4 56,890,583 cycles:u # 0.000 GHz ( +- 0.03% )
5 95.927188 cpu-clock:u ( +- 0.58% )
6
7 0.096696426 seconds time elapsed ( +- 0.74% )
gcc -O3 -S
1 .L2:
2 movaps b(%rax), %xmm0
3 mulps c(%rax), %xmm0
4 movaps %xmm0, a(%rax)
5 addq $16, %rax
6 cmpq $67108864, %rax
7 jne .L2
f32 a la vez.
movaps, mulps es la clave.
addq $16, %rax
.These extensions were originally called subword parallelism or vector. Since Intel marketing used SIMD to describe the MMX extension of the 80x86 announced in 1996, that became the popular name, due in part to a successful television advertising campaign involving disco dancers wearing clothing modeled after the cleansuits worn in semiconductor fabrication lines.
Publicidad de 1997 para Pentium II con extensiones MMX.
(Hennessy, Patterson, CAAQA5, L-48)
(©2007, Intel)
(©2007, Intel)
Memory-to-register/register-to-memory/register-to-register data movement Scalar– MOVSS Packed – MOVAPS, MOVUPS, MOVLPS, MOVHPS, MOVLHPS, MOVHLPS
Arithmetic Scalar – ADDSS, SUBSS, MULSS, DIVSS, RCPSS, SQRTSS, MAXSS, MINSS, RSQRTSS Packed – ADDPS, SUBPS, MULPS, DIVPS, RCPPS, SQRTPS, MAXPS, MINPS, RSQRTPS
Compare Scalar – CMPSS, COMISS, UCOMISS Packed – CMPPS
Data shuffle and unpacking Packed – SHUFPS, UNPCKHPS, UNPCKLPS
Data-type conversion Scalar – CVTSI2SS, CVTSS2SI, CVTTSS2SI Packed – CVTPI2PS, CVTPS2PI, CVTTPS2PI
Bitwise logical operations Packed – ANDPS, ORPS, XORPS, ANDNPS
...
Cache and Memory management ... MOVNTQ, MOVNTPS, MASKMOVQ, PREFETCH0, PREFETCH1, PREFETCH2, PREFETCHNTA, SFENCE
MOV
<alineamiento>
<alcance>
<precisión>
MOVAPS
<operación>
<alcance>
<precisión>
RSQRTPS
ADDPD
solo a partir de SSE2.
Memory-to-register/register-to-memory/register-to-register data movement
Scalar– MOVSS
Packed – MOVAPS, MOVUPS, MOVLPS, MOVHPS, MOVLHPS, MOVHLPS
Diferencia entre scalar
y packed
.
Acá está el 4x publicitado por Intel.
No hay trascendentales! sin
, cos
, etc.
shufps
: Shuffle Parallel Scalars. shufps op1, op2, op3
op1
: 4x f32.
op2
: 4x f32.
op3
: contains an 8-bit map dd:cc:bb:aa
(MSB to LSB).
1 op1[0] = op1[aa]
2 op1[1] = op1[bb]
3 op1[2] = op2[cc]
4 op1[3] = op2[dd]
Broadcast
Swap
unpcklps
, unpckhps
Ejemplo. Meter 4 f32 desperdigados por la memoria.
Versiones de f32 -> int que redondean o truncan.
f64
, usando d como sufijo: addpd
.XMM
.haddps
(Horizontal-Add-Packed-Single)
Input: { A0, A1, A2, A3 }, { B0, B1, B2, B3 }
Output: { A0 + A1, A2 + A3, B0 + B1, B2 + B3 }
hsubps
...
addsubps
— (Add-Subtract-Packed-Single)
Input: { A0, A1, A2, A3 }, { B0, B1, B2, B3 }
Output: { A0 − B0, A1 + B1, A2 − B2, A3 + B3 }
movddup
, movshdup
, movsldup
Extensiones con un caso de uso típico y específico.
(©2007, Intel)
(hay un SSSE3 en Core 2 Merom, aka mi máquina, pero lo olvidemos)
Cosas increíblemente específicas: mpsadbw
y phminposuw
.
Motion Estimation with Intel® Streaming SIMD Extensions 4 (Intel® SSE4)
Otras no tanto ...
dpps
.blendvps
.insertps
, extractps
.popcnt
.xmm
!(mi máquina no es SSE4, grep sse4 /proc/cpuinfo
)
dpps
Milagroooo! El superarchiusado producto punto.
(©2007, Intel)
blend
Selección condicional de componentes.
Aparea perfecto con las comparaciones packed. Para compilar flujos condicionales SIMD.
1 for (i=0; i<N; i++)
2 if (a[i]<b[i]) c[i]=a[i]*b[i];
3 else c[i]=a[i];
(©2007, Intel)
insert
, extract
shufps
.scatter
y gather
de MPI.f32
de xmm0
en los 4 índices que indica xmm1
.ymm
.c = a+b
.ymm
en cambios de contexto! Forma sencilla de programar en SSE sin tener que usar inline assembler.
_mm_sqrt_ss(_mm_dp_ps(v, v, 0xF1))
._mm_mul_ps
es mulps
._mm_load1_ps
es movss
+shufps
._MM_TRANSPOSE4_PS()
: traspone una matriz 4x4. _MM_SHUFFLE()
: arma el paquetito de 8 bits para el shufps
.__m128
: 4-way f32
.__m128d
: 2-way f64
.__m128i
: 16-way i8
, 8-way i16
, 4-way i32
, 2-way i64
. Todos están alineados a 16 bytes.
.
struct point p[N] __attribute__((aligned(16)));
_mm_malloc()
y _mm_free()
.xmmintrin.h
.emmintrin.h
.pmmintrin.h
.tmmintrin.h
.smmintrin.h
y nmmintrin.h
.immintrin.h
._mm_<op>_<suffix>
1 #define N (1<<25)
2
3 struct point {
4 float x; float y; float z; float w;
5 };
6
7 struct point p[N];
8
9 int main(void) {
10 float result = 0.0f;
11 unsigned int i = 0;
12 for(i=0; i<N; ++i) {
13 float dst = 0.0;
14 dst = sqrtf((p[i].x * p[i].x) + (p[i].y * p[i].y) + (p[i].z * p[i].z));
15 if (dst>result)
16 result = dst;
17 }
18 return (int)result;
19 }
clang -O3 -S
1 pxor %xmm0, %xmm0
2 xorl %eax, %eax
3 .align 16, 0x90
4 .LBB0_1: # =>This Inner Loop Header: Depth=1
5 movss p+4(%rax), %xmm2
6 mulss %xmm2, %xmm2
7 movss p(%rax), %xmm1
8 mulss %xmm1, %xmm1
9 addss %xmm2, %xmm1
10 movss p+8(%rax), %xmm2
11 mulss %xmm2, %xmm2
12 addss %xmm1, %xmm2
13 sqrtss %xmm2, %xmm1
14 addq $16, %rax
15 cmpl $536870912, %eax # imm = 0x20000000
16 maxss %xmm0, %xmm1
17 movaps %xmm1, %xmm0
18 jne .LBB0_1
19 # BB#2:
20 cvttss2si %xmm1, %eax
21 ret
1 #include <math.h>
2 #include <smmintrin.h>
3 #include <nmmintrin.h>
4
5 #define N (1<<25)
6
7 struct point {
8 float x; float y; float z; float w;
9 };
10
11 struct point p[N] __attribute__((aligned(16)));
12
13 int main(void) {
14 float result = 0.0f;
15 unsigned int i = 0;
16 for(i=0; i<N; ++i) {
17 __m128 v;
18 float dst = 0.0;
19 v = _mm_load_ps((const float *)&p[i]);
20 dst = _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(v, v, 0xF1)));
21 if (dst>result)
22 result = dst;
23 }
24 return (int)result;
25 }
clang -O3 -S -msse4
1 # BB#0:
2 pxor %xmm0, %xmm0
3 xorl %eax, %eax
4 .align 16, 0x90
5 .LBB0_1: # =>This Inner Loop Header: Depth=1
6 movdqa p(%rax), %xmm1
7 dpps $241, %xmm1, %xmm1
8 sqrtss %xmm1, %xmm1
9 addq $16, %rax
10 cmpl $536870912, %eax # imm = 0x20000000
11 maxss %xmm0, %xmm1
12 movaps %xmm1, %xmm0
13 jne .LBB0_1
14 # BB#2:
15 cvttss2si %xmm1, %eax
16 ret
-msse4
.movdqa
en vez de movaps
. Ver stackoverflow. float
a int
. 1 Performance counter stats for './maxdist1' (3 runs):
2
3 470,010,759 instructions:u # 1.36 insns per cycle ( +- 0.00% )
4 346,061,128 cycles:u # 0.000 GHz ( +- 0.01% )
5 202.498834 cpu-clock:u ( +- 0.60% )
6
7 0.203088584 seconds time elapsed ( +- 0.65% )
8
9 ./maxdist2: Illegal instruction
10 ./maxdist2: Illegal instruction
11 ./maxdist2: Illegal instruction
12
13 Performance counter stats for './maxdist2' (3 runs):
14
15 116,921 instructions:u # 0.41 insns per cycle ( +- 0.00% )
16 283,259 cycles:u # 0.000 GHz ( +- 5.26% )
17 0.752611 cpu-clock:u ( +- 3.03% )
18
19 1.050753614 seconds time elapsed ( +- 88.53% )
Ahora sobre un Nehalem Core i7 950.
1 Performance counter stats for './maxdist1' (3 runs):
2
3 470,022,798 instructions:u # 1.21 insns per cycle ( +- 0.00% )
4 389,822,430 cycles:u # 0.000 GHz ( +- 0.64% )
5 153.160468 cpu-clock:u ( +- 0.71% )
6
7 0.153685433 seconds time elapsed ( +- 0.63% )
8
9 Performance counter stats for './maxdist2' (3 runs):
10
11 268,696,203 instructions:u # 0.78 insns per cycle ( +- 0.00% )
12 344,465,420 cycles:u # 0.000 GHz ( +- 0.64% )
13 137.200226 cpu-clock:u ( +- 0.77% )
14
15 0.137595709 seconds time elapsed ( +- 0.81% )
icc -fast
me sigue ganando (308,079,974 cycles:u, 126.149909 cpu-clock). dpps
. Es ilegible.rsqrtps
, 1/√x: +rápida, -precisa. Aun le gano a -O3
.Basado en ideas de:
1 int main(void) {
2 unsigned int i=0, j=0, k=0;
3 for(i=0; i<L; ++i) {
4 float vin[4] = {p[i].x, p[i].y, p[i].z, p[i].w};
5 float vout[4] = {0.0f, 0.0f, 0.0f, 0.0f};
6 for (j=0; j<4; ++j)
7 for (k=0; k<4; ++k) {
8 vout[j] += vin[k]*mtx[k][j];
9 }
10 p[i].x = vout[0]; p[i].y = vout[1]; p[i].z = vout[2]; p[i].w = vout[3];
11 }
12 return (int)p[(int)p[0].x].y;
13 }
1 int main(void) {
2 unsigned int i=0, j=0;
3 __m128 row[4];
4 for (j=0; j<4; ++j)
5 row[j] = _mm_load_ps((const float *)&mtx[j][0]);
6 _MM_TRANSPOSE4_PS(row[0], row[1], row[2], row[3]);
7 for(i=0; i<L; ++i) {
8 __m128 vin = _mm_load_ps((const float *)&p[i]);
9 __m128 xxxx = _mm_shuffle_ps(vin, vin, _MM_SHUFFLE(0, 0, 0, 0));
10 __m128 yyyy = _mm_shuffle_ps(vin, vin, _MM_SHUFFLE(1, 1, 1, 1));
11 __m128 zzzz = _mm_shuffle_ps(vin, vin, _MM_SHUFFLE(2, 2, 2, 2));
12 __m128 wwww = _mm_shuffle_ps(vin, vin, _MM_SHUFFLE(3, 3, 3, 3));
13 __m128 c0 = _mm_mul_ps(row[0], xxxx);
14 __m128 c1 = _mm_mul_ps(row[1], yyyy);
15 __m128 c2 = _mm_mul_ps(row[2], zzzz);
16 __m128 c3 = _mm_mul_ps(row[3], wwww);
17 c0 = _mm_add_ps(c0,c1);
18 c0 = _mm_add_ps(c0,c2);
19 c0 = _mm_add_ps(c0,c3);
20 _mm_store_ps((float *)&p[i], c0);
21 }
22 return (int)p[(int)p[0].x].y;
23 }
Idea, operar horizontalmente: Optimizing for SSE: A Case Study.
1 x' = a1 x + a2 y + a3 z + a4 w
2 y' = b1 x + b2 y + b3 z + b4 w
3 z' = c1 x + c2 y + c3 z + c4 w
4 w' = d1 x + d2 y + d3 z + d4 w
clang -O3 -S
1 .LBB0_1: # =>This Inner Loop Header: Depth=1
2 movdqa p(%rax), %xmm4
3 pshufd $85, %xmm4, %xmm5 # xmm5 = xmm4[1,1,1,1]
4 mulps %xmm2, %xmm5
5 pshufd $0, %xmm4, %xmm6 # xmm6 = xmm4[0,0,0,0]
6 mulps %xmm1, %xmm6
7 addps %xmm5, %xmm6
8 pshufd $-86, %xmm4, %xmm5 # xmm5 = xmm4[2,2,2,2]
9 mulps %xmm0, %xmm5
10 addps %xmm6, %xmm5
11 pshufd $-1, %xmm4, %xmm4 # xmm4 = xmm4[3,3,3,3]
12 mulps %xmm3, %xmm4
13 addps %xmm5, %xmm4
14 movaps %xmm4, p(%rax)
15 addq $16, %rax
16 cmpl $1073741824, %eax # imm = 0x40000000
17 jne .LBB0_1
shufps
usa la versión entera pshufd
.movdqa
,
el almacenamiento con la operación de flotantes movaps
.clang
pone comentarios útiles! 1 int main(void) {
2 unsigned int i=0, j=0;
3 __m128 row[4];
4 for (j=0; j<4; ++j)
5 row[j] = _mm_load_ps((const float *)&mtx[j][0]);
6 _MM_TRANSPOSE4_PS(row[0], row[1], row[2], row[3]);
7 for(i=0; i<L; ++i) {
8 __m128 vin = _mm_load_ps((const float *)&p[i]);
9 __m128 m0 = _mm_mul_ps(row[0], vin);
10 __m128 m1 = _mm_mul_ps(row[1], vin);
11 __m128 m2 = _mm_mul_ps(row[2], vin);
12 __m128 m3 = _mm_mul_ps(row[3], vin);
13 m0 = _mm_hadd_ps(m0,m1);
14 m2 = _mm_hadd_ps(m2,m3);
15 m0 = _mm_hadd_ps(m0,m2);
16 _mm_store_ps((float *)&p[i], m0);
17 }
18 return (int)p[(int)p[0].x].y;
19 }
affine2.c
para que pueda subir un poco más un haddps
.clang -O3 -S
1 .LBB0_1: # =>This Inner Loop Header: Depth=1
2 movaps p(%rax), %xmm4
3 movaps %xmm3, %xmm6
4 mulps %xmm4, %xmm6
5 movaps %xmm0, %xmm5
6 mulps %xmm4, %xmm5
7 haddps %xmm6, %xmm5
8 movaps %xmm2, %xmm6
9 mulps %xmm4, %xmm6
10 mulps %xmm1, %xmm4
11 haddps %xmm6, %xmm4
12 haddps %xmm5, %xmm4
13 movapd %xmm4, p(%rax)
14 addq $16, %rax
15 cmpl $1073741824, %eax # imm = 0x40000000
16 jne .LBB0_1
1 int main(void) {
2 unsigned int i=0, j=0;
3 __m128 row[4];
4 for (j=0; j<4; ++j)
5 row[j] = _mm_load_ps((const float *)&mtx[j][0]);
6 _MM_TRANSPOSE4_PS(row[0], row[1], row[2], row[3]);
7 for(i=0; i<L; ++i) {
8 __m128 vin = _mm_load_ps((const float *)&p[i]);
9 __m128 x = _mm_dp_ps(vin, row[0], 0xF1);
10 __m128 y = _mm_dp_ps(vin, row[1], 0xF1);
11 __m128 z = _mm_dp_ps(vin, row[2], 0xF1);
12 __m128 w = _mm_dp_ps(vin, row[3], 0xF1);
13 __m128 yyxx = _mm_shuffle_ps(x, y, _MM_SHUFFLE(0, 0, 0, 0));
14 __m128 wwzz = _mm_shuffle_ps(z, w, _MM_SHUFFLE(0, 0, 0, 0));
15 __m128 wzyx = _mm_shuffle_ps(yyxx, wwzz, _MM_SHUFFLE(0, 2, 0, 2));
16 _mm_store_ps((float *)&p[i], wzyx);
17 }
18 return (int)p[(int)p[0].x].y;
19 }
clang -O3 -S
1 .LBB0_1: # =>This Inner Loop Header: Depth=1
2 movdqa p(%rax), %xmm4
3 movdqa %xmm4, %xmm6
4 dpps $241, %xmm3, %xmm6
5 movdqa %xmm4, %xmm5
6 dpps $241, %xmm0, %xmm5
7 movlhps %xmm6, %xmm5 # xmm5 = xmm5[0],xmm6[0]
8 movdqa %xmm4, %xmm6
9 dpps $241, %xmm2, %xmm6
10 dpps $241, %xmm1, %xmm4
11 movlhps %xmm6, %xmm4 # xmm4 = xmm4[0],xmm6[0]
12 shufps $34, %xmm5, %xmm4 # xmm4 = xmm4[2,0],xmm5[2,0]
13 movaps %xmm4, p(%rax)
14 addq $16, %rax
15 cmpl $1073741824, %eax # imm = 0x40000000
16 jne .LBB0_1
movlhps
para los dos primeros shuffles.gcc -O3 -msse4
1 Performance counter stats for './affine1' (32 runs):
2 4362695905 instructions:u # 2.24 insns per cycle ( +- 0.00% )
3 1950681803 cycles:u # 0.000 GHz ( +- 0.12% )
4 1.219008493 seconds time elapsed ( +- 0.06% )
5
6 Performance counter stats for './affine2' (32 runs):
7 1074361464 instructions:u # 1.28 insns per cycle ( +- 0.00% )
8 841857473 cycles:u # 0.000 GHz ( +- 0.03% )
9 0.850169705 seconds time elapsed ( +- 0.03% )
10
11 Performance counter stats for './affine3' (32 runs):
12 1409774734 instructions:u # 1.33 insns per cycle ( +- 0.00% )
13 1056038805 cycles:u # 0.000 GHz ( +- 0.01% )
14 0.783233684 seconds time elapsed ( +- 0.01% )
15
16 Performance counter stats for './affine4' (32 runs):
17 1007252615 instructions:u # 0.96 insns per cycle ( +- 0.00% )
18 1046711587 cycles:u # 0.000 GHz ( +- 0.00% )
19 0.917531781 seconds time elapsed ( +- 0.01% )
20
21 Performance counter stats for './affine5' (32 runs):
22 1007252632 instructions:u # 0.85 insns per cycle ( +- 0.00% )
23 1184640540 cycles:u # 0.000 GHz ( +- 0.01% )
24 0.963644925 seconds time elapsed ( +- 0.01% )
affine1
: versión CPU, affine2
: versión SSE, affine3
: versión SSE con (x,y,z)
y lecturas desalineadas,
affine4
versión SSE3, affine5
versión SSE4.
Core 2 Duo E8400@3.0GHz (Penryn), gcc-4.6.3
.
clang -O3 -msse4
1 Performance counter stats for './affine1' (32 runs):
2 4362695901 instructions:u # 2.23 insns per cycle ( +- 0.00% )
3 1954863628 cycles:u # 0.000 GHz ( +- 0.14% )
4 1.219930304 seconds time elapsed ( +- 0.07% )
5
6 Performance counter stats for './affine2' (32 runs):
7 1074361463 instructions:u # 1.28 insns per cycle ( +- 0.00% )
8 841577852 cycles:u # 0.000 GHz ( +- 0.03% )
9 0.849091893 seconds time elapsed ( +- 0.02% )
10
11 Performance counter stats for './affine3' (32 runs):
12 1409774737 instructions:u # 1.33 insns per cycle ( +- 0.00% )
13 1056083672 cycles:u # 0.000 GHz ( +- 0.01% )
14 0.782958326 seconds time elapsed ( +- 0.01% )
15
16 Performance counter stats for './affine4' (32 runs):
17 1007252618 instructions:u # 0.96 insns per cycle ( +- 0.00% )
18 1046782419 cycles:u # 0.000 GHz ( +- 0.00% )
19 0.917322801 seconds time elapsed ( +- 0.01% )
20
21 Performance counter stats for './affine5' (32 runs):
22 1007252629 instructions:u # 0.85 insns per cycle ( +- 0.00% )
23 1184423044 cycles:u # 0.000 GHz ( +- 0.01% )
24 0.963353887 seconds time elapsed ( +- 0.01% )
affine1
: versión CPU, affine2
: versión SSE, affine3
: versión SSE con (x,y,z)
y lecturas desalineadas,
affine4
versión SSE3, affine5
versión SSE4.
Core 2 Duo E8400@3.0GHz (Penryn), clang-3.0
.
icc -fast -xSSE4.1
1 Performance counter stats for './affine1' (32 runs):
2 3490212439 instructions:u # 1.46 insns per cycle ( +- 0.00% )
3 2396883102 cycles:u # 0.000 GHz ( +- 0.30% )
4 1.365796813 seconds time elapsed ( +- 0.18% )
5
6 Performance counter stats for './affine2' (32 runs):
7 1409837530 instructions:u # 1.40 insns per cycle ( +- 0.00% )
8 1008044002 cycles:u # 0.000 GHz ( +- 0.01% )
9 0.903477817 seconds time elapsed ( +- 0.01% )
10
11 Performance counter stats for './affine3' (32 runs):
12 1946577417 instructions:u # 1.39 insns per cycle ( +- 0.00% )
13 1402357115 cycles:u # 0.000 GHz ( +- 0.03% )
14 0.897205653 seconds time elapsed ( +- 0.02% )
15
16 Performance counter stats for './affine4' (32 runs):
17 1141402088 instructions:u # 1.04 insns per cycle ( +- 0.00% )
18 1094408805 cycles:u # 0.000 GHz ( +- 0.01% )
19 0.931928832 seconds time elapsed ( +- 0.01% )
20
21 Performance counter stats for './affine5' (32 runs):
22 1141402106 instructions:u # 0.86 insns per cycle ( +- 0.00% )
23 1325499866 cycles:u # 0.000 GHz ( +- 0.00% )
24 1.008774232 seconds time elapsed ( +- 0.01% )
affine1
: versión CPU, affine2
: versión SSE, affine3
: versión SSE con (x,y,z)
y lecturas desalineadas,
affine4
versión SSE3, affine5
versión SSE4.
Core 2 Duo E8400@3.0GHz (Penryn), icc-12.1.3
.
gcc -O3
vectoriza automáticamente affine1.c
y clang -O3
no.dpps
conviene en realidad? Parece que no, o al menos en Penryn no es tan bueno. icc
es el más lento.Table of Contents | t |
---|---|
Exposé | ESC |
Full screen slides | e |
Presenter View | p |
Source Files | s |
Slide Numbers | n |
Toggle screen blanking | b |
Show/hide slide context | c |
Notes | 2 |
Help | h |