SSE/AVX matrix multiply @rygorous https://gist.github.com/rygorous/4172889 ### GCC 7 [nwolovick@eulogia SIMD]$ for i in {0..3}; do echo "-O$i"; gcc -O$i -march=knl gistfile1.cpp; ./a.out ; done -O0 all ok. ref: 1079.34 cycles SSE: 448.48 cycles AVX_4mem: 464.23 cycles AVX_8: 317.98 cycles -O1 all ok. ref: 154.28 cycles SSE: 45.19 cycles AVX_4mem: 83.21 cycles AVX_8: 71.94 cycles -O2 all ok. ref: 132.61 cycles SSE: 36.71 cycles AVX_4mem: 71.07 cycles AVX_8: 64.14 cycles -O3 all ok. ref: 11.28 cycles SSE: 38.62 cycles AVX_4mem: 72.81 cycles AVX_8: 62.41 cycles ### GCC 5.4 nicolasw@zx81:~/SIMD$ for i in {0..3}; do echo "-O$i"; gcc -O$i -march=haswell gistfile1.cpp; ./a.out ; done -O0 all ok. ref: 239.35 cycles SSE: 151.48 cycles AVX_4mem: 153.38 cycles AVX_8: 105.78 cycles -O1 all ok. ref: 60.46 cycles SSE: 19.60 cycles AVX_4mem: 18.24 cycles AVX_8: 11.47 cycles -O2 all ok. ref: 53.20 cycles SSE: 16.61 cycles AVX_4mem: 15.81 cycles AVX_8: 10.23 cycles -O3 all ok. ref: 44.26 cycles SSE: 14.60 cycles AVX_4mem: 15.77 cycles AVX_8: 9.73 cycles ### GCC 7 nicolasw@zx81:~/SIMD$ for i in {0..3}; do echo "-O$i"; gcc-7 -O$i -march=haswell gistfile1.cpp; ./a.out ; done -O0 all ok. ref: 240.48 cycles SSE: 150.43 cycles AVX_4mem: 153.92 cycles AVX_8: 110.06 cycles -O1 all ok. ref: 61.81 cycles SSE: 19.60 cycles AVX_4mem: 18.24 cycles AVX_8: 12.30 cycles -O2 all ok. ref: 51.76 cycles SSE: 16.86 cycles AVX_4mem: 15.77 cycles AVX_8: 11.28 cycles -O3 all ok. ref: 10.45 cycles SSE: 14.87 cycles AVX_4mem: 15.57 cycles AVX_8: 9.33 cycles ### GCC 5.4 [nwolovick@mendieta23 SIMD]$ ./0; ./1; ./2; ./3 all ok. ref: 293.09 cycles SSE: 175.70 cycles AVX_4mem: 179.26 cycles AVX_8: 165.44 cycles all ok. ref: 73.46 cycles SSE: 25.81 cycles AVX_4mem: 22.12 cycles AVX_8: 17.25 cycles all ok. ref: 72.72 cycles SSE: 24.57 cycles AVX_4mem: 23.18 cycles AVX_8: 16.67 cycles all ok. ref: 68.44 cycles SSE: 24.35 cycles AVX_4mem: 20.28 cycles AVX_8: 17.29 cycles