Ways to SIMD

In [122]:
!rm -Rf tmp
!mkdir -p tmp

Inline Assembly

In [5]:
%%writefile tmp/inline-assembly.c

#include <x86intrin.h>
#include <stdio.h>

int main()
{
    float a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float b[8] = {7, 6, 5, 4, 3, 2, 1, 0};
    float result[8];

    __m256 avec = _mm256_loadu_ps(a);
    __m256 bvec = _mm256_loadu_ps(b);
    __m256 resultvec;
    
    asm ("vaddps %1, %2, %0"
        : "=x" (resultvec)
        : "x" (avec), "x" (bvec) /* "r" for normal registers, "x"
        : /* clobbers */
        );
    
    _mm256_storeu_ps(result, resultvec);
    
    for (int i = 0; i < 8; ++i)
        printf("%f ", result[i]);
    printf("\n");
}
Overwriting tmp/inline-assembly.c
In [6]:
!cd tmp; gcc -march=sandybridge -c -O inline-assembly.c
!objdump --disassemble tmp/inline-assembly.o
tmp/inline-assembly.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <main>:
   0:	4c 8d 54 24 08       	lea    0x8(%rsp),%r10
   5:	48 83 e4 e0          	and    $0xffffffffffffffe0,%rsp
   9:	41 ff 72 f8          	pushq  -0x8(%r10)
   d:	55                   	push   %rbp
   e:	48 89 e5             	mov    %rsp,%rbp
  11:	41 55                	push   %r13
  13:	41 54                	push   %r12
  15:	41 52                	push   %r10
  17:	53                   	push   %rbx
  18:	48 83 ec 70          	sub    $0x70,%rsp
  1c:	c7 45 b0 00 00 00 00 	movl   $0x0,-0x50(%rbp)
  23:	c5 fa 10 05 00 00 00 	vmovss 0x0(%rip),%xmm0        # 2b <main+0x2b>
  2a:	00 
  2b:	c5 fa 11 45 b4       	vmovss %xmm0,-0x4c(%rbp)
  30:	c5 fa 10 0d 00 00 00 	vmovss 0x0(%rip),%xmm1        # 38 <main+0x38>
  37:	00 
  38:	c5 fa 11 4d b8       	vmovss %xmm1,-0x48(%rbp)
  3d:	c5 fa 10 15 00 00 00 	vmovss 0x0(%rip),%xmm2        # 45 <main+0x45>
  44:	00 
  45:	c5 fa 11 55 bc       	vmovss %xmm2,-0x44(%rbp)
  4a:	c5 fa 10 1d 00 00 00 	vmovss 0x0(%rip),%xmm3        # 52 <main+0x52>
  51:	00 
  52:	c5 fa 11 5d c0       	vmovss %xmm3,-0x40(%rbp)
  57:	c5 fa 10 25 00 00 00 	vmovss 0x0(%rip),%xmm4        # 5f <main+0x5f>
  5e:	00 
  5f:	c5 fa 11 65 c4       	vmovss %xmm4,-0x3c(%rbp)
  64:	c5 fa 10 2d 00 00 00 	vmovss 0x0(%rip),%xmm5        # 6c <main+0x6c>
  6b:	00 
  6c:	c5 fa 11 6d c8       	vmovss %xmm5,-0x38(%rbp)
  71:	c5 fa 10 35 00 00 00 	vmovss 0x0(%rip),%xmm6        # 79 <main+0x79>
  78:	00 
  79:	c5 fa 11 75 cc       	vmovss %xmm6,-0x34(%rbp)
  7e:	c5 fa 11 75 90       	vmovss %xmm6,-0x70(%rbp)
  83:	c5 fa 11 6d 94       	vmovss %xmm5,-0x6c(%rbp)
  88:	c5 fa 11 65 98       	vmovss %xmm4,-0x68(%rbp)
  8d:	c5 fa 11 5d 9c       	vmovss %xmm3,-0x64(%rbp)
  92:	c5 fa 11 55 a0       	vmovss %xmm2,-0x60(%rbp)
  97:	c5 fa 11 4d a4       	vmovss %xmm1,-0x5c(%rbp)
  9c:	c5 fa 11 45 a8       	vmovss %xmm0,-0x58(%rbp)
  a1:	c7 45 ac 00 00 00 00 	movl   $0x0,-0x54(%rbp)
  a8:	c5 fc 28 7d b0       	vmovaps -0x50(%rbp),%ymm7
  ad:	c5 fc 28 4d 90       	vmovaps -0x70(%rbp),%ymm1
  b2:	c5 f4 58 c7          	vaddps %ymm7,%ymm1,%ymm0
  b6:	c5 fc 29 85 70 ff ff 	vmovaps %ymm0,-0x90(%rbp)
  bd:	ff 
  be:	48 8d 9d 70 ff ff ff 	lea    -0x90(%rbp),%rbx
  c5:	4c 8d 6b 20          	lea    0x20(%rbx),%r13
  c9:	4c 8d 25 00 00 00 00 	lea    0x0(%rip),%r12        # d0 <main+0xd0>
  d0:	c5 f9 57 c0          	vxorpd %xmm0,%xmm0,%xmm0
  d4:	c5 fa 5a 03          	vcvtss2sd (%rbx),%xmm0,%xmm0
  d8:	4c 89 e7             	mov    %r12,%rdi
  db:	b8 01 00 00 00       	mov    $0x1,%eax
  e0:	e8 00 00 00 00       	callq  e5 <main+0xe5>
  e5:	48 83 c3 04          	add    $0x4,%rbx
  e9:	4c 39 eb             	cmp    %r13,%rbx
  ec:	75 e2                	jne    d0 <main+0xd0>
  ee:	bf 0a 00 00 00       	mov    $0xa,%edi
  f3:	e8 00 00 00 00       	callq  f8 <main+0xf8>
  f8:	b8 00 00 00 00       	mov    $0x0,%eax
  fd:	48 83 c4 70          	add    $0x70,%rsp
 101:	5b                   	pop    %rbx
 102:	41 5a                	pop    %r10
 104:	41 5c                	pop    %r12
 106:	41 5d                	pop    %r13
 108:	5d                   	pop    %rbp
 109:	49 8d 62 f8          	lea    -0x8(%r10),%rsp
 10d:	c3                   	retq   

Intrinsics

In [125]:
%%writefile tmp/intrinsics.c

#include <x86intrin.h>
#include <stdio.h>

int main()
{
    float a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float b[8] = {7, 6, 5, 4, 3, 2, 1, 0};
    float result[8];

    __m256 avec = _mm256_loadu_ps(a);
    __m256 bvec = _mm256_loadu_ps(b);
    __m256 resultvec = _mm256_add_ps(avec, bvec);    
    _mm256_storeu_ps(result, resultvec);
    
    for (int i = 0; i < 8; ++i)
        printf("%f ", result[i]);
    printf("\n");
}
Writing tmp/intrinsics.c
In [126]:
!cd tmp; gcc -march=sandybridge -O intrinsics.c -ointrinsics
!./tmp/intrinsics
7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 

Vector types

In [129]:
%%writefile tmp/vector-types.c

#include <stdio.h>

int main()
{
    typedef float v8f __attribute__ ((vector_size (256/8)));
    v8f a = {0, 1, 2, 3, 4, 5, 6, 7};
    v8f b = {7, 6, 5, 4, 3, 2, 1, 0};

    v8f result = a+b+5;
    
    for (int i = 0; i < 8; ++i)
        printf("%f ", result[i]);
    printf("\n");
}
Overwriting tmp/vector-types.c
In [130]:
!cd tmp; gcc -march=sandybridge -O vector-types.c -ovector-types
!./tmp/vector-types
12.000000 12.000000 12.000000 12.000000 12.000000 12.000000 12.000000 12.000000 
In [131]:
%%writefile tmp/vector-types.c

#include <stdio.h>

typedef float v8f __attribute__ ((vector_size (256/8)));

v8f add_vecs(v8f a, v8f b)
{
    return a+b+5;
}
Overwriting tmp/vector-types.c
In [132]:
!cd tmp; gcc -march=sandybridge -O -c vector-types.c
! objdump --disassemble tmp/vector-types.o
tmp/vector-types.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <add_vecs>:
   0:	c5 fc 58 c1          	vaddps %ymm1,%ymm0,%ymm0
   4:	c5 fc 58 05 00 00 00 	vaddps 0x0(%rip),%ymm0,%ymm0        # c <add_vecs+0xc>
   b:	00 
   c:	c3                   	retq   
In [133]:
%%writefile tmp/vector-types.c

#include <stdio.h>

typedef float v8f __attribute__ ((vector_size (256/8)));
typedef int v8i __attribute__ ((vector_size (256/8)));

v8f reverse_vec(v8f a)
{
    //const v8i idx = {7, 6, 5, 4, 3, 2, 1, 0};
    const v8i idx = {3, 2, 1, 0, 7, 6, 5, 4};
    return __builtin_shuffle(a, idx);
}
Overwriting tmp/vector-types.c
In [134]:
!cd tmp; gcc -march=sandybridge -O -c vector-types.c
! objdump --disassemble tmp/vector-types.o
tmp/vector-types.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <reverse_vec>:
   0:	c4 e3 7d 04 c0 1b    	vpermilps $0x1b,%ymm0,%ymm0
   6:	c3                   	retq   
In [135]:
%%writefile tmp/vector-types.c

#include <stdio.h>

typedef float v8f __attribute__ ((vector_size (256/8)));
typedef int v8i __attribute__ ((vector_size (256/8)));

v8f interleave(v8f a, v8f b)
{
    const v8i idx = {16, 15, 14, 13, 0, 1, 2, 3};
    return __builtin_shuffle(a, b, idx);
}
Overwriting tmp/vector-types.c
In [136]:
!cd tmp; gcc -march=sandybridge -O -c vector-types.c
! objdump --disassemble tmp/vector-types.o
tmp/vector-types.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <interleave>:
   0:	c5 f8 28 d8          	vmovaps %xmm0,%xmm3
   4:	c5 f8 28 d0          	vmovaps %xmm0,%xmm2
   8:	c4 e3 7d 19 c9 01    	vextractf128 $0x1,%ymm1,%xmm1
   e:	c5 f0 c6 f1 ff       	vshufps $0xff,%xmm1,%xmm1,%xmm6
  13:	c5 f0 15 c1          	vunpckhps %xmm1,%xmm1,%xmm0
  17:	c5 f0 c6 c9 55       	vshufps $0x55,%xmm1,%xmm1,%xmm1
  1c:	c5 e0 c6 e3 55       	vshufps $0x55,%xmm3,%xmm3,%xmm4
  21:	c5 e0 15 eb          	vunpckhps %xmm3,%xmm3,%xmm5
  25:	c5 e0 c6 db ff       	vshufps $0xff,%xmm3,%xmm3,%xmm3
  2a:	c5 d0 14 eb          	vunpcklps %xmm3,%xmm5,%xmm5
  2e:	c5 e8 14 dc          	vunpcklps %xmm4,%xmm2,%xmm3
  32:	c5 f8 14 c9          	vunpcklps %xmm1,%xmm0,%xmm1
  36:	c5 e8 14 c6          	vunpcklps %xmm6,%xmm2,%xmm0
  3a:	c5 f8 16 c1          	vmovlhps %xmm1,%xmm0,%xmm0
  3e:	c5 e0 16 dd          	vmovlhps %xmm5,%xmm3,%xmm3
  42:	c4 e3 7d 18 c3 01    	vinsertf128 $0x1,%xmm3,%ymm0,%ymm0
  48:	c3                   	retq   

#pragma simd

In [137]:
%%writefile tmp/pragma-simd.c

#include <stdio.h>

typedef float vec_t[8];

void add_them(vec_t a, vec_t b, vec_t result)
{
    #pragma GCC ivdep
    // #pragma omp simd
    // #pragma omp simd aligned(a, b, result:32)
    for (int i = 0; i<8; ++i)
        result[i] = a[i] + b[i];
}
Writing tmp/pragma-simd.c
In [138]:
!cd tmp; gcc -march=sandybridge -fopenmp -O -c pragma-simd.c
#!cd tmp; gcc -march=sandybridge -fopenmp -O3 -c pragma-simd.c

!objdump --disassemble tmp/pragma-simd.o
tmp/pragma-simd.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <add_them>:
   0:	b8 00 00 00 00       	mov    $0x0,%eax
   5:	c5 fa 10 04 07       	vmovss (%rdi,%rax,1),%xmm0
   a:	c5 fa 58 04 06       	vaddss (%rsi,%rax,1),%xmm0,%xmm0
   f:	c5 fa 11 04 02       	vmovss %xmm0,(%rdx,%rax,1)
  14:	48 83 c0 04          	add    $0x4,%rax
  18:	48 83 f8 20          	cmp    $0x20,%rax
  1c:	75 e7                	jne    5 <add_them+0x5>
  1e:	c3                   	retq   

Semantics (openMP 4.5, 2.8.1):

The simd construct enables the execution of multiple iterations of the associated loops concurrently by means of SIMD instructions.

A SIMD loop has logical iterations numbered 0,1,...,N-1 where N is the number of loop iterations, and the logical numbering denotes the sequence in which the iterations would be executed if the associated loop(s) were executed with no SIMD instructions. If the safelen clause is used then no two iterations executed concurrently with SIMD instructions can have a greater distance in the logical iteration space than its value. The parameter of the safelen clause must be a constant positive integer expression. If used, the simdlen clause specifies the preferred number of iterations to be executed concurrently.

  • What does safelen(12) mean?

The parameter of the simdlen clause must be a constant positive integer. The number of iterations that are executed concurrently at any given time is implementation defined. Each concurrent iteration will be executed by a different SIMD lane. Each set of concurrent iterations is a SIMD chunk. Lexical forward dependencies in the iterations of the original loop must be preserved within each SIMD chunk.

  • What does simdlen(16) mean?
  • How do safelen and simdlen relate to each other

Also:

  • #pragma omp declare simd on functions
  • with inbranch, notinbranch

Scalar Program Instances

In [120]:
%%writefile tmp/scalar-program-instances.ispc

float add_them(float a, float b)
{
    return a + b;
}
Overwriting tmp/scalar-program-instances.ispc
In [121]:
!cd tmp; ~/pack/ispc-v1.9.0-linux/ispc  \
    --target=avx2-i32x8 \
    --opt=force-aligned-memory \
    scalar-program-instances.ispc \
    -o scalar-program-instances.o
! objdump --disassemble tmp/scalar-program-instances.o
tmp/scalar-program-instances.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <add_them___vyfvyf>:
   0:	c5 fc 58 c1          	vaddps %ymm1,%ymm0,%ymm0
   4:	c3                   	retq   
  • uniform and varying
  • programCount and programIndex
  • x16 targets... why?
  • How do you think shuffles are done?
  • Compare this with omp declare simd