SIMD instructions in GCC

Presentations

SIMD instructions in GCC

Early CPU architectures

How to make such a beast faster?

images/cpu1.png

SIMD

Typical SIMD operations

An example - image manipulations

Lenna1 Lenna2 Lenna3

SIMD on x86(64)

MMX         1996
3DNow!      1998
SSE         1999
SSE2        2001
SSE3        2004
SSSE3       2006
SSE4        2006
SSE5        2007
AVX         2008
F16C        2009
XOP         2009
FMA4        2011
FMA3        2012
AVX2        2013
AVX-512     2015
AMX         2022

SIMD on RISC CPUs

MAX-1       Multimedia Acceleration eXtensions v1   HP-PA RISC
MAX-2       Multimedia Acceleration eXtensions v2   HP-PA RISC
VIS 1       Visual Instruction Set v1               SPARC V9
VIS 2       Visual Instruction Set v2               SPARC V9
AltiVec     (trade names: Velocity Engine, VMX)     PowerPC
MDMX        MIPS Digital Media eXtension (MaDMaX)   MIPS
MIPS-3D     MIPS-3D                                 MIPS
MVI         Motion Video Instructions               DEC Alpha
NEON        Advanced SIMD                           Cortex (ARMv7, ARMv8)
Packed SIMD Packed SIMD                             RISC-V
Vector Set  Vector Set                              RISC-V
SVE         Scalable Vector Extension               ARMv8.2-A and newer

Babylon? Chaos?

Solutions?

Vector extension (GCC)

Practical part

“special” typedefs

#include <stdio.h>

/* 16-byte vector of unsigned short elements (GCC vector extension). */
typedef unsigned short int v16us __attribute__((vector_size(16)));

/*
 * Prints the storage size of the scalar element type and of the vector
 * type declared above.
 */
int main(void)
{
    /* sizeof yields size_t, so %zu is the matching conversion (not %ld) */
    printf("scalar: %zu bytes\n", sizeof(unsigned short int));
    printf("vector: %zu bytes\n", sizeof(v16us));

    return 0;
}

Vector size, number of elements

#include <stdio.h>

/*
 * Four vector types that all occupy 16 bytes; only the element type differs,
 * so the number of elements per vector differs as well.
 */
typedef unsigned char v16ub __attribute__((vector_size(16)));
typedef unsigned short int v16us __attribute__((vector_size(16)));
typedef unsigned int v16ui __attribute__((vector_size(16)));
typedef unsigned long int v16ul __attribute__((vector_size(16)));

/*
 * Prints the size of each scalar element type, followed by the size of the
 * vector type built from it.
 */
int main(void)
{
    /* sizeof yields size_t, so %zu is the matching conversion (not %ld) */
    printf("unsigned char:  %zu bytes\n", sizeof(unsigned char));
    printf("unsigned short: %zu bytes\n", sizeof(unsigned short int));
    printf("unsigned int:   %zu bytes\n", sizeof(unsigned int));
    printf("unsigned long:  %zu bytes\n", sizeof(unsigned long int));

    printf("vector unsigned char:  %zu bytes\n", sizeof(v16ub));
    printf("vector unsigned short: %zu bytes\n", sizeof(v16us));
    printf("vector unsigned int:   %zu bytes\n", sizeof(v16ui));
    printf("vector unsigned long:  %zu bytes\n", sizeof(v16ul));

    return 0;
}

Vector add

#include <stdio.h>

/*
 * 16-byte vector types with signed element types of increasing width.
 * NOTE(review): the names end in ub/us/ui/ul, which reads as "unsigned",
 * but every element type here is signed - confirm whether that is intended.
 */
typedef signed char v16ub __attribute__((vector_size(16)));
typedef signed short int v16us __attribute__((vector_size(16)));
typedef signed int v16ui __attribute__((vector_size(16)));
typedef signed long int v16ul __attribute__((vector_size(16)));

/*
 * Adds two vectors with the + operator once for each of the four vector
 * types. The sums are stored in z but never read or printed.
 */
int main(void)
{
    {
        /*
         * Only 8 initialisers are given for a 16-element vector; the
         * remaining elements are presumably zero, as for aggregates -
         * TODO confirm against the GCC documentation.
         */
        v16ub x = { 1, 2, 3, 4, 5, 6, 7, 8 };
        /*
         * NOTE(review): 0xff (255) is outside the usual range of signed
         * char, so the stored value depends on the implementation's
         * narrowing conversion (presumably -1 with GCC) - confirm intent.
         */
        v16ub y = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
        v16ub z = x + y;
    }

    {
        /* 8 initialisers; assumes short is 2 bytes, i.e. 8 elements */
        v16us x = { 1, 2, 3, 4, 5, 6, 7, 8 };
        v16us y = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
        v16us z = x + y;
    }

    {
        /* 4 initialisers; assumes int is 4 bytes, i.e. 4 elements */
        v16ui x = { 1, 2, 3, 4 };
        v16ui y = { 0xff, 0xff, 0xff, 0xff };
        v16ui z = x + y;
    }

    {
        /*
         * 2 initialisers; assumes long is 8 bytes, i.e. 2 elements -
         * verify on targets where long is 4 bytes.
         */
        v16ul x = { 1, 2 };
        v16ul y = { 0xff, 0xff };
        v16ul z = x + y;
    }


    return 0;
}

Accessing vector items

#include <stdio.h>

/* 16-byte vector of unsigned short elements (GCC vector extension). */
typedef unsigned short int v16us __attribute__((vector_size(16)));

/*
 * Adds two vectors and prints every element of the sum together with its
 * index, reading single elements with the subscript operator.
 */
int main(void)
{
    v16us lhs = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v16us rhs = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
    v16us sum = lhs + rhs;
    int lane = 0;

    /* walk the eight elements of the result */
    while (lane < 8) {
        printf("%d %d\n", lane, sum[lane]);
        lane++;
    }

    return 0;
}

Vector is not an array!

/* Vector of floats occupying 1024 bytes (GCC vector extension). */
typedef float v1024f __attribute__((vector_size(1024)));

/*
 * Adds the vectors pointed to by x and y with the + operator and stores the
 * sum in the vector pointed to by z.
 */
void addVectors(v1024f * x, v1024f * y, v1024f * z)
{
    v1024f sum = *x + *y;

    *z = sum;
}

/*
 * Builds two 1024-byte vectors and adds them through addVectors().
 * The sum is not printed.
 */
int main(void)
{
    v1024f lhs = { 1.0 };
    v1024f rhs = { 1.0 };
    v1024f sum;

    addVectors(&lhs, &rhs, &sum);

    return 0;
}

Let’s look into assembly

#include <stdio.h>

/* 16-byte vector of signed char elements (GCC vector extension). */
typedef signed char v16ib __attribute__((vector_size(16)));

/* Stores x + y in the vector pointed to by z. */
void add16ib(v16ib x, v16ib y, v16ib * z)
{
    v16ib sum = x + y;

    *z = sum;
}

/* Stores x - y in the vector pointed to by z. */
void sub16ib(v16ib x, v16ib y, v16ib * z)
{
    v16ib difference = x - y;

    *z = difference;
}

/* Stores x * y in the vector pointed to by z. */
void mul16ib(v16ib x, v16ib y, v16ib * z)
{
    v16ib product = x * y;

    *z = product;
}

/*
 * Stores x / y in the vector pointed to by z.
 * The elements of y are not checked for zero.
 */
void div16ib(v16ib x, v16ib y, v16ib * z)
{
    v16ib quotient = x / y;

    *z = quotient;
}

/*
 * Stores x % y in the vector pointed to by z.
 * The elements of y are not checked for zero.
 */
void mod16ib(v16ib x, v16ib y, v16ib * z)
{
    v16ib rest = x % y;

    *z = rest;
}

/* Stores the bitwise AND x & y in the vector pointed to by z. */
void and16ib(v16ib x, v16ib y, v16ib * z)
{
    v16ib masked = x & y;

    *z = masked;
}

/* Stores the bitwise OR x | y in the vector pointed to by z. */
void or16ib(v16ib x, v16ib y, v16ib * z)
{
    v16ib merged = x | y;

    *z = merged;
}

/* Stores the bitwise XOR x ^ y in the vector pointed to by z. */
void xor16ib(v16ib x, v16ib y, v16ib * z)
{
    v16ib toggled = x ^ y;

    *z = toggled;
}

/*
 * Stores x >> y in the vector pointed to by z; y supplies the shift counts.
 * NOTE(review): the counts in y are not range-checked here - confirm what
 * counts of 8 or more mean for 8-bit elements.
 */
void rshift16ib(v16ib x, v16ib y, v16ib * z)
{
    v16ib shifted = x >> y;

    *z = shifted;
}

/*
 * Stores x << y in the vector pointed to by z; y supplies the shift counts.
 * NOTE(review): the counts in y are not range-checked here - confirm what
 * counts of 8 or more mean for 8-bit elements.
 */
void lshift16ib(v16ib x, v16ib y, v16ib * z)
{
    v16ib shifted = x << y;

    *z = shifted;
}

/*
 * Prints a heading followed by one line per vector element in the form
 * "index    x op y = z", then an empty line.
 *
 * message  heading printed first
 * op       operator text placed between the two operands
 * x, y     operand vectors
 * z        result vector
 */
void print_vectors(const char *message, const char *op, v16ib * x,
                   v16ib * y, v16ib * z)
{
    /* number of elements in one vector, computed once */
    const int lanes = (int) (sizeof(v16ib) / sizeof(signed char));
    int lane = 0;

    puts(message);

    while (lane < lanes) {
        printf("%2d    %d %s %d = %d\n", lane, (*x)[lane], op, (*y)[lane],
               (*z)[lane]);
        lane++;
    }

    putchar('\n');
}

int main(void)
{
    v16ib x;
    v16ib y;
    v16ib z;
    int i;

    for (i = 0; i < sizeof(v16ib) / sizeof(signed char); i++) {
        x[i] = i * 2;
        y[i] = 16 - i;
    }

    add16ib(x, y, &z);
    print_vectors("vector addition", "+", &x, &y, &z);

    sub16ib(x, y, &z);
    print_vectors("vector subtraction", "-", &x, &y, &z);

    mul16ib(x, y, &z);
    print_vectors("vector multiply", "*", &x, &y, &z);

    div16ib(x, y, &z);
    print_vectors("vector divide", "/", &x, &y, &z);

    mod16ib(x, y, &z);
    print_vectors("vector modulo", "%", &x, &y, &z);

    and16ib(x, y, &z);
    print_vectors("vector bitwise and", "&", &x, &y, &z);

    or16ib(x, y, &z);
    print_vectors("vector bitwise or", "|", &x, &y, &z);

    xor16ib(x, y, &z);
    print_vectors("vector bitwise xor", "^", &x, &y, &z);

    rshift16ib(x, y, &z);
    print_vectors("vector right shift", ">>", &x, &y, &z);

    lshift16ib(x, y, &z);
    print_vectors("vector left shift", "<<", &x, &y, &z);

    return 0;
}

Floats and doubles

#include <stdio.h>

/* 16-byte vector of float elements (GCC vector extension). */
typedef float v16float __attribute__((vector_size(16)));

/* Stores x + y in the vector pointed to by z. */
void add16float(v16float x, v16float y, v16float * z)
{
    v16float sum = x + y;

    *z = sum;
}

/* Stores x - y in the vector pointed to by z. */
void sub16float(v16float x, v16float y, v16float * z)
{
    v16float difference = x - y;

    *z = difference;
}

/* Stores x * y in the vector pointed to by z. */
void mul16float(v16float x, v16float y, v16float * z)
{
    v16float product = x * y;

    *z = product;
}

/* Stores x / y in the vector pointed to by z. */
void div16float(v16float x, v16float y, v16float * z)
{
    v16float quotient = x / y;

    *z = quotient;
}

/*
 * Prints a heading followed by one line per vector element in the form
 * "index    x op y = z" with three decimal places, then an empty line.
 *
 * message  heading printed first
 * op       operator character placed between the two operands
 * x, y     operand vectors
 * z        result vector
 */
void print_vectors(const char *message, const char op, v16float * x,
                   v16float * y, v16float * z)
{
    /* number of elements in one vector, computed once */
    const int lanes = (int) (sizeof(v16float) / sizeof(float));
    int lane = 0;

    puts(message);

    while (lane < lanes) {
        printf("%2d    %5.3f %c %5.3f = %5.3f\n", lane, (*x)[lane], op,
               (*y)[lane], (*z)[lane]);
        lane++;
    }

    putchar('\n');
}

int main(void)
{
    v16float x;
    v16float y;
    v16float z;
    int i;

    for (i = 0; i < sizeof(v16float) / sizeof(float); i++) {
        x[i] = i;
        y[i] = i + 0.1;
    }

    add16float(x, y, &z);
    print_vectors("vector addition", '+', &x, &y, &z);

    sub16float(x, y, &z);
    print_vectors("vector subtraction", '-', &x, &y, &z);

    mul16float(x, y, &z);
    print_vectors("vector multiply", '*', &x, &y, &z);

    div16float(x, y, &z);
    print_vectors("vector divide", '/', &x, &y, &z);

    return 0;
}