NUMA and Bandwidths

In [1]:
!rm -Rf tmp
!mkdir -p tmp

Gathering Information

In [7]:
%%writefile tmp/numa-info.c

#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>
#include <stdio.h>
#include <pthread.h>
#include <omp.h>
#include <assert.h>
#include "timing.h"


void print_bitmask(const struct bitmask *bm)
{
  for(size_t i=0; i<bm->size; ++i)
    printf("%d", numa_bitmask_isbitset(bm, i));
}


int main(int argc, const char **argv)
{
  int num_cpus = numa_num_task_cpus();
  printf("num cpus: %d\n", num_cpus);

  printf("numa available: %d\n", numa_available());
  numa_set_localalloc();

  struct bitmask *bm = numa_bitmask_alloc(num_cpus);
  for (int i=0; i<=numa_max_node(); ++i)
  {
    numa_node_to_cpus(i, bm);
    printf("numa node %d ", i);
    print_bitmask(bm);
    printf(" - %g GiB\n", numa_node_size(i, 0) / (1024.*1024*1024.));
  }
  numa_bitmask_free(bm);

  return 0;
}
Overwriting tmp/numa-info.c
In [8]:
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -lrt -lnuma -I.. numa-info.c -onuma-info
! ./tmp/numa-info
num cpus: 16
numa available: 0
numa node 0 1111111100000000 - 31.3159 GiB
numa node 1 0000000011111111 - 31.4982 GiB

A Shared Header

In [3]:
%%writefile tmp/numatest.h

#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>
#include <stdio.h>
#include <pthread.h>
#include <omp.h>
#include <assert.h>
#include "timing.h"

void pin_to_core(size_t core)
{
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(core, &cpuset);
  pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
}

double measure_access(void *x, size_t array_size, size_t ntrips)
{
  timestamp_type t1;
  get_timestamp(&t1);

  for (size_t i = 0; i<ntrips; ++i)
    for(size_t j = 0; j<array_size; ++j)
    {
      *(((char*)x) + ((j * 1009) % array_size)) += 1;
    }

  timestamp_type t2;
  get_timestamp(&t2);

  return timestamp_diff_in_seconds(t1, t2);
}
Writing tmp/numatest.h

On- and Off-Node Bandwidths

In [10]:
%%writefile tmp/numa-bw-seq.c

#include "numatest.h"

int main(int argc, const char **argv)
{
  int num_cpus = numa_num_task_cpus();
  numa_set_localalloc();

  char *x;
  const size_t cache_line_size = 64;
  const size_t array_size = 100*1000*1000;
  size_t ntrips = 2;

#pragma omp parallel
  {
    assert(omp_get_num_threads() == num_cpus);
    int tid = omp_get_thread_num();

    pin_to_core(tid);
    if(tid == 0)
      x = (char *) numa_alloc_local(array_size);

    // {{{ single access
#pragma omp barrier
    for (size_t i = 0; i<num_cpus; ++i)
    {
      if (tid == i)
      {
        double t = measure_access(x, array_size, ntrips);
        printf("sequential core %d -> core 0 : BW %g MB/s\n",
            i, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
    }
    // }}}
  }
  numa_free(x, array_size);

  return 0;
}
Overwriting tmp/numa-bw-seq.c
In [11]:
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -lrt -lnuma -I.. numa-bw-seq.c -onuma-bw-seq
! ./tmp/numa-bw-seq
sequential core 0 -> core 0 : BW 8322.05 MB/s
sequential core 1 -> core 0 : BW 8636.35 MB/s
sequential core 2 -> core 0 : BW 8713.88 MB/s
sequential core 3 -> core 0 : BW 8695.55 MB/s
sequential core 4 -> core 0 : BW 8649.28 MB/s
sequential core 5 -> core 0 : BW 8738.7 MB/s
sequential core 6 -> core 0 : BW 8665.42 MB/s
sequential core 7 -> core 0 : BW 8702.21 MB/s
sequential core 8 -> core 0 : BW 4821.3 MB/s
sequential core 9 -> core 0 : BW 4806.34 MB/s
sequential core 10 -> core 0 : BW 4817.99 MB/s
sequential core 11 -> core 0 : BW 4788.99 MB/s
sequential core 12 -> core 0 : BW 4824.2 MB/s
sequential core 13 -> core 0 : BW 4819.79 MB/s
sequential core 14 -> core 0 : BW 4828.46 MB/s
sequential core 15 -> core 0 : BW 4809.35 MB/s

Contention: Everybody

In [1]:
%%writefile tmp/numa-bw-all.c

#include "numatest.h"

int main(int argc, const char **argv)
{
  int num_cpus = numa_num_task_cpus();
  numa_set_localalloc();

  char *x;
  const size_t cache_line_size = 64;
  const size_t array_size = 100*1000*1000;
  size_t ntrips = 2;

#pragma omp parallel
  {
    assert(omp_get_num_threads() == num_cpus);
    int tid = omp_get_thread_num();

    pin_to_core(tid);
    if(tid == 0)
      x = (char *) numa_alloc_local(array_size);

    // {{{ everybody contends for one

    {
      if (tid == 0) puts("");

#pragma omp barrier
      double t = measure_access(x, array_size, ntrips);
#pragma omp barrier
      for (size_t i = 0; i<num_cpus; ++i)
      {
        if (tid == i)
          printf("all-contention core %d -> core 0 : BW %g MB/s\n",
              tid, array_size*ntrips*cache_line_size / t / 1e6);
#pragma omp barrier
      }
    }

    // }}}

  }
  numa_free(x, array_size);

  return 0;
}
Overwriting tmp/numa-bw-all.c
In [3]:
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -lrt -lnuma -I.. numa-bw-all.c -onuma-bw-all
! ./tmp/numa-bw-all
all-contention core 0 -> core 0 : BW 1698.6 MB/s
all-contention core 1 -> core 0 : BW 1699.33 MB/s
all-contention core 2 -> core 0 : BW 1699.32 MB/s
all-contention core 3 -> core 0 : BW 1699.33 MB/s
all-contention core 4 -> core 0 : BW 1699.33 MB/s
all-contention core 5 -> core 0 : BW 1698.6 MB/s
all-contention core 6 -> core 0 : BW 1698.6 MB/s
all-contention core 7 -> core 0 : BW 1699.32 MB/s
all-contention core 8 -> core 0 : BW 1668.48 MB/s
all-contention core 9 -> core 0 : BW 1677.92 MB/s
all-contention core 10 -> core 0 : BW 1678.6 MB/s
all-contention core 11 -> core 0 : BW 1668.48 MB/s
all-contention core 12 -> core 0 : BW 1668.48 MB/s
all-contention core 13 -> core 0 : BW 1677.31 MB/s
all-contention core 14 -> core 0 : BW 1678.6 MB/s
all-contention core 15 -> core 0 : BW 1668.48 MB/s

Contention: Pairs

In [13]:
%%writefile tmp/numa-bw-two.c

#include "numatest.h"

int main(int argc, const char **argv)
{
  int num_cpus = numa_num_task_cpus();
  numa_set_localalloc();

  char *x;
  const size_t cache_line_size = 64;
  const size_t array_size = 100*1000*1000;
  size_t ntrips = 2;

#pragma omp parallel
  {
    assert(omp_get_num_threads() == num_cpus);
    int tid = omp_get_thread_num();

    pin_to_core(tid);
    if(tid == 0)
      x = (char *) numa_alloc_local(array_size);

    // {{{ zero and someone else contending

    if (tid == 0) puts("");

#pragma omp barrier
    for (size_t i = 1; i<num_cpus; ++i)
    {
      double t;
      if (tid == i || tid == 0)
        t = measure_access(x, array_size, ntrips);

#pragma omp barrier
      if (tid == 0)
      {
        printf("two-contention core %d -> core 0 : BW %g MB/s\n",
            tid, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
      if (tid == i)
      {
        printf("two-contention core %d -> core 0 : BW %g MB/s\n\n",
            tid, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
    }
  }
  numa_free(x, array_size);

  return 0;
}
Overwriting tmp/numa-bw-two.c
In [14]:
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -lrt -lnuma -I.. numa-bw-two.c -onuma-bw-two
! ./tmp/numa-bw-two
two-contention core 0 -> core 0 : BW 7348.34 MB/s
two-contention core 1 -> core 0 : BW 7348.35 MB/s

two-contention core 0 -> core 0 : BW 7653.44 MB/s
two-contention core 2 -> core 0 : BW 7653.55 MB/s

two-contention core 0 -> core 0 : BW 7661.08 MB/s
two-contention core 3 -> core 0 : BW 7661.09 MB/s

two-contention core 0 -> core 0 : BW 7643.78 MB/s
two-contention core 4 -> core 0 : BW 7643.98 MB/s

two-contention core 0 -> core 0 : BW 7698.85 MB/s
two-contention core 5 -> core 0 : BW 7698.86 MB/s

two-contention core 0 -> core 0 : BW 7592.64 MB/s
two-contention core 6 -> core 0 : BW 7592.65 MB/s

two-contention core 0 -> core 0 : BW 7694.93 MB/s
two-contention core 7 -> core 0 : BW 7694.93 MB/s

two-contention core 0 -> core 0 : BW 7541.66 MB/s
two-contention core 8 -> core 0 : BW 4713.53 MB/s

two-contention core 0 -> core 0 : BW 7516.31 MB/s
two-contention core 9 -> core 0 : BW 4697.66 MB/s

two-contention core 0 -> core 0 : BW 7513.48 MB/s
two-contention core 10 -> core 0 : BW 4705.6 MB/s

two-contention core 0 -> core 0 : BW 7519.91 MB/s
two-contention core 11 -> core 0 : BW 4702.41 MB/s

two-contention core 0 -> core 0 : BW 7510.43 MB/s
two-contention core 12 -> core 0 : BW 4710.96 MB/s

two-contention core 0 -> core 0 : BW 7524.66 MB/s
two-contention core 13 -> core 0 : BW 4691.61 MB/s

two-contention core 0 -> core 0 : BW 7524.08 MB/s
two-contention core 14 -> core 0 : BW 4711.21 MB/s

two-contention core 0 -> core 0 : BW 7520.46 MB/s
two-contention core 15 -> core 0 : BW 4691.32 MB/s

Tests based on numatest.cpp by James Brock http://stackoverflow.com/questions/7259363/measuring-numa-non-uniform-memory-access-no-observable-asymmetry-why

Changes by Andreas Kloeckner, 10/2012:

  • Rewritten in C + OpenMP
  • Added contention tests