NUMA and Bandwidths¶
In [1]:
# Recreate the scratch directory that the %%writefile cells below write into,
# so stale sources and binaries from an earlier run are discarded.
!rm -Rf tmp
!mkdir -p tmp
/usr/lib/python3.13/pty.py:95: DeprecationWarning: This process (pid=1119399) is multi-threaded, use of forkpty() may lead to deadlocks in the child. pid, fd = os.forkpty()
Gathering Information¶
In [2]:
%%writefile tmp/numa-info.c
#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>
#include <stdio.h>
#include <pthread.h>
#include <omp.h>
#include <assert.h>
#include "timing.h"
/* Write every bit of `bm` to stdout, starting at bit index 0 and ending at
 * bm->size - 1.  Each bit is printed with "%d" using the value returned by
 * numa_bitmask_isbitset(); no separator or newline is emitted. */
void print_bitmask(const struct bitmask *bm)
{
    size_t bit = 0;
    while (bit < bm->size)
    {
        int is_set = numa_bitmask_isbitset(bm, bit);
        printf("%d", is_set);
        ++bit;
    }
}
/* Print the NUMA layout seen by this process: the number of CPUs the task
 * may run on, the result of numa_available(), and for every node its CPU
 * bitmask together with its memory size in GiB.
 *
 * Returns 0 on success, 1 if libnuma reports that NUMA is unavailable. */
int main(int argc, const char **argv)
{
    /* numa(3): numa_available() has to be called before any other libnuma
     * function; if it returns -1 every other call is undefined. */
    int available = numa_available();
    if (available < 0)
    {
        fprintf(stderr, "numa available: %d (cannot query topology)\n",
                available);
        return 1;
    }

    int num_cpus = numa_num_task_cpus();
    printf("num cpus: %d\n", num_cpus);
    printf("numa available: %d\n", available);

    numa_set_localalloc();

    struct bitmask *bm = numa_bitmask_alloc(num_cpus);
    for (int i=0; i<=numa_max_node(); ++i)
    {
        /* Do not print a stale or partial mask if the lookup fails. */
        if (numa_node_to_cpus(i, bm) < 0)
        {
            fprintf(stderr, "numa node %d: could not query CPU mask\n", i);
            continue;
        }

        printf("numa node %d ", i);
        print_bitmask(bm);
        /* Second argument is the optional "free bytes" out-pointer; unused. */
        printf(" - %g GiB\n", numa_node_size(i, NULL) / (1024.*1024*1024.));
    }

    numa_bitmask_free(bm);
    return 0;
}
Writing tmp/numa-info.c
In [3]:
# Compile inside tmp/ with OpenMP enabled and the parent directory on the
# include path (presumably where timing.h lives -- confirm), linking librt
# and libnuma; then run the resulting binary.
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -I.. numa-info.c -onuma-info -lrt -lnuma
! ./tmp/numa-info
num cpus: 40 numa available: 0 numa node 0 1111111111000000000011111111110000000000 - 125.787 GiB numa node 1 0000000000111111111100000000001111111111 - 125.991 GiB
A Shared Header¶
In [4]:
%%writefile tmp/numatest.h
#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>
#include <stdio.h>
#include <pthread.h>
#include <omp.h>
#include <assert.h>
#include "timing.h"
/* Restrict the calling thread to the single logical CPU `core`.
 *
 * The affinity mask is built with exactly one bit set and applied with
 * pthread_setaffinity_np().  If the call fails (for example because `core`
 * is not a CPU this process may use) a diagnostic is written to stderr and
 * the thread keeps its previous affinity; the function still returns
 * normally so that callers are not disturbed. */
void pin_to_core(size_t core)
{
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core, &cpuset);

    /* pthread_setaffinity_np() returns an error number directly (it does
     * not set errno); silently ignoring it would leave the thread unpinned
     * and make every per-core measurement meaningless. */
    int err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    if (err != 0)
        fprintf(stderr,
                "pin_to_core: could not pin thread to core %zu (error %d)\n",
                core, err);
}
/* Time `ntrips` sweeps over the `array_size`-byte buffer starting at `x`.
 *
 * One sweep performs `array_size` single-byte increments; step j touches
 * offset (j * 1009) % array_size, so consecutive accesses are 1009 bytes
 * apart (modulo the buffer length).
 *
 * Returns the value of timestamp_diff_in_seconds() for the timestamps taken
 * immediately before and after the sweeps. */
double measure_access(void *x, size_t array_size, size_t ntrips)
{
    char *bytes = (char *) x;
    timestamp_type start;
    timestamp_type stop;

    get_timestamp(&start);
    for (size_t trip = 0; trip < ntrips; ++trip)
    {
        for (size_t step = 0; step < array_size; ++step)
        {
            size_t offset = (step * 1009) % array_size;
            bytes[offset] += 1;
        }
    }
    get_timestamp(&stop);

    return timestamp_diff_in_seconds(start, stop);
}
Writing tmp/numatest.h
On- and Off-Node Bandwidths¶
In [5]:
%%writefile tmp/numa-bw-seq.c
#include "numatest.h"
/* Sequential bandwidth test: one buffer is allocated by the thread pinned
 * to core 0, then every core in turn (one at a time) sweeps over it with
 * measure_access() and prints the resulting bandwidth.
 *
 * The bandwidth figure counts one cache line (64 bytes) per access.
 *
 * Returns 0 on success, 1 if libnuma reports that NUMA is unavailable. */
int main(int argc, const char **argv)
{
    /* numa(3): must be the first libnuma call; on -1 all others are
     * undefined. */
    if (numa_available() < 0)
    {
        fprintf(stderr, "NUMA is not available on this system\n");
        return 1;
    }

    int num_cpus = numa_num_task_cpus();
    numa_set_localalloc();

    char *x = NULL;
    const size_t cache_line_size = 64;
    const size_t array_size = 100*1000*1000;
    size_t ntrips = 2;

    /* Ask for one thread per CPU explicitly instead of relying on the
     * OpenMP default; the assert below still guards the assumption. */
#pragma omp parallel num_threads(num_cpus)
    {
        assert(omp_get_num_threads() == num_cpus);
        int tid = omp_get_thread_num();
        pin_to_core(tid);

        if (tid == 0)
        {
            x = (char *) numa_alloc_local(array_size);
            assert(x != NULL);

            /* Touch every page from core 0 so the buffer is physically
             * backed before any other core accesses it and page-fault cost
             * stays out of the timed region. */
            const size_t page = (size_t) numa_pagesize();
            for (size_t off = 0; off < array_size; off += page)
                x[off] = 0;
        }

        // {{{ single access

#pragma omp barrier
        /* `i` is an int so that it matches both `tid` and the %d below. */
        for (int i = 0; i < num_cpus; ++i)
        {
            if (tid == i)
            {
                double t = measure_access(x, array_size, ntrips);
                printf("sequential core %d -> core 0 : BW %g MB/s\n",
                        i, array_size*ntrips*cache_line_size / t / 1e6);
            }
            /* Keep the cores strictly one-at-a-time. */
#pragma omp barrier
        }

        // }}}
    }

    numa_free(x, array_size);
    return 0;
}
Writing tmp/numa-bw-seq.c
In [6]:
# Compile the sequential-access benchmark inside tmp/ (OpenMP enabled, parent
# directory on the include path, linked against librt and libnuma) and run it.
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -I.. numa-bw-seq.c -onuma-bw-seq -lrt -lnuma
! ./tmp/numa-bw-seq
sequential core 0 -> core 0 : BW 9050.91 MB/s sequential core 1 -> core 0 : BW 9042.27 MB/s sequential core 2 -> core 0 : BW 8930.08 MB/s sequential core 3 -> core 0 : BW 8807.79 MB/s sequential core 4 -> core 0 : BW 8910.02 MB/s sequential core 5 -> core 0 : BW 8931.34 MB/s sequential core 6 -> core 0 : BW 8938.98 MB/s sequential core 7 -> core 0 : BW 8916.03 MB/s sequential core 8 -> core 0 : BW 8802.26 MB/s sequential core 9 -> core 0 : BW 8873.57 MB/s sequential core 10 -> core 0 : BW 5272.82 MB/s sequential core 11 -> core 0 : BW 5298.58 MB/s sequential core 12 -> core 0 : BW 5255.52 MB/s sequential core 13 -> core 0 : BW 5268.69 MB/s sequential core 14 -> core 0 : BW 5274.39 MB/s sequential core 15 -> core 0 : BW 5270.42 MB/s sequential core 16 -> core 0 : BW 5226.53 MB/s sequential core 17 -> core 0 : BW 5241.64 MB/s sequential core 18 -> core 0 : BW 5243.96 MB/s sequential core 19 -> core 0 : BW 5243.59 MB/s sequential core 20 -> core 0 : BW 8892.04 MB/s sequential core 21 -> core 0 : BW 8907.34 MB/s sequential core 22 -> core 0 : BW 8879.57 MB/s sequential core 23 -> core 0 : BW 8855.68 MB/s sequential core 24 -> core 0 : BW 8941.27 MB/s sequential core 25 -> core 0 : BW 8955.88 MB/s sequential core 26 -> core 0 : BW 8996.45 MB/s sequential core 27 -> core 0 : BW 8980.76 MB/s sequential core 28 -> core 0 : BW 8846.18 MB/s sequential core 29 -> core 0 : BW 8807.19 MB/s sequential core 30 -> core 0 : BW 5278.72 MB/s sequential core 31 -> core 0 : BW 5275.17 MB/s sequential core 32 -> core 0 : BW 5238.94 MB/s sequential core 33 -> core 0 : BW 5289.06 MB/s sequential core 34 -> core 0 : BW 5277.72 MB/s sequential core 35 -> core 0 : BW 5282.07 MB/s sequential core 36 -> core 0 : BW 5241.97 MB/s sequential core 37 -> core 0 : BW 5231.47 MB/s sequential core 38 -> core 0 : BW 5251.63 MB/s sequential core 39 -> core 0 : BW 5228.47 MB/s
Contention: Everybody¶
In [7]:
%%writefile tmp/numa-bw-all.c
#include "numatest.h"
/* All-cores contention test: one buffer is allocated by the thread pinned
 * to core 0, then every core sweeps over it simultaneously with
 * measure_access().  Afterwards each core prints its own bandwidth, in core
 * order.
 *
 * The bandwidth figure counts one cache line (64 bytes) per access.
 *
 * Returns 0 on success, 1 if libnuma reports that NUMA is unavailable. */
int main(int argc, const char **argv)
{
    /* numa(3): must be the first libnuma call; on -1 all others are
     * undefined. */
    if (numa_available() < 0)
    {
        fprintf(stderr, "NUMA is not available on this system\n");
        return 1;
    }

    int num_cpus = numa_num_task_cpus();
    numa_set_localalloc();

    char *x = NULL;
    const size_t cache_line_size = 64;
    const size_t array_size = 100*1000*1000;
    size_t ntrips = 2;

    /* Ask for one thread per CPU explicitly instead of relying on the
     * OpenMP default; the assert below still guards the assumption. */
#pragma omp parallel num_threads(num_cpus)
    {
        assert(omp_get_num_threads() == num_cpus);
        int tid = omp_get_thread_num();
        pin_to_core(tid);

        if (tid == 0)
        {
            x = (char *) numa_alloc_local(array_size);
            assert(x != NULL);

            /* Touch every page from core 0 *before* the other threads are
             * released.  Otherwise all cores fault the pages in at once,
             * and with a local-allocation policy the kernel may place
             * pages on whichever node faults them first -- the buffer
             * would then no longer sit next to core 0 as the output
             * labels claim. */
            const size_t page = (size_t) numa_pagesize();
            for (size_t off = 0; off < array_size; off += page)
                x[off] = 0;
        }

        // {{{ everybody contends for one

        {
            if (tid == 0) puts("");

#pragma omp barrier
            double t = measure_access(x, array_size, ntrips);

#pragma omp barrier
            /* Serialise the printing so the lines come out in core order. */
            for (int i = 0; i < num_cpus; ++i)
            {
                if (tid == i)
                    printf("all-contention core %d -> core 0 : BW %g MB/s\n",
                            tid, array_size*ntrips*cache_line_size / t / 1e6);
#pragma omp barrier
            }
        }

        // }}}
    }

    numa_free(x, array_size);
    return 0;
}
Writing tmp/numa-bw-all.c
In [9]:
# Compile the all-cores contention benchmark inside tmp/ (OpenMP enabled,
# parent directory on the include path, linked against librt and libnuma)
# and run it.
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -I.. numa-bw-all.c -onuma-bw-all -lrt -lnuma
! ./tmp/numa-bw-all
all-contention core 0 -> core 0 : BW 2005.95 MB/s all-contention core 1 -> core 0 : BW 2005.95 MB/s all-contention core 2 -> core 0 : BW 2005.95 MB/s all-contention core 3 -> core 0 : BW 2005.95 MB/s all-contention core 4 -> core 0 : BW 2005.95 MB/s all-contention core 5 -> core 0 : BW 2005.95 MB/s all-contention core 6 -> core 0 : BW 2005.95 MB/s all-contention core 7 -> core 0 : BW 2005.95 MB/s all-contention core 8 -> core 0 : BW 2005.95 MB/s all-contention core 9 -> core 0 : BW 1965.81 MB/s all-contention core 10 -> core 0 : BW 2244.92 MB/s all-contention core 11 -> core 0 : BW 2202.21 MB/s all-contention core 12 -> core 0 : BW 2244.92 MB/s all-contention core 13 -> core 0 : BW 2244.92 MB/s all-contention core 14 -> core 0 : BW 2244.92 MB/s all-contention core 15 -> core 0 : BW 2202.21 MB/s all-contention core 16 -> core 0 : BW 2244.91 MB/s all-contention core 17 -> core 0 : BW 2287.71 MB/s all-contention core 18 -> core 0 : BW 2244.92 MB/s all-contention core 19 -> core 0 : BW 2244.91 MB/s all-contention core 20 -> core 0 : BW 2005.95 MB/s all-contention core 21 -> core 0 : BW 2005.95 MB/s all-contention core 22 -> core 0 : BW 2004.08 MB/s all-contention core 23 -> core 0 : BW 2005.95 MB/s all-contention core 24 -> core 0 : BW 2005.95 MB/s all-contention core 25 -> core 0 : BW 2005.95 MB/s all-contention core 26 -> core 0 : BW 2005.95 MB/s all-contention core 27 -> core 0 : BW 2005.95 MB/s all-contention core 28 -> core 0 : BW 2005.95 MB/s all-contention core 29 -> core 0 : BW 2005.95 MB/s all-contention core 30 -> core 0 : BW 2244.92 MB/s all-contention core 31 -> core 0 : BW 2244.91 MB/s all-contention core 32 -> core 0 : BW 2244.92 MB/s all-contention core 33 -> core 0 : BW 2287.75 MB/s all-contention core 34 -> core 0 : BW 2244.92 MB/s all-contention core 35 -> core 0 : BW 2244.92 MB/s all-contention core 36 -> core 0 : BW 2244.91 MB/s all-contention core 37 -> core 0 : BW 2287.68 MB/s all-contention core 38 -> core 0 : BW 2244.92 MB/s all-contention core 
39 -> core 0 : BW 2244.91 MB/s
Contention: Pairs¶
In [10]:
%%writefile tmp/numa-bw-two.c
#include "numatest.h"
/* Pairwise contention test: one buffer is allocated by the thread pinned to
 * core 0.  For every other core i, core 0 and core i sweep over the buffer
 * at the same time with measure_access(); core 0's bandwidth is printed
 * first, then core i's, followed by a blank line.
 *
 * The bandwidth figure counts one cache line (64 bytes) per access.
 *
 * Returns 0 on success, 1 if libnuma reports that NUMA is unavailable. */
int main(int argc, const char **argv)
{
    /* numa(3): must be the first libnuma call; on -1 all others are
     * undefined. */
    if (numa_available() < 0)
    {
        fprintf(stderr, "NUMA is not available on this system\n");
        return 1;
    }

    int num_cpus = numa_num_task_cpus();
    numa_set_localalloc();

    char *x = NULL;
    const size_t cache_line_size = 64;
    const size_t array_size = 100*1000*1000;
    size_t ntrips = 2;

    /* Ask for one thread per CPU explicitly instead of relying on the
     * OpenMP default; the assert below still guards the assumption. */
#pragma omp parallel num_threads(num_cpus)
    {
        assert(omp_get_num_threads() == num_cpus);
        int tid = omp_get_thread_num();
        pin_to_core(tid);

        if (tid == 0)
        {
            x = (char *) numa_alloc_local(array_size);
            assert(x != NULL);

            /* Touch every page from core 0 before the first pair starts,
             * so page placement does not depend on which of the two
             * competing cores happens to fault a page in first. */
            const size_t page = (size_t) numa_pagesize();
            for (size_t off = 0; off < array_size; off += page)
                x[off] = 0;
        }

        // {{{ zero and someone else contending

        if (tid == 0) puts("");

#pragma omp barrier
        for (int i = 1; i < num_cpus; ++i)
        {
            /* Only meaningful in the two measuring threads; initialised so
             * the idle threads never hold an indeterminate value. */
            double t = 0;
            if (tid == i || tid == 0)
                t = measure_access(x, array_size, ntrips);

            /* The barriers order the two result lines: core 0 first,
             * then core i. */
#pragma omp barrier
            if (tid == 0)
            {
                printf("two-contention core %d -> core 0 : BW %g MB/s\n",
                        tid, array_size*ntrips*cache_line_size / t / 1e6);
            }

#pragma omp barrier
            if (tid == i)
            {
                printf("two-contention core %d -> core 0 : BW %g MB/s\n\n",
                        tid, array_size*ntrips*cache_line_size / t / 1e6);
            }

#pragma omp barrier
        }

        // }}}
    }

    numa_free(x, array_size);
    return 0;
}
Writing tmp/numa-bw-two.c
In [11]:
# Compile the pairwise contention benchmark inside tmp/ (OpenMP enabled,
# parent directory on the include path, linked against librt and libnuma)
# and run it.
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -I.. numa-bw-two.c -onuma-bw-two -lrt -lnuma
! ./tmp/numa-bw-two
two-contention core 0 -> core 0 : BW 9151.35 MB/s two-contention core 1 -> core 0 : BW 9151.58 MB/s two-contention core 0 -> core 0 : BW 9456.32 MB/s two-contention core 2 -> core 0 : BW 9456.59 MB/s two-contention core 0 -> core 0 : BW 9522.92 MB/s two-contention core 3 -> core 0 : BW 9523.18 MB/s two-contention core 0 -> core 0 : BW 9529.5 MB/s two-contention core 4 -> core 0 : BW 9529.12 MB/s two-contention core 0 -> core 0 : BW 9497.15 MB/s two-contention core 5 -> core 0 : BW 9497.42 MB/s two-contention core 0 -> core 0 : BW 9561.61 MB/s two-contention core 6 -> core 0 : BW 9564.12 MB/s two-contention core 0 -> core 0 : BW 9529.33 MB/s two-contention core 7 -> core 0 : BW 9529.11 MB/s two-contention core 0 -> core 0 : BW 9438.81 MB/s two-contention core 8 -> core 0 : BW 9439.07 MB/s two-contention core 0 -> core 0 : BW 9519.34 MB/s two-contention core 9 -> core 0 : BW 9519.58 MB/s two-contention core 0 -> core 0 : BW 9535.17 MB/s two-contention core 10 -> core 0 : BW 5936.42 MB/s two-contention core 0 -> core 0 : BW 9527.58 MB/s two-contention core 11 -> core 0 : BW 5892.69 MB/s two-contention core 0 -> core 0 : BW 9535.6 MB/s two-contention core 12 -> core 0 : BW 5887.66 MB/s two-contention core 0 -> core 0 : BW 9541.36 MB/s two-contention core 13 -> core 0 : BW 5913.78 MB/s two-contention core 0 -> core 0 : BW 9545.8 MB/s two-contention core 14 -> core 0 : BW 5881.52 MB/s two-contention core 0 -> core 0 : BW 9547.44 MB/s two-contention core 15 -> core 0 : BW 5911.23 MB/s two-contention core 0 -> core 0 : BW 9462 MB/s two-contention core 16 -> core 0 : BW 5872.03 MB/s two-contention core 0 -> core 0 : BW 9437.58 MB/s two-contention core 17 -> core 0 : BW 5868.85 MB/s two-contention core 0 -> core 0 : BW 9460.75 MB/s two-contention core 18 -> core 0 : BW 5882.61 MB/s two-contention core 0 -> core 0 : BW 9470.81 MB/s two-contention core 19 -> core 0 : BW 5882.17 MB/s two-contention core 0 -> core 0 : BW 5359.29 MB/s two-contention core 20 -> core 0 : BW 5359.29 
MB/s two-contention core 0 -> core 0 : BW 9561.09 MB/s two-contention core 21 -> core 0 : BW 9560.84 MB/s two-contention core 0 -> core 0 : BW 9413.73 MB/s two-contention core 22 -> core 0 : BW 9416.16 MB/s two-contention core 0 -> core 0 : BW 9300.45 MB/s two-contention core 23 -> core 0 : BW 9300.79 MB/s two-contention core 0 -> core 0 : BW 9565.24 MB/s two-contention core 24 -> core 0 : BW 9565.53 MB/s two-contention core 0 -> core 0 : BW 9402.21 MB/s two-contention core 25 -> core 0 : BW 9402 MB/s two-contention core 0 -> core 0 : BW 9517.25 MB/s two-contention core 26 -> core 0 : BW 9517.51 MB/s two-contention core 0 -> core 0 : BW 9618.22 MB/s two-contention core 27 -> core 0 : BW 9618.5 MB/s two-contention core 0 -> core 0 : BW 9571.53 MB/s two-contention core 28 -> core 0 : BW 9571.78 MB/s two-contention core 0 -> core 0 : BW 9582.94 MB/s two-contention core 29 -> core 0 : BW 9583.24 MB/s two-contention core 0 -> core 0 : BW 9433.29 MB/s two-contention core 30 -> core 0 : BW 5855.02 MB/s two-contention core 0 -> core 0 : BW 9435.66 MB/s two-contention core 31 -> core 0 : BW 5920.6 MB/s two-contention core 0 -> core 0 : BW 9451.84 MB/s two-contention core 32 -> core 0 : BW 5917.47 MB/s two-contention core 0 -> core 0 : BW 9419.94 MB/s two-contention core 33 -> core 0 : BW 5913.4 MB/s two-contention core 0 -> core 0 : BW 9437.07 MB/s two-contention core 34 -> core 0 : BW 5933.62 MB/s two-contention core 0 -> core 0 : BW 9447.02 MB/s two-contention core 35 -> core 0 : BW 5921.93 MB/s two-contention core 0 -> core 0 : BW 9445.1 MB/s two-contention core 36 -> core 0 : BW 5888.16 MB/s two-contention core 0 -> core 0 : BW 9561.59 MB/s two-contention core 37 -> core 0 : BW 5881.14 MB/s two-contention core 0 -> core 0 : BW 9457.36 MB/s two-contention core 38 -> core 0 : BW 5880.95 MB/s two-contention core 0 -> core 0 : BW 9436.63 MB/s two-contention core 39 -> core 0 : BW 5890.23 MB/s
Tests based on numatest.cpp
by James Brock
http://stackoverflow.com/questions/7259363/measuring-numa-non-uniform-memory-access-no-observable-asymmetry-why
Changes by Andreas Kloeckner, 10/2012:
- Rewritten in C + OpenMP
- Added contention tests