#!/usr/bin/env python
# coding: utf-8

# # NUMA and Bandwidths

# In[1]:


# Start from an empty scratch directory: wipe any previous tmp/ and recreate it.
get_ipython().system('rm -Rf tmp; mkdir -p tmp')


# ## Gathering Information

# In[2]:


# Write tmp/numa-info.c: reports the task CPU count, whether NUMA is available,
# and for every NUMA node its CPU mask and memory size in GiB.
get_ipython().run_cell_magic('writefile', 'tmp/numa-info.c', r'''
#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>
#include <stdio.h>
#include <pthread.h>
#include <omp.h>
#include <assert.h>
#include "timing.h"


/* Print the first nbits bits of bm (at most bm->size) as 0/1 characters. */
void print_bitmask(const struct bitmask *bm, size_t nbits)
{
  for(size_t i=0; i<nbits && i<bm->size; ++i)
    printf("%d", numa_bitmask_isbitset(bm, i));
}


int main(int argc, const char **argv)
{
  /* numa_available() has to be the first libnuma call: when it returns -1
     the behavior of every other libnuma function is undefined. */
  int available = numa_available();
  if (available < 0)
  {
    printf("numa available: %d\n", available);
    return 1;
  }

  int num_cpus = numa_num_task_cpus();
  printf("num cpus: %d\n", num_cpus);

  printf("numa available: %d\n", available);
  numa_set_localalloc();

  /* numa_node_to_cpus() requires a mask that can hold every possible CPU;
     numa_allocate_cpumask() is the documented way to obtain one. Only the
     configured CPUs are printed so the output stays compact. */
  struct bitmask *bm = numa_allocate_cpumask();
  size_t num_configured_cpus = numa_num_configured_cpus();
  for (int i=0; i<=numa_max_node(); ++i)
  {
    numa_node_to_cpus(i, bm);
    printf("numa node %d ", i);
    print_bitmask(bm, num_configured_cpus);
    printf(" - %g GiB\n", numa_node_size(i, 0) / (1024.*1024*1024.));
  }
  numa_free_cpumask(bm);

  return 0;
}
''')


# In[3]:


# Build numa-info inside tmp/ and run it only when the build succeeded, so a
# failed compile is never followed by the output of a stale binary.
get_ipython().system('(cd tmp && gcc -O3 -std=gnu99 -fopenmp -I.. numa-info.c -onuma-info -lrt -lnuma) && ./tmp/numa-info')


# ## A Shared Header

# In[4]:


# Write tmp/numatest.h: the helpers shared by the bandwidth tests
# (thread pinning and a timed strided-access loop).
get_ipython().run_cell_magic('writefile', 'tmp/numatest.h', r'''
#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <omp.h>
#include <assert.h>
#include "timing.h"

/* Restrict the calling thread to the single CPU `core`.
   A failure is reported on stderr instead of being silently ignored,
   because an unpinned thread makes the per-core results misleading. */
void pin_to_core(size_t core)
{
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(core, &cpuset);
  /* pthread_setaffinity_np() returns the error number directly. */
  int err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  if (err != 0)
    fprintf(stderr, "warning: could not pin thread to core %zu: %s\n",
        core, strerror(err));
}

/* Time ntrips passes over the array_size-byte buffer x.
   Each pass performs array_size one-byte read-modify-write accesses;
   consecutive accesses are 1009 bytes apart (wrapping modulo array_size).
   Returns timestamp_diff_in_seconds() between the start and end stamps. */
double measure_access(void *x, size_t array_size, size_t ntrips)
{
  timestamp_type t1;
  get_timestamp(&t1);

  for (size_t i = 0; i<ntrips; ++i)
    for(size_t j = 0; j<array_size; ++j)
    {
      *(((char*)x) + ((j * 1009) % array_size)) += 1;
    }

  timestamp_type t2;
  get_timestamp(&t2);

  return timestamp_diff_in_seconds(t1, t2);
}
''')


# ## On- and Off-Node Bandwidths

# In[5]:


# Write tmp/numa-bw-seq.c: one thread at a time accesses a buffer allocated by
# thread 0, and the achieved bandwidth is printed per thread.
get_ipython().run_cell_magic('writefile', 'tmp/numa-bw-seq.c', r'''
#include "numatest.h"

int main(int argc, const char **argv)
{
  // numa_available() must be called (and succeed) before any other
  // libnuma function is used.
  if (numa_available() < 0)
  {
    fprintf(stderr, "NUMA is not available on this system\n");
    return 1;
  }

  int num_cpus = numa_num_task_cpus();
  numa_set_localalloc();

  char *x;
  const size_t cache_line_size = 64;
  const size_t array_size = 100*1000*1000;
  size_t ntrips = 2;

#pragma omp parallel
  {
    assert(omp_get_num_threads() == num_cpus);
    int tid = omp_get_thread_num();

    pin_to_core(tid);
    if(tid == 0)
      x = (char *) numa_alloc_local(array_size);

    // {{{ single access
    // The barrier makes the allocation by thread 0 visible to everyone.
#pragma omp barrier
    // i is an int so that it matches both tid and the %d conversion below.
    for (int i = 0; i<num_cpus; ++i)
    {
      if (tid == i)
      {
        double t = measure_access(x, array_size, ntrips);
        printf("sequential core %d -> core 0 : BW %g MB/s\n",
            i, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
    }
    // }}}
  }
  numa_free(x, array_size);

  return 0;
}
''')


# In[6]:


# Build numa-bw-seq inside tmp/ and run it only when the build succeeded, so a
# failed compile is never followed by the output of a stale binary.
get_ipython().system('(cd tmp && gcc -O3 -std=gnu99 -fopenmp -I.. numa-bw-seq.c -onuma-bw-seq -lrt -lnuma) && ./tmp/numa-bw-seq')


# ## Contention: Everybody

# In[7]:


# Write tmp/numa-bw-all.c: every thread accesses the buffer allocated by
# thread 0 at the same time, then each thread prints its own bandwidth.
get_ipython().run_cell_magic('writefile', 'tmp/numa-bw-all.c', r'''
#include "numatest.h"

int main(int argc, const char **argv)
{
  // numa_available() must be called (and succeed) before any other
  // libnuma function is used.
  if (numa_available() < 0)
  {
    fprintf(stderr, "NUMA is not available on this system\n");
    return 1;
  }

  int num_cpus = numa_num_task_cpus();
  numa_set_localalloc();

  char *x;
  const size_t cache_line_size = 64;
  const size_t array_size = 100*1000*1000;
  size_t ntrips = 2;

#pragma omp parallel
  {
    assert(omp_get_num_threads() == num_cpus);
    int tid = omp_get_thread_num();

    pin_to_core(tid);
    if(tid == 0)
      x = (char *) numa_alloc_local(array_size);

    // {{{ everybody contends for one

    {
      if (tid == 0) puts("");

#pragma omp barrier
      double t = measure_access(x, array_size, ntrips);
#pragma omp barrier
      // Print one thread per iteration so the lines come out in tid order.
      for (int i = 0; i<num_cpus; ++i)
      {
        if (tid == i)
          printf("all-contention core %d -> core 0 : BW %g MB/s\n",
              tid, array_size*ntrips*cache_line_size / t / 1e6);
#pragma omp barrier
      }
    }

    // }}}

  }
  numa_free(x, array_size);

  return 0;
}
''')


# In[9]:


# Build numa-bw-all inside tmp/ and run it only when the build succeeded, so a
# failed compile is never followed by the output of a stale binary.
get_ipython().system('(cd tmp && gcc -O3 -std=gnu99 -fopenmp -I.. numa-bw-all.c -onuma-bw-all -lrt -lnuma) && ./tmp/numa-bw-all')


# ## Contention: Pairs

# In[10]:


# Write tmp/numa-bw-two.c: thread 0 and one other thread at a time access the
# buffer allocated by thread 0 concurrently; both bandwidths are printed.
get_ipython().run_cell_magic('writefile', 'tmp/numa-bw-two.c', r'''
#include "numatest.h"

int main(int argc, const char **argv)
{
  // numa_available() must be called (and succeed) before any other
  // libnuma function is used.
  if (numa_available() < 0)
  {
    fprintf(stderr, "NUMA is not available on this system\n");
    return 1;
  }

  int num_cpus = numa_num_task_cpus();
  numa_set_localalloc();

  char *x;
  const size_t cache_line_size = 64;
  const size_t array_size = 100*1000*1000;
  size_t ntrips = 2;

#pragma omp parallel
  {
    assert(omp_get_num_threads() == num_cpus);
    int tid = omp_get_thread_num();

    pin_to_core(tid);
    if(tid == 0)
      x = (char *) numa_alloc_local(array_size);

    // {{{ zero and someone else contending

    if (tid == 0) puts("");

#pragma omp barrier
    for (int i = 1; i<num_cpus; ++i)
    {
      // t is only written and read by the two threads taking part in
      // this round (thread 0 and thread i).
      double t;
      if (tid == i || tid == 0)
        t = measure_access(x, array_size, ntrips);

#pragma omp barrier
      if (tid == 0)
      {
        printf("two-contention core %d -> core 0 : BW %g MB/s\n",
            tid, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
      if (tid == i)
      {
        printf("two-contention core %d -> core 0 : BW %g MB/s\n\n",
            tid, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
    }
  }
  numa_free(x, array_size);

  return 0;
}
''')


# In[11]:


# Build numa-bw-two inside tmp/ and run it only when the build succeeded, so a
# failed compile is never followed by the output of a stale binary.
get_ipython().system('(cd tmp && gcc -O3 -std=gnu99 -fopenmp -I.. numa-bw-two.c -onuma-bw-two -lrt -lnuma) && ./tmp/numa-bw-two')


# Tests based on `numatest.cpp` by James Brock
# http://stackoverflow.com/questions/7259363/measuring-numa-non-uniform-memory-access-no-observable-asymmetry-why
# 
# Changes by Andreas Kloeckner, 10/2012:
# - Rewritten in C + OpenMP
# - Added contention tests
