Modern multi-socket servers use Non-Uniform Memory Access (NUMA) architecture, where memory access times depend on which CPU socket accesses which memory bank. Ignoring NUMA can cause severe performance degradation—I've seen 3-4x slowdowns in production trading systems. This article shows you how to write NUMA-aware code that extracts maximum performance from multi-socket hardware.
In a NUMA system:
Example topology of a dual-socket server:
   Socket 0             Socket 1
┌─────────────┐      ┌─────────────┐
│  CPU 0-15   │      │  CPU 16-31  │
│  (cores)    │      │  (cores)    │
└──────┬──────┘      └──────┬──────┘
       │                    │
       ├────QPI/UPI Link────┤
       │                    │
┌──────┴──────┐      ┌──────┴──────┐
│   Memory    │      │   Memory    │
│   Bank 0    │      │   Bank 1    │
│    64GB     │      │    64GB     │
└─────────────┘      └─────────────┘
First, let's query the system topology:
#include <numa.h>

#include <exception>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <mutex>
#include <new>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
4
5struct NUMATopology {
6 int num_nodes;
7 int num_cpus;
8 std::vector<std::vector<int>> node_cpus;
9 std::vector<size_t> node_memory_gb;
10
11 static NUMATopology detect() {
12 if (numa_available() < 0) {
13 throw std::runtime_error("NUMA not available");
14 }
15
16 NUMATopology topo;
17 topo.num_nodes = numa_num_configured_nodes();
18 topo.num_cpus = numa_num_configured_cpus();
19
20 topo.node_cpus.resize(topo.num_nodes);
21 topo.node_memory_gb.resize(topo.num_nodes);
22
23 for (int node = 0; node < topo.num_nodes; ++node) {
24 // Get CPUs for this node
25 struct bitmask* cpus = numa_allocate_cpumask();
26 numa_node_to_cpus(node, cpus);
27
28 for (int cpu = 0; cpu < topo.num_cpus; ++cpu) {
29 if (numa_bitmask_isbitset(cpus, cpu)) {
30 topo.node_cpus[node].push_back(cpu);
31 }
32 }
33
34 numa_free_cpumask(cpus);
35
36 // Get memory size
37 long long free_mem;
38 long long node_size = numa_node_size64(node, &free_mem);
39 topo.node_memory_gb[node] = node_size / (1024 * 1024 * 1024);
40 }
41
42 return topo;
43 }
44
45 void print() const {
46 std::cout << "NUMA Topology:\n";
47 std::cout << " Nodes: " << num_nodes << "\n";
48 std::cout << " CPUs: " << num_cpus << "\n";
49
50 for (int node = 0; node < num_nodes; ++node) {
51 std::cout << " Node " << node << ":\n";
52 std::cout << " CPUs: ";
53 for (int cpu : node_cpus[node]) {
54 std::cout << cpu << " ";
55 }
56 std::cout << "\n Memory: " << node_memory_gb[node] << " GB\n";
57 }
58 }
59};
Allocate memory on the same node as the accessing thread:
1#include <numa.h>
2#include <numaif.h>
3
4class NUMAAllocator {
5public:
6 // Allocate on current node
7 static void* allocate_local(size_t size) {
8 int node = numa_node_of_cpu(sched_getcpu());
9 return numa_alloc_onnode(size, node);
10 }
11
12 // Allocate on specific node
13 static void* allocate_on_node(size_t size, int node) {
14 return numa_alloc_onnode(size, node);
15 }
16
17 // Free NUMA memory
18 static void deallocate(void* ptr, size_t size) {
19 numa_free(ptr, size);
20 }
21
22 // Allocate interleaved across all nodes
23 static void* allocate_interleaved(size_t size) {
24 return numa_alloc_interleaved(size);
25 }
26};
27
28// RAII wrapper
29template<typename T>
30class NUMAMemory {
31private:
32 T* ptr_;
33 size_t size_;
34 int node_;
35
36public:
37 explicit NUMAMemory(size_t count, int node = -1)
38 : size_(count * sizeof(T)) {
39
40 if (node < 0) {
41 node_ = numa_node_of_cpu(sched_getcpu());
42 } else {
43 node_ = node;
44 }
45
46 ptr_ = static_cast<T*>(numa_alloc_onnode(size_, node_));
47 if (!ptr_) {
48 throw std::bad_alloc();
49 }
50
51 // Initialize memory (first-touch policy)
52 std::fill(ptr_, ptr_ + count, T{});
53 }
54
55 ~NUMAMemory() {
56 if (ptr_) {
57 numa_free(ptr_, size_);
58 }
59 }
60
61 T* get() { return ptr_; }
62 const T* get() const { return ptr_; }
63 int node() const { return node_; }
64
65 // Move-only
66 NUMAMemory(NUMAMemory&& other) noexcept
67 : ptr_(other.ptr_), size_(other.size_), node_(other.node_) {
68 other.ptr_ = nullptr;
69 }
70
71 NUMAMemory(const NUMAMemory&) = delete;
72 NUMAMemory& operator=(const NUMAMemory&) = delete;
73};
By default, Linux uses a first-touch allocation policy: a page is physically allocated on the NUMA node of the CPU that first touches (writes) it:
1// Wrong: allocate then initialize from different threads
2void* data = malloc(1024 * 1024 * 1024); // Allocated on node 0
3#pragma omp parallel for
4for (int i = 0; i < 1024*1024*1024; ++i) {
5 // Threads on node 1 get remote access!
6 ((char*)data)[i] = 0;
7}
8
9// Right: initialize from the threads that will use it
10void* data = mmap(...); // Reserve address space
11#pragma omp parallel
12{
13 int tid = omp_get_thread_num();
14 int chunk_size = total_size / num_threads;
15
16 // Each thread initializes its chunk
17 // Memory is allocated locally due to first-touch
18 memset(data + tid * chunk_size, 0, chunk_size);
19}
Pin threads to specific NUMA nodes:
1#include <pthread.h>
2#include <sched.h>
3
4class ThreadAffinity {
5public:
6 // Pin current thread to a specific CPU
7 static void pin_to_cpu(int cpu) {
8 cpu_set_t cpuset;
9 CPU_ZERO(&cpuset);
10 CPU_SET(cpu, &cpuset);
11
12 pthread_t thread = pthread_self();
13 if (pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset) != 0) {
14 throw std::runtime_error("Failed to set CPU affinity");
15 }
16 }
17
18 // Pin current thread to a NUMA node (any CPU on that node)
19 static void pin_to_node(int node) {
20 struct bitmask* cpus = numa_allocate_cpumask();
21 numa_node_to_cpus(node, cpus);
22 numa_sched_setaffinity(0, cpus);
23 numa_free_cpumask(cpus);
24 }
25
26 // Get current CPU
27 static int get_cpu() {
28 return sched_getcpu();
29 }
30
31 // Get current NUMA node
32 static int get_node() {
33 return numa_node_of_cpu(sched_getcpu());
34 }
35};
361template<typename T, size_t Capacity>
2class NUMARingBuffer {
3private:
4 struct NodeBuffer {
5 alignas(64) std::atomic<size_t> head{0};
6 alignas(64) std::atomic<size_t> tail{0};
7 T* data;
8 int node;
9
10 NodeBuffer(int n) : node(n) {
11 data = static_cast<T*>(numa_alloc_onnode(
12 sizeof(T) * Capacity, node));
13 }
14
15 ~NodeBuffer() {
16 numa_free(data, sizeof(T) * Capacity);
17 }
18 };
19
20 std::vector<std::unique_ptr<NodeBuffer>> node_buffers_;
21 int num_nodes_;
22
23public:
24 NUMARingBuffer() {
25 num_nodes_ = numa_num_configured_nodes();
26
27 for (int i = 0; i < num_nodes_; ++i) {
28 node_buffers_.push_back(std::make_unique<NodeBuffer>(i));
29 }
30 }
31
32 // Push to local node's buffer
33 bool push(const T& item) {
34 int node = numa_node_of_cpu(sched_getcpu());
35 auto& buffer = *node_buffers_[node];
36
37 size_t tail = buffer.tail.load(std::memory_order_relaxed);
38 size_t next_tail = (tail + 1) % Capacity;
39
40 if (next_tail == buffer.head.load(std::memory_order_acquire)) {
41 return false; // Full
42 }
43
44 buffer.data[tail] = item;
45 buffer.tail.store(next_tail, std::memory_order_release);
46 return true;
47 }
48
49 // Pop from all nodes (typically called by consumer thread)
50 bool pop(T& item) {
51 int consumer_node = numa_node_of_cpu(sched_getcpu());
52
53 // Try local node first
54 if (try_pop_from_node(consumer_node, item)) {
55 return true;
56 }
57
58 // Try remote nodes
59 for (int node = 0; node < num_nodes_; ++node) {
60 if (node != consumer_node && try_pop_from_node(node, item)) {
61 return true;
62 }
63 }
64
65 return false;
66 }
67
68private:
69 bool try_pop_from_node(int node, T& item) {
70 auto& buffer = *node_buffers_[node];
71
72 size_t head = buffer.head.load(std::memory_order_relaxed);
73 if (head == buffer.tail.load(std::memory_order_acquire)) {
74 return false; // Empty
75 }
76
77 item = buffer.data[head];
78 buffer.head.store((head + 1) % Capacity, std::memory_order_release);
79 return true;
80 }
81};
821template<typename K, typename V>
2class NUMAHashTable {
3private:
4 static constexpr size_t BUCKETS_PER_NODE = 1024;
5
6 struct Bucket {
7 alignas(64) std::mutex mutex;
8 std::vector<std::pair<K, V>> entries;
9 };
10
11 struct NodeTable {
12 Bucket* buckets;
13 int node;
14
15 NodeTable(int n) : node(n) {
16 buckets = static_cast<Bucket*>(
17 numa_alloc_onnode(sizeof(Bucket) * BUCKETS_PER_NODE, node));
18
19 // Placement new
20 for (size_t i = 0; i < BUCKETS_PER_NODE; ++i) {
21 new (&buckets[i]) Bucket();
22 }
23 }
24
25 ~NodeTable() {
26 for (size_t i = 0; i < BUCKETS_PER_NODE; ++i) {
27 buckets[i].~Bucket();
28 }
29 numa_free(buckets, sizeof(Bucket) * BUCKETS_PER_NODE);
30 }
31 };
32
33 std::vector<std::unique_ptr<NodeTable>> node_tables_;
34 int num_nodes_;
35 std::hash<K> hasher_;
36
37public:
38 NUMAHashTable() {
39 num_nodes_ = numa_num_configured_nodes();
40 for (int i = 0; i < num_nodes_; ++i) {
41 node_tables_.push_back(std::make_unique<NodeTable>(i));
42 }
43 }
44
45 void insert(const K& key, const V& value) {
46 size_t hash = hasher_(key);
47 int node = hash % num_nodes_;
48 size_t bucket_idx = (hash / num_nodes_) % BUCKETS_PER_NODE;
49
50 auto& bucket = node_tables_[node]->buckets[bucket_idx];
51 std::lock_guard<std::mutex> lock(bucket.mutex);
52
53 for (auto& entry : bucket.entries) {
54 if (entry.first == key) {
55 entry.second = value;
56 return;
57 }
58 }
59
60 bucket.entries.emplace_back(key, value);
61 }
62
63 bool find(const K& key, V& value) const {
64 size_t hash = hasher_(key);
65 int node = hash % num_nodes_;
66 size_t bucket_idx = (hash / num_nodes_) % BUCKETS_PER_NODE;
67
68 auto& bucket = node_tables_[node]->buckets[bucket_idx];
69 std::lock_guard<std::mutex> lock(bucket.mutex);
70
71 for (const auto& entry : bucket.entries) {
72 if (entry.first == key) {
73 value = entry.second;
74 return true;
75 }
76 }
77
78 return false;
79 }
80};
Here's a NUMA-aware order matching engine:
// A single order as submitted to the matching engine.
struct Order {
    uint64_t order_id;
    uint32_t symbol_id;  // used to route the order to a NUMA node
    double price;        // price level the order rests at
    uint32_t quantity;
    bool is_buy;         // true: goes to the bid side, false: ask side
};
8
9class NUMAOrderBook {
10private:
11 struct NodeOrderBook {
12 std::map<double, std::vector<Order>> bids;
13 std::map<double, std::vector<Order>> asks;
14 std::mutex mutex;
15 int node;
16 uint64_t orders_processed{0};
17
18 NodeOrderBook(int n) : node(n) {}
19 };
20
21 std::vector<std::unique_ptr<NodeOrderBook>> node_books_;
22 int num_nodes_;
23
24public:
25 NUMAOrderBook() {
26 num_nodes_ = numa_num_configured_nodes();
27
28 for (int node = 0; node < num_nodes_; ++node) {
29 // Allocate book structure on each node
30 void* mem = numa_alloc_onnode(sizeof(NodeOrderBook), node);
31 node_books_.push_back(
32 std::unique_ptr<NodeOrderBook>(
33 new (mem) NodeOrderBook(node)));
34 }
35 }
36
37 void add_order(const Order& order) {
38 // Route order to node based on symbol
39 int node = order.symbol_id % num_nodes_;
40 auto& book = *node_books_[node];
41
42 std::lock_guard<std::mutex> lock(book.mutex);
43
44 auto& level = order.is_buy ?
45 book.bids[order.price] :
46 book.asks[order.price];
47
48 level.push_back(order);
49 book.orders_processed++;
50
51 // Try to match
52 match_orders(book);
53 }
54
55private:
56 void match_orders(NodeOrderBook& book) {
57 while (!book.bids.empty() && !book.asks.empty()) {
58 auto best_bid = book.bids.rbegin();
59 auto best_ask = book.asks.begin();
60
61 if (best_bid->first >= best_ask->first) {
62 // Match orders
63 auto& bid_orders = best_bid->second;
64 auto& ask_orders = best_ask->second;
65
66 execute_trade(bid_orders.front(), ask_orders.front());
67
68 bid_orders.erase(bid_orders.begin());
69 ask_orders.erase(ask_orders.begin());
70
71 if (bid_orders.empty()) book.bids.erase(std::prev(book.bids.end()));
72 if (ask_orders.empty()) book.asks.erase(book.asks.begin());
73 } else {
74 break;
75 }
76 }
77 }
78
79 void execute_trade(const Order& bid, const Order& ask) {
80 // Execute on local node - no remote memory access
81 uint32_t quantity = std::min(bid.quantity, ask.quantity);
82 double price = ask.price;
83 // ... record trade
84 }
85};
861class NUMAThreadPool {
2private:
3 struct WorkerThread {
4 std::thread thread;
5 int cpu;
6 int node;
7 std::atomic<uint64_t> tasks_processed{0};
8 };
9
10 std::vector<WorkerThread> workers_;
11 NUMARingBuffer<std::function<void()>, 4096> task_queue_;
12 std::atomic<bool> running_{true};
13
14public:
15 NUMAThreadPool() {
16 auto topo = NUMATopology::detect();
17
18 for (int node = 0; node < topo.num_nodes; ++node) {
19 for (int cpu : topo.node_cpus[node]) {
20 workers_.push_back(create_worker(cpu, node));
21 }
22 }
23 }
24
25 ~NUMAThreadPool() {
26 running_ = false;
27 for (auto& worker : workers_) {
28 if (worker.thread.joinable()) {
29 worker.thread.join();
30 }
31 }
32 }
33
34 template<typename F>
35 void submit(F&& task) {
36 while (!task_queue_.push(std::forward<F>(task))) {
37 std::this_thread::yield();
38 }
39 }
40
41private:
42 WorkerThread create_worker(int cpu, int node) {
43 WorkerThread worker;
44 worker.cpu = cpu;
45 worker.node = node;
46
47 worker.thread = std::thread([this, cpu, node, &worker]() {
48 // Pin to CPU
49 ThreadAffinity::pin_to_cpu(cpu);
50
51 // Verify we're on the right node
52 if (numa_node_of_cpu(sched_getcpu()) != node) {
53 std::cerr << "Warning: thread not on expected node\n";
54 }
55
56 // Worker loop
57 while (running_) {
58 std::function<void()> task;
59 if (task_queue_.pop(task)) {
60 task();
61 worker.tasks_processed++;
62 } else {
63 _mm_pause();
64 }
65 }
66 });
67
68 return worker;
69 }
70};
711struct NUMAStats {
2 struct NodeStats {
3 uint64_t local_accesses;
4 uint64_t remote_accesses;
5 uint64_t local_misses;
6 uint64_t remote_misses;
7 double local_hit_rate;
8 double remote_access_ratio;
9 };
10
11 std::vector<NodeStats> node_stats;
12
13 static NUMAStats collect() {
14 NUMAStats stats;
15 int num_nodes = numa_num_configured_nodes();
16 stats.node_stats.resize(num_nodes);
17
18 for (int node = 0; node < num_nodes; ++node) {
19 // Read performance counters
20 // On Linux, use perf_event_open or /sys/devices/system/node/nodeX/numastat
21
22 auto& ns = stats.node_stats[node];
23 ns.local_accesses = read_counter(node, "numa_hit");
24 ns.remote_accesses = read_counter(node, "numa_foreign");
25 ns.local_misses = read_counter(node, "numa_miss");
26
27 ns.local_hit_rate = static_cast<double>(ns.local_accesses) /
28 (ns.local_accesses + ns.local_misses);
29
30 ns.remote_access_ratio = static_cast<double>(ns.remote_accesses) /
31 (ns.local_accesses + ns.remote_accesses);
32 }
33
34 return stats;
35 }
36
37private:
38 static uint64_t read_counter(int node, const char* counter) {
39 char path[256];
40 snprintf(path, sizeof(path),
41 "/sys/devices/system/node/node%d/numastat", node);
42
43 // Parse file and extract counter
44 // Simplified - real implementation would parse the file
45 return 0;
46 }
47};
Performance comparison on dual-socket Xeon server (2x 16 cores):
Configuration                  Read BW    Write BW
─────────────────────────────────────────────────────
Local access (optimal)            85.2        76.3
Remote access (suboptimal)        42.1        38.7
Interleaved                       64.3        57.5
NUMA-unaware (random)             48.6        43.2
Configuration         Insertions    Lookups
────────────────────────────────────────────
NUMA-aware                  12.4       18.7
Standard (unaware)           4.2        6.3
Speedup                    2.95x      2.97x
Metric               NUMA-aware    Standard    Improvement
─────────────────────────────────────────────────────────────
Orders/sec                 8.2M        2.9M    2.8x
P50 latency (μs)            2.4         6.8    2.8x
P99 latency (μs)           12.3        34.6    2.8x
Remote access %              8%         52%    6.5x better
# Check NUMA stats (system-wide per-node allocation counters)
numastat

# Compact per-node memory usage of processes whose name matches "qemu"
# (substitute the name or PID of your own process)
numastat -c qemu

# Profile with perf
perf stat -e node-loads,node-load-misses,node-stores,node-store-misses ./app
91// Good: Threads stay on assigned nodes
2for (int node = 0; node < num_nodes; ++node) {
3 for (int cpu : node_cpus[node]) {
4 create_worker_on_cpu(cpu, node);
5 }
6}
7
8// Bad: Threads migrate between nodes
9std::vector<std::thread> threads;
10for (int i = 0; i < num_threads; ++i) {
11 threads.emplace_back(worker_function);
12}
131// Good: Initialize from worker threads (first-touch)
2#pragma omp parallel
3{
4 int tid = omp_get_thread_num();
5 int node = tid / threads_per_node;
6
7 // Allocate and initialize on correct node
8 void* mem = numa_alloc_onnode(size, node);
9 initialize_memory(mem, size);
10}
11
12// Bad: Initialize from main thread
13void* mem = malloc(total_size);
14memset(mem, 0, total_size); // All on node 0!
After optimizing several trading systems for NUMA:
NUMA optimization can seem daunting, but the performance gains are substantial. In our market data processing system, proper NUMA awareness reduced P99 latency from 18μs to 6μs—a 3x improvement that directly translated to better execution quality.
Understanding and leveraging NUMA is no longer optional for high-performance systems—it's a requirement for extracting maximum performance from modern multi-socket servers.
Technical Writer
NordVarg Team is a software engineer at NordVarg specializing in high-performance financial systems and type-safe programming.
Get weekly insights on building high-performance financial systems, latest industry trends, and expert tips delivered straight to your inbox.