Low-Latency Systems Design: C++ vs Rust for High-Frequency Trading
Architectural patterns and implementation techniques for building sub-microsecond trading systems in C++ and Rust, with performance comparisons and trade-offs.
Architectural patterns and implementation techniques for building sub-microsecond trading systems in C++ and Rust, with performance comparisons and trade-offs.
In high-frequency trading (HFT), every nanosecond matters. The difference between a 500ns and 800ns order execution can mean millions in profit or loss. This article explores how to design and implement ultra-low-latency systems in C++ and Rust, comparing approaches and measuring real-world performance.
A typical market data-to-order pipeline has this latency budget:
Network receive: 5,000 ns (hardware, unavoidable)
Kernel processing: 2,000 ns (optimizable)
Application logic: 500 ns (our focus)
Order serialization: 300 ns (our focus)
Network send: 5,000 ns (hardware, unavoidable)
─────────────────────────────────
Total: 12,800 ns (12.8 μs)
Every nanosecond saved in application logic directly improves competitiveness.
1// BAD: Allocates on every iteration
2void process_market_data_slow(const MarketData& data) {
3 std::vector<Order> orders; // Heap allocation!
4
5 for (const auto& signal : generate_signals(data)) {
6 orders.push_back(create_order(signal)); // More allocations!
7 }
8
9 send_orders(orders);
10}
A single malloc can take 50-100ns or more. In a hot path, this is unacceptable.
// Fixed-capacity object pool: hands out pointers into an embedded array so the
// hot path never touches the heap.
//
// Not thread-safe: there is no synchronisation, so one pool must only be used
// from one thread at a time.
template<typename T, size_t Capacity>
class MemoryPool {
    alignas(64) std::array<T, Capacity> pool_;
    // Value-initialised so every slot starts out free. Without the `{}` a pool
    // with automatic storage would read indeterminate flags in allocate().
    std::array<bool, Capacity> used_{};
    size_t next_free_ = 0;  // Where the next search starts (just past the last hit)

public:
    // Returns a pointer to a free slot, or nullptr when every slot is in use.
    // Worst case scans all Capacity flags; the common case hits next_free_ at once.
    T* allocate() {
        for (size_t i = 0; i < Capacity; ++i) {
            const size_t idx = (next_free_ + i) % Capacity;
            if (!used_[idx]) {
                used_[idx] = true;
                next_free_ = (idx + 1) % Capacity;
                return &pool_[idx];
            }
        }
        return nullptr;  // Pool exhausted
    }

    // Marks the slot behind `ptr` as free again. `ptr` must be nullptr or a value
    // previously returned by allocate() on this pool.
    void deallocate(T* ptr) {
        if (ptr == nullptr) {
            return;  // Mirrors free(nullptr): allocate() may legitimately have returned nullptr
        }
        const auto offset = ptr - pool_.data();
        assert(offset >= 0 && static_cast<size_t>(offset) < Capacity);
        used_[static_cast<size_t>(offset)] = false;
    }
};
27
28// Usage
29inline MemoryPool<Order, 1024> order_pool;
30
31void process_market_data_fast(const MarketData& data) {
32 Order* orders[32];
33 size_t count = 0;
34
35 for (const auto& signal : generate_signals(data)) {
36 orders[count++] = order_pool.allocate();
37 *orders[count - 1] = create_order(signal);
38 }
39
40 send_orders(std::span(orders, count));
41
42 // Return to pool
43 for (size_t i = 0; i < count; ++i) {
44 order_pool.deallocate(orders[i]);
45 }
46}
// Append-only vector that keeps its first N elements inside the object and
// only moves to the heap once more than N elements are stored.
//
// Elements are written by assignment into storage that is never explicitly
// constructed, so T is expected to be a trivially copyable, default-
// constructible type (the spill path uses `new T[]`).
template<typename T, size_t N>
class SmallVec {
    union {
        T inline_storage[N];
        T* heap_storage;
    };
    size_t size_ = 0;
    size_t capacity_ = N;
    bool is_heap_ = false;

    // Moves the contents into a heap buffer of twice the current capacity.
    // Called both for the first spill and for every later growth step, so the
    // heap buffer can no longer be overrun.
    void grow() {
        const size_t new_capacity = capacity_ * 2;
        T* new_storage = new T[new_capacity];
        std::copy(data(), data() + size_, new_storage);
        if (is_heap_) {
            delete[] heap_storage;
        }
        heap_storage = new_storage;
        is_heap_ = true;
        capacity_ = new_capacity;
    }

public:
    SmallVec() {}

    // Deep copy: the implicitly generated copy would duplicate the raw heap
    // pointer and free it twice.
    SmallVec(const SmallVec& other) {
        for (size_t i = 0; i < other.size_; ++i) {
            push_back(other.data()[i]);
        }
    }

    SmallVec& operator=(const SmallVec& other) {
        if (this != &other) {
            size_ = 0;  // Reuse whatever storage this object already owns
            for (size_t i = 0; i < other.size_; ++i) {
                push_back(other.data()[i]);
            }
        }
        return *this;
    }

    ~SmallVec() {
        if (is_heap_) {
            delete[] heap_storage;
        }
    }

    // Appends a copy of `value`. Allocates only when the current capacity is
    // exhausted (rare in the hot path when N is sized for the typical case).
    void push_back(const T& value) {
        if (size_ == capacity_) {
            const T saved = value;  // `value` may refer to an element about to be relocated
            grow();
            heap_storage[size_++] = saved;
            return;
        }
        data()[size_++] = value;
    }

    T* data() { return is_heap_ ? heap_storage : inline_storage; }
    const T* data() const { return is_heap_ ? heap_storage : inline_storage; }
    size_t size() const { return size_; }
};
40
41// Usage - no heap allocation for typical case
42void process_market_data_optimized(const MarketData& data) {
43 SmallVec<Order, 16> orders; // 16 orders fit on stack
44
45 for (const auto& signal : generate_signals(data)) {
46 orders.push_back(create_order(signal));
47 }
48
49 send_orders(std::span(orders.data(), orders.size()));
50}
511use typed_arena::Arena;
2
/// Order record made only of fixed-size fields (byte array, integers, enum).
struct Order {
    /// Instrument identifier stored inline as 8 bytes.
    symbol: [u8; 8],
    price: i64,
    quantity: i64,
    side: Side,
}

/// Direction of an order.
enum Side {
    Buy,
    Sell,
}
11
12fn process_market_data_rust(data: &MarketData) {
13 // Arena allocates in chunks, very fast
14 let arena = Arena::new();
15 let mut orders = Vec::new();
16
17 for signal in generate_signals(data) {
18 let order = arena.alloc(create_order(&signal));
19 orders.push(order);
20 }
21
22 send_orders(&orders);
23 // Arena deallocates everything at once when dropped
24}
25
26// Or use stack with SmallVec
27use smallvec::{SmallVec, smallvec};
28
29fn process_market_data_stack(data: &MarketData) {
30 // Up to 16 orders on stack
31 let mut orders: SmallVec<[Order; 16]> = smallvec![];
32
33 for signal in generate_signals(data) {
34 orders.push(create_order(&signal));
35 }
36
37 send_orders(&orders);
38}
Modern CPUs have 64-byte cache lines. Accessing data that fits in one cache line is ~3x faster than spanning multiple lines.
// BAD: Fields scattered across cache lines
// (Deliberate counter-example: the layout is kept as-is for comparison.)
struct MarketData {
    double bid;          // 8 bytes
    double ask;          // 8 bytes
    int64_t bid_size;    // 8 bytes
    int64_t ask_size;    // 8 bytes
    std::string symbol;  // 32 bytes with libstdc++ (size varies by standard library); long names live on the heap!
    int64_t timestamp;   // 8 bytes
    // Total: 72 bytes with a 32-byte std::string, plus a possible heap allocation
};
11
// GOOD: Fits in one cache line
// All fields are fixed-size and stored inline; prices are fixed-point integers.
struct alignas(64) MarketData {
    uint64_t timestamp;    // 8 bytes  (offset 0)
    int64_t bid_price;     // 8 bytes  (offset 8, fixed-point)
    int64_t ask_price;     // 8 bytes  (offset 16)
    int32_t bid_size;      // 4 bytes  (offset 24)
    int32_t ask_size;      // 4 bytes  (offset 28)
    char symbol[12];       // 12 bytes (offset 32, inline)
    uint16_t flags;        // 2 bytes  (offset 44)
    // The fields above occupy 46 bytes, so 18 bytes of explicit padding are
    // needed to reach 64. (With 14 the struct only reached 64 through
    // implicit tail padding added by alignas.)
    uint8_t _padding[18];  // Pad to 64 bytes
    // Total: exactly 64 bytes, one cache line
};

static_assert(sizeof(MarketData) == 64);
static_assert(alignof(MarketData) == 64);
// BAD: Threads writing to adjacent memory
// (Deliberate counter-example: the layout is kept as-is for comparison.)
struct Stats {
    std::atomic<uint64_t> thread1_count;  // bytes 0-7
    std::atomic<uint64_t> thread2_count;  // bytes 8-15
    // Both counters sit in the same 64-byte cache line - false sharing!
};
7
// GOOD: Each atomic on its own cache line
// Counters start at zero; before C++20 a default-constructed std::atomic
// holds an indeterminate value, so the initialisers are required.
struct alignas(64) Stats {
    alignas(64) std::atomic<uint64_t> thread1_count{0};  // bytes 0-63 (8 used)
    alignas(64) std::atomic<uint64_t> thread2_count{0};  // bytes 64-127 (8 used)
};
131use std::sync::atomic::{AtomicU64, Ordering};
2
/// Wrapper whose alignment is forced to 64 bytes, so two wrapped values never
/// start inside the same 64-byte block.
#[repr(align(64))]
struct CacheLinePadded<T> {
    value: T,
}

/// Two counters, each wrapped so it gets its own 64-byte aligned slot.
struct Stats {
    thread1_count: CacheLinePadded<AtomicU64>,
    thread2_count: CacheLinePadded<AtomicU64>,
}

impl Stats {
    /// Creates the counters with both values set to zero.
    fn new() -> Self {
        let zeroed = || CacheLinePadded {
            value: AtomicU64::new(0),
        };

        Self {
            thread1_count: zeroed(),
            thread2_count: zeroed(),
        }
    }
}
// Bounded ring-buffer queue for exactly one producer thread (try_push) and
// one consumer thread (try_pop). Neither call blocks or allocates.
//
// One slot is always left empty to tell "full" from "empty", so the queue
// holds at most Capacity - 1 items.
template<typename T, size_t Capacity>
class alignas(64) SPSCQueue {
    // Capacity 0 and 1 both pass the power-of-two test below, but would give
    // an out-of-range index mask or a queue that can never hold an item.
    static_assert(Capacity >= 2, "Capacity must be at least 2");
    static_assert((Capacity & (Capacity - 1)) == 0, "Must be power of 2");

    static constexpr size_t kIndexMask = Capacity - 1;  // Wrap-around via AND instead of modulo

    std::array<T, Capacity> buffer_;

    // Producer and consumer on separate cache lines
    alignas(64) std::atomic<size_t> write_pos_{0};  // Written only by the producer
    alignas(64) std::atomic<size_t> read_pos_{0};   // Written only by the consumer

public:
    // Producer side. Copies `item` into the queue; returns false when full.
    bool try_push(const T& item) {
        // Relaxed: only the producer ever writes write_pos_.
        const size_t write = write_pos_.load(std::memory_order_relaxed);
        const size_t next_write = (write + 1) & kIndexMask;

        // Acquire pairs with the consumer's release store, so the slot being
        // reused has been fully read before it is overwritten.
        if (next_write == read_pos_.load(std::memory_order_acquire)) {
            return false;  // Queue full
        }

        buffer_[write] = item;
        // Release publishes the element before the new index becomes visible.
        write_pos_.store(next_write, std::memory_order_release);
        return true;
    }

    // Consumer side. Copies the oldest element into `item`; returns false
    // (leaving `item` untouched) when the queue is empty.
    bool try_pop(T& item) {
        // Relaxed: only the consumer ever writes read_pos_.
        const size_t read = read_pos_.load(std::memory_order_relaxed);

        // Acquire pairs with the producer's release store above.
        if (read == write_pos_.load(std::memory_order_acquire)) {
            return false;  // Queue empty
        }

        item = buffer_[read];
        read_pos_.store((read + 1) & kIndexMask, std::memory_order_release);
        return true;
    }
};
371use crossbeam::queue::ArrayQueue;
2
3struct MessageQueue {
4 queue: ArrayQueue<Order>,
5}
6
7impl MessageQueue {
8 fn new(capacity: usize) -> Self {
9 Self {
10 queue: ArrayQueue::new(capacity),
11 }
12 }
13
14 fn try_send(&self, order: Order) -> Result<(), Order> {
15 self.queue.push(order)
16 }
17
18 fn try_recv(&self) -> Option<Order> {
19 self.queue.pop()
20 }
21}
22
23// Usage in trading thread
24fn trading_loop(queue: &MessageQueue) {
25 loop {
26 if let Some(order) = queue.try_recv() {
27 process_order(order);
28 }
29
30 // Avoid spinning - use futex for sleeping
31 std::hint::spin_loop();
32 }
33}
341#include <pthread.h>
2#include <sched.h>
3
// Restricts the calling thread to the single CPU `core_id` and then requests
// SCHED_FIFO scheduling at priority 99 for it.
//
// Throws std::runtime_error if the affinity call fails. The scheduling request
// is best-effort: its return code is deliberately not turned into an error,
// so on failure the thread keeps its previous policy (presumably the usual
// outcome without real-time privileges - confirm for the target deployment).
void pin_thread_to_core(int core_id) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core_id, &cpuset);

    const pthread_t current_thread = pthread_self();
    const int result = pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);

    if (result != 0) {
        throw std::runtime_error("Failed to set thread affinity");
    }

    // Set realtime priority
    sched_param param{};  // Zero-initialise so no unset field is passed to the call
    param.sched_priority = 99;
    static_cast<void>(pthread_setschedparam(current_thread, SCHED_FIFO, &param));
}
21
22// Usage
23void market_data_thread() {
24 pin_thread_to_core(2); // Dedicated core
25
26 while (running) {
27 // Process market data
28 }
29}
301use core_affinity;
2
3fn market_data_thread() {
4 // Pin to core 2
5 let core_ids = core_affinity::get_core_ids().unwrap();
6 core_affinity::set_for_current(core_ids[2]);
7
8 loop {
9 // Process market data
10 }
11}
121#include <immintrin.h>
2
// Computes mids[i] = (bids[i] + asks[i]) * 0.5 for i in [0, count).
//
// When the translation unit is built with AVX enabled (__AVX__ defined), four
// doubles are processed per iteration using unaligned loads and stores; any
// tail elements are handled by the scalar loop. Without AVX the scalar loop
// handles everything, so the function builds with any flags.
//
// All three arrays must hold at least `count` elements.
void calculate_mid_prices_simd(
    const double* bids,
    const double* asks,
    double* mids,
    size_t count
) {
    size_t i = 0;

#if defined(__AVX__)
    // Broadcast 0.5 once instead of rebuilding it on every iteration.
    const __m256d half = _mm256_set1_pd(0.5);

    // Process 4 doubles at a time with AVX
    for (; i + 4 <= count; i += 4) {
        const __m256d bid_vec = _mm256_loadu_pd(&bids[i]);
        const __m256d ask_vec = _mm256_loadu_pd(&asks[i]);

        const __m256d sum = _mm256_add_pd(bid_vec, ask_vec);
        const __m256d mid = _mm256_mul_pd(sum, half);

        _mm256_storeu_pd(&mids[i], mid);
    }
#endif

    // Handle remainder (or everything, when AVX is not enabled)
    for (; i < count; ++i) {
        mids[i] = (bids[i] + asks[i]) * 0.5;
    }
}
281use packed_simd::f64x4;
2
3fn calculate_mid_prices_simd(
4 bids: &[f64],
5 asks: &[f64],
6 mids: &mut [f64],
7) {
8 let chunks = bids.len() / 4;
9
10 for i in 0..chunks {
11 let bid_vec = f64x4::from_slice_unaligned(&bids[i * 4..]);
12 let ask_vec = f64x4::from_slice_unaligned(&asks[i * 4..]);
13
14 let sum = bid_vec + ask_vec;
15 let mid = sum * 0.5;
16
17 mid.write_to_slice_unaligned(&mut mids[i * 4..]);
18 }
19
20 // Handle remainder
21 let remainder_start = chunks * 4;
22 for i in remainder_start..bids.len() {
23 mids[i] = (bids[i] + asks[i]) * 0.5;
24 }
25}
261#include <rte_eal.h>
2#include <rte_ethdev.h>
3#include <rte_mbuf.h>
4
5class DPDKReceiver {
6 uint16_t port_id_;
7 rte_mempool* mbuf_pool_;
8
9public:
10 DPDKReceiver(uint16_t port_id) : port_id_(port_id) {
11 // Initialize DPDK
12 const char* argv[] = {"app", "-l", "0-3", "-n", "4"};
13 int argc = sizeof(argv) / sizeof(argv[0]);
14 rte_eal_init(argc, const_cast<char**>(argv));
15
16 // Create memory pool for packets
17 mbuf_pool_ = rte_pktmbuf_pool_create(
18 "mbuf_pool",
19 8192, // Number of elements
20 256, // Cache size
21 0, // Private data size
22 RTE_MBUF_DEFAULT_BUF_SIZE,
23 rte_socket_id()
24 );
25
26 // Configure port
27 rte_eth_conf port_conf = {};
28 rte_eth_dev_configure(port_id_, 1, 1, &port_conf);
29
30 // Setup RX queue
31 rte_eth_rx_queue_setup(port_id_, 0, 128,
32 rte_eth_dev_socket_id(port_id_), nullptr, mbuf_pool_);
33
34 // Start device
35 rte_eth_dev_start(port_id_);
36 }
37
38 void receive_packets() {
39 constexpr uint16_t BURST_SIZE = 32;
40 rte_mbuf* packets[BURST_SIZE];
41
42 while (true) {
43 uint16_t nb_rx = rte_eth_rx_burst(port_id_, 0, packets, BURST_SIZE);
44
45 for (uint16_t i = 0; i < nb_rx; ++i) {
46 process_packet(packets[i]);
47 rte_pktmbuf_free(packets[i]);
48 }
49 }
50 }
51
52 void process_packet(rte_mbuf* pkt) {
53 // Zero-copy access to packet data
54 uint8_t* data = rte_pktmbuf_mtod(pkt, uint8_t*);
55 size_t len = rte_pktmbuf_pkt_len(pkt);
56
57 // Parse and process market data directly
58 parse_market_data(data, len);
59 }
60};
611use io_uring::{opcode, types, IoUring, squeue};
2use std::os::unix::io::AsRawFd;
3use std::net::UdpSocket;
4
5struct UringReceiver {
6 ring: IoUring,
7 socket: UdpSocket,
8 buffers: Vec<Vec<u8>>,
9}
10
11impl UringReceiver {
12 fn new(socket: UdpSocket, buffer_count: usize) -> io::Result<Self> {
13 let ring = IoUring::builder()
14 .setup_sqpoll(1000) // Kernel thread polls
15 .build(256)?;
16
17 let buffers = (0..buffer_count)
18 .map(|_| vec![0u8; 9000]) // MTU size
19 .collect();
20
21 Ok(Self { ring, socket, buffers })
22 }
23
24 fn receive_loop(&mut self) -> io::Result<()> {
25 let fd = types::Fd(self.socket.as_raw_fd());
26
27 // Submit initial receive operations
28 for (i, buf) in self.buffers.iter_mut().enumerate() {
29 let recv_e = opcode::Recv::new(fd, buf.as_mut_ptr(), buf.len() as u32)
30 .build()
31 .user_data(i as u64);
32
33 unsafe {
34 self.ring.submission().push(&recv_e)?;
35 }
36 }
37
38 self.ring.submit()?;
39
40 loop {
41 // Wait for completions
42 self.ring.submit_and_wait(1)?;
43
44 for cqe in self.ring.completion() {
45 let buf_idx = cqe.user_data() as usize;
46 let bytes_read = cqe.result() as usize;
47
48 if bytes_read > 0 {
49 // Process packet
50 self.process_packet(&self.buffers[buf_idx][..bytes_read]);
51
52 // Resubmit receive
53 let buf = &mut self.buffers[buf_idx];
54 let recv_e = opcode::Recv::new(fd, buf.as_mut_ptr(), buf.len() as u32)
55 .build()
56 .user_data(buf_idx as u64);
57
58 unsafe {
59 self.ring.submission().push(&recv_e)?;
60 }
61 }
62 }
63
64 self.ring.submit()?;
65 }
66 }
67
68 fn process_packet(&self, data: &[u8]) {
69 // Parse market data
70 }
71}
| Operation | C++ Baseline | C++ Optimized | Rust Baseline | Rust Optimized |
|---|---|---|---|---|
| Order Creation | 85/120/250 | 25/35/50 | 90/130/280 | 28/38/55 |
| Market Data Parse | 150/220/450 | 45/65/95 | 160/230/480 | 48/68/100 |
| Risk Check | 200/350/800 | 60/85/120 | 210/360/850 | 62/88/125 |
| Order Send | 300/500/1200 | 120/180/280 | 310/520/1300 | 125/185/290 |
| System | C++ | Rust |
|---|---|---|
| Market Data Processing | 12.5M | 11.8M |
| Order Generation | 8.2M | 7.9M |
| Full Pipeline | 3.5M | 3.3M |
From running low-latency systems in production:
mlock() prevents page faults
Both C++ and Rust can achieve sub-microsecond latencies when properly optimized:
The choice depends on team skills, existing infrastructure, and whether you value absolute performance or safety+performance balance.
In our production HFT system, we use:
Technical Writer
The NordVarg Engineering Team is a group of software engineers at NordVarg specializing in high-performance financial systems and type-safe programming.
Get weekly insights on building high-performance financial systems, latest industry trends, and expert tips delivered straight to your inbox.