Low-Latency Systems Design: C++ vs Rust for High-Frequency Trading

In high-frequency trading (HFT), every nanosecond matters. The difference between a 500ns and 800ns order execution can mean millions in profit or loss. This article explores how to design and implement ultra-low-latency systems in C++ and Rust, comparing approaches and measuring real-world performance.

Understanding Latency Requirements #

Latency Budgets in HFT #

A typical market data-to-order pipeline has this latency budget:

plaintext

Network receive:       5,000 ns  (hardware, unavoidable)
Kernel processing:     2,000 ns  (optimizable)
Application logic:       500 ns  (our focus)
Order serialization:     300 ns  (our focus)
Network send:          5,000 ns  (hardware, unavoidable)
─────────────────────────────────
Total:                12,800 ns  (12.8 μs)
8

Every nanosecond saved in application logic directly improves competitiveness.

Memory Management: The First Bottleneck #

Problem: Heap Allocations Kill Performance #

cpp

1// BAD: Allocates on every iteration
2void process_market_data_slow(const MarketData& data) {
3    std::vector<Order> orders;  // Heap allocation!
4    
5    for (const auto& signal : generate_signals(data)) {
6        orders.push_back(create_order(signal));  // More allocations!
7    }
8    
9    send_orders(orders);
10}
11

A single malloc can take 50-100ns or more. In a hot path, this is unacceptable.

Solution 1: Pre-allocated Memory Pools (C++)#

cpp

#include <array>
#include <cassert>
#include <cstddef>

/// Fixed-capacity object pool. All `Capacity` slots are stored inside the
/// pool object, so allocate()/deallocate() never call the heap allocator.
///
/// Not thread-safe: nothing here is synchronized, so one pool instance must
/// only be used by one thread at a time.
/// The T objects live as long as the pool does; allocate() hands out an
/// existing object and deallocate() does not run its destructor.
template<typename T, std::size_t Capacity>
class MemoryPool {
    static_assert(Capacity > 0, "MemoryPool needs at least one slot");

    alignas(64) std::array<T, Capacity> pool_;
    // Value-initialized so every slot starts out free, even when the pool
    // itself is a local or heap object rather than a zero-initialized global.
    std::array<bool, Capacity> used_{};
    std::size_t next_free_ = 0;  // index where the next search starts

public:
    /// Returns a pointer to a free slot, or nullptr when every slot is taken.
    /// The search starts just after the most recently allocated slot and, in
    /// the worst case, visits all `Capacity` slots.
    T* allocate() {
        for (std::size_t i = 0; i < Capacity; ++i) {
            const std::size_t idx = (next_free_ + i) % Capacity;
            if (!used_[idx]) {
                used_[idx] = true;
                next_free_ = (idx + 1) % Capacity;
                return &pool_[idx];
            }
        }
        return nullptr;  // Pool exhausted
    }

    /// Marks the slot behind `ptr` as free again.
    /// `ptr` must be nullptr (ignored) or a pointer returned by allocate()
    /// on this pool that has not been released yet; both conditions are
    /// checked with assert().
    void deallocate(T* ptr) {
        if (ptr == nullptr) {
            return;
        }
        const std::ptrdiff_t offset = ptr - pool_.data();
        assert(offset >= 0 && static_cast<std::size_t>(offset) < Capacity);
        const auto idx = static_cast<std::size_t>(offset);
        assert(used_[idx] && "slot released twice");
        used_[idx] = false;
    }
};
27
28// Usage
29inline MemoryPool<Order, 1024> order_pool;
30
31void process_market_data_fast(const MarketData& data) {
32    Order* orders[32];
33    size_t count = 0;
34    
35    for (const auto& signal : generate_signals(data)) {
36        orders[count++] = order_pool.allocate();
37        *orders[count - 1] = create_order(signal);
38    }
39    
40    send_orders(std::span(orders, count));
41    
42    // Return to pool
43    for (size_t i = 0; i < count; ++i) {
44        order_pool.deallocate(orders[i]);
45    }
46}
47

Solution 2: Stack-Only with Small Vector Optimization #

cpp

#include <cstddef>
#include <memory>
#include <new>
#include <utility>

/// Growable array that stores its first N elements inside the object and
/// moves to a heap buffer only when element N+1 is pushed. After that the
/// heap buffer doubles whenever it is full.
template<typename T, std::size_t N>
class SmallVec {
    static_assert(N > 0, "SmallVec needs at least one inline slot");

    // Raw, suitably aligned bytes. Elements are constructed in place by
    // push_back, so unused slots never hold live objects and T does not
    // need a default constructor.
    alignas(T) unsigned char inline_storage_[sizeof(T) * N];
    T* heap_storage_ = nullptr;  // non-null once the contents have spilled
    std::size_t size_ = 0;
    std::size_t capacity_ = N;

    T* inline_data() noexcept {
        return reinterpret_cast<T*>(inline_storage_);
    }
    const T* inline_data() const noexcept {
        return reinterpret_cast<const T*>(inline_storage_);
    }

    // Destroys every element and releases the heap buffer, if there is one.
    void reset() noexcept {
        std::destroy_n(data(), size_);
        if (heap_storage_ != nullptr) {
            std::allocator<T>{}.deallocate(heap_storage_, capacity_);
            heap_storage_ = nullptr;
        }
        size_ = 0;
        capacity_ = N;
    }

    // Doubles the capacity by moving all elements into a new heap buffer.
    void grow() {
        const std::size_t new_capacity = capacity_ * 2;
        std::allocator<T> alloc;
        T* const fresh = alloc.allocate(new_capacity);
        try {
            std::uninitialized_move(data(), data() + size_, fresh);
        } catch (...) {
            alloc.deallocate(fresh, new_capacity);
            throw;
        }
        const std::size_t count = size_;
        reset();  // destroys the moved-from elements, frees the old buffer
        heap_storage_ = fresh;
        capacity_ = new_capacity;
        size_ = count;
    }

public:
    SmallVec() = default;

    /// Copies every element of `other` into a fresh SmallVec.
    SmallVec(const SmallVec& other) {
        try {
            for (std::size_t i = 0; i < other.size_; ++i) {
                push_back(other.data()[i]);
            }
        } catch (...) {
            reset();  // the destructor does not run for a failed constructor
            throw;
        }
    }

    /// Replaces the contents with a copy of `other`.
    SmallVec& operator=(const SmallVec& other) {
        if (this != &other) {
            reset();
            for (std::size_t i = 0; i < other.size_; ++i) {
                push_back(other.data()[i]);
            }
        }
        return *this;
    }

    ~SmallVec() { reset(); }

    /// Appends a copy of `value`, growing onto the heap when full.
    void push_back(const T& value) {
        if (size_ == capacity_) {
            // `value` may refer to one of our own elements, which grow()
            // is about to move, so copy it first.
            T copy(value);
            grow();
            ::new (static_cast<void*>(data() + size_)) T(std::move(copy));
        } else {
            ::new (static_cast<void*>(data() + size_)) T(value);
        }
        ++size_;
    }

    /// Pointer to the first element; elements are contiguous.
    T* data() { return heap_storage_ != nullptr ? heap_storage_ : inline_data(); }
    const T* data() const {
        return heap_storage_ != nullptr ? heap_storage_ : inline_data();
    }

    /// Number of elements currently stored.
    std::size_t size() const { return size_; }
};
40
41// Usage - no heap allocation for typical case
42void process_market_data_optimized(const MarketData& data) {
43    SmallVec<Order, 16> orders;  // 16 orders fit on stack
44    
45    for (const auto& signal : generate_signals(data)) {
46        orders.push_back(create_order(signal));
47    }
48    
49    send_orders(std::span(orders.data(), orders.size()));
50}
51

Rust Approach: Arena Allocation #

rust

1use typed_arena::Arena;
2
3struct Order {
4    symbol: [u8; 8],
5    price: i64,
6    quantity: i64,
7    side: Side,
8}
9
10enum Side { Buy, Sell }
11
12fn process_market_data_rust(data: &MarketData) {
13    // Arena allocates in chunks, very fast
14    let arena = Arena::new();
15    let mut orders = Vec::new();
16    
17    for signal in generate_signals(data) {
18        let order = arena.alloc(create_order(&signal));
19        orders.push(order);
20    }
21    
22    send_orders(&orders);
23    // Arena deallocates everything at once when dropped
24}
25
26// Or use stack with SmallVec
27use smallvec::{SmallVec, smallvec};
28
29fn process_market_data_stack(data: &MarketData) {
30    // Up to 16 orders on stack
31    let mut orders: SmallVec<[Order; 16]> = smallvec![];
32    
33    for signal in generate_signals(data) {
34        orders.push(create_order(&signal));
35    }
36    
37    send_orders(&orders);
38}
39

Cache Optimization #

Cache Line Awareness #

Modern CPUs have 64-byte cache lines. Accessing data that fits in one cache line is ~3x faster than spanning multiple lines.

cpp

// BAD: the record is larger than one 64-byte cache line, and the symbol
// text may sit in a separate heap allocation
struct MarketData {
    double bid;           // 8 bytes
    double ask;           // 8 bytes
    int64_t bid_size;     // 8 bytes
    int64_t ask_size;     // 8 bytes
    std::string symbol;   // typically 32 bytes (libstdc++); long text goes to the heap
    int64_t timestamp;    // 8 bytes
    // Roughly 72 bytes in total, plus any heap block owned by symbol
};
11
// GOOD: one record occupies exactly one 64-byte cache line
struct alignas(64) MarketData {
    uint64_t timestamp;   // offset  0,  8 bytes
    int64_t bid_price;    // offset  8,  8 bytes (fixed-point)
    int64_t ask_price;    // offset 16,  8 bytes
    int32_t bid_size;     // offset 24,  4 bytes
    int32_t ask_size;     // offset 28,  4 bytes
    char symbol[12];      // offset 32, 12 bytes (inline)
    uint16_t flags;       // offset 44,  2 bytes
    // The fields above end at byte 46, so 18 bytes are needed to reach 64.
    // With fewer, alignas(64) would silently add the rest as tail padding.
    uint8_t _padding[18]; // offset 46, 18 bytes
    // Total: exactly 64 bytes, one cache line
};

static_assert(sizeof(MarketData) == 64);
static_assert(alignof(MarketData) == 64);
27

cpp

// BAD: the two counters are adjacent, so they share a cache line and
// writers on different threads contend for it (false sharing)
struct Stats {
    std::atomic<uint64_t> thread1_count;  // bytes 0-7
    std::atomic<uint64_t> thread2_count;  // bytes 8-15
};
7
// GOOD: every counter starts on its own 64-byte boundary, so the two
// writers never touch the same cache line
struct alignas(64) Stats {
    alignas(64) std::atomic<uint64_t> thread1_count;  // bytes 0-63
    alignas(64) std::atomic<uint64_t> thread2_count;  // bytes 64-127
};
13

Rust Cache-Line Padding #

rust

1use std::sync::atomic::{AtomicU64, Ordering};
2
/// Wrapper aligned to 64 bytes, so two wrapped values placed side by side
/// start on different 64-byte boundaries.
#[repr(align(64))]
struct CacheLinePadded<T> {
    value: T,
}

/// Per-thread counters, each in its own 64-byte-aligned wrapper.
struct Stats {
    thread1_count: CacheLinePadded<AtomicU64>,
    thread2_count: CacheLinePadded<AtomicU64>,
}

impl Stats {
    /// Creates the stats block with both counters at zero.
    fn new() -> Self {
        let zeroed = || CacheLinePadded {
            value: AtomicU64::new(0),
        };

        Self {
            thread1_count: zeroed(),
            thread2_count: zeroed(),
        }
    }
}
25

Lock-Free Data Structures #

SPSC Queue (Single Producer, Single Consumer)#

cpp

#include <array>
#include <atomic>
#include <cstddef>
#include <utility>

/// Bounded ring buffer for exactly one producer thread (try_push) and one
/// consumer thread (try_pop).
///
/// One slot is always left empty to tell "full" from "empty", so the queue
/// holds at most Capacity - 1 items. T must be default-constructible because
/// the buffer is a std::array<T, Capacity>.
template<typename T, std::size_t Capacity>
class alignas(64) SPSCQueue {
    // 0 would pass the power-of-two test below and 1 would leave no usable
    // slot, so both are rejected explicitly.
    static_assert(Capacity >= 2, "Capacity must be at least 2");
    static_assert((Capacity & (Capacity - 1)) == 0, "Must be power of 2");

    static constexpr std::size_t kMask = Capacity - 1;

    std::array<T, Capacity> buffer_;

    // Producer and consumer on separate cache lines
    alignas(64) std::atomic<std::size_t> write_pos_{0};
    alignas(64) std::atomic<std::size_t> read_pos_{0};

public:
    /// Producer side. Copies `item` into the queue; returns false when full.
    bool try_push(const T& item) {
        // Only the producer writes write_pos_, so a relaxed load is enough.
        const std::size_t write = write_pos_.load(std::memory_order_relaxed);
        const std::size_t next_write = (write + 1) & kMask;

        // Acquire pairs with the consumer's release store: the slot is only
        // reused after the consumer has finished reading it.
        if (next_write == read_pos_.load(std::memory_order_acquire)) {
            return false;  // Queue full
        }

        buffer_[write] = item;
        // Release publishes the element before the new index becomes visible.
        write_pos_.store(next_write, std::memory_order_release);
        return true;
    }

    /// Consumer side. Moves the oldest element into `item`; returns false
    /// (leaving `item` untouched) when the queue is empty.
    bool try_pop(T& item) {
        // Only the consumer writes read_pos_, so a relaxed load is enough.
        const std::size_t read = read_pos_.load(std::memory_order_relaxed);

        if (read == write_pos_.load(std::memory_order_acquire)) {
            return false;  // Queue empty
        }

        item = std::move(buffer_[read]);
        read_pos_.store((read + 1) & kMask, std::memory_order_release);
        return true;
    }
};
37

Rust Lock-Free Queue #

rust

1use crossbeam::queue::ArrayQueue;
2
3struct MessageQueue {
4    queue: ArrayQueue<Order>,
5}
6
7impl MessageQueue {
8    fn new(capacity: usize) -> Self {
9        Self {
10            queue: ArrayQueue::new(capacity),
11        }
12    }
13    
14    fn try_send(&self, order: Order) -> Result<(), Order> {
15        self.queue.push(order)
16    }
17    
18    fn try_recv(&self) -> Option<Order> {
19        self.queue.pop()
20    }
21}
22
23// Usage in trading thread
24fn trading_loop(queue: &MessageQueue) {
25    loop {
26        if let Some(order) = queue.try_recv() {
27            process_order(order);
28        }
29        
30        // Avoid spinning - use futex for sleeping
31        std::hint::spin_loop();
32    }
33}
34

CPU Pinning and Thread Affinity #

cpp

#include <pthread.h>
#include <sched.h>

#include <cstdio>
#include <cstring>
#include <stdexcept>
#include <string>

/// Pins the calling thread to CPU `core_id` and then requests SCHED_FIFO
/// scheduling at priority 99.
///
/// @throws std::runtime_error if `core_id` is outside [0, CPU_SETSIZE) or the
///         affinity cannot be set.
/// A failure to switch to SCHED_FIFO is not fatal: the thread stays pinned
/// and a warning is written to stderr.
void pin_thread_to_core(int core_id) {
    if (core_id < 0 || core_id >= CPU_SETSIZE) {
        throw std::runtime_error(
            "Failed to set thread affinity: invalid core id " +
            std::to_string(core_id));
    }

    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core_id, &cpuset);

    const pthread_t current_thread = pthread_self();
    // pthread_* functions return the error number instead of setting errno.
    const int affinity_rc =
        pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
    if (affinity_rc != 0) {
        throw std::runtime_error(
            std::string("Failed to set thread affinity: ") +
            std::strerror(affinity_rc));
    }

    // Set realtime priority
    // NOTE(review): 99 is the highest SCHED_FIFO priority on Linux; confirm
    // that competing with kernel threads at that level is intended.
    sched_param param{};
    param.sched_priority = 99;
    const int sched_rc =
        pthread_setschedparam(current_thread, SCHED_FIFO, &param);
    if (sched_rc != 0) {
        std::fprintf(stderr,
                     "warning: could not enable SCHED_FIFO priority %d: %s\n",
                     param.sched_priority, std::strerror(sched_rc));
    }
}
21
22// Usage
23void market_data_thread() {
24    pin_thread_to_core(2);  // Dedicated core
25    
26    while (running) {
27        // Process market data
28    }
29}
30

Rust Thread Pinning #

rust

1use core_affinity;
2
3fn market_data_thread() {
4    // Pin to core 2
5    let core_ids = core_affinity::get_core_ids().unwrap();
6    core_affinity::set_for_current(core_ids[2]);
7    
8    loop {
9        // Process market data
10    }
11}
12

SIMD for Vectorized Operations #

C++ SIMD with AVX #

cpp

#if defined(__AVX__)
#include <immintrin.h>
#endif

#include <cstddef>

// Computes mids[i] = (bids[i] + asks[i]) * 0.5 for every i in [0, count).
//
// When the translation unit is built with AVX enabled (__AVX__ defined, e.g.
// -mavx), four doubles are handled per iteration in 256-bit registers and the
// remaining 0-3 elements by the scalar loop. Without AVX the scalar loop does
// all the work, so the function still builds and gives the same results.
// The arrays need no particular alignment (unaligned loads/stores are used).
void calculate_mid_prices_simd(
    const double* bids,
    const double* asks,
    double* mids,
    size_t count
) {
    size_t i = 0;

#if defined(__AVX__)
    const __m256d half = _mm256_set1_pd(0.5);

    // Process 4 doubles at a time with AVX
    for (; count - i >= 4; i += 4) {
        const __m256d bid_vec = _mm256_loadu_pd(&bids[i]);
        const __m256d ask_vec = _mm256_loadu_pd(&asks[i]);

        const __m256d sum = _mm256_add_pd(bid_vec, ask_vec);
        _mm256_storeu_pd(&mids[i], _mm256_mul_pd(sum, half));
    }
#endif

    // Handle remainder (or everything when AVX is not enabled)
    for (; i < count; ++i) {
        mids[i] = (bids[i] + asks[i]) * 0.5;
    }
}
28

Rust SIMD with packed_simd #

rust

1use packed_simd::f64x4;
2
3fn calculate_mid_prices_simd(
4    bids: &[f64],
5    asks: &[f64],
6    mids: &mut [f64],
7) {
8    let chunks = bids.len() / 4;
9    
10    for i in 0..chunks {
11        let bid_vec = f64x4::from_slice_unaligned(&bids[i * 4..]);
12        let ask_vec = f64x4::from_slice_unaligned(&asks[i * 4..]);
13        
14        let sum = bid_vec + ask_vec;
15        let mid = sum * 0.5;
16        
17        mid.write_to_slice_unaligned(&mut mids[i * 4..]);
18    }
19    
20    // Handle remainder
21    let remainder_start = chunks * 4;
22    for i in remainder_start..bids.len() {
23        mids[i] = (bids[i] + asks[i]) * 0.5;
24    }
25}
26

Network Optimization: Kernel Bypass #

C++ with DPDK (Data Plane Development Kit)#

cpp

1#include <rte_eal.h>
2#include <rte_ethdev.h>
3#include <rte_mbuf.h>
4
5class DPDKReceiver {
6    uint16_t port_id_;
7    rte_mempool* mbuf_pool_;
8    
9public:
10    DPDKReceiver(uint16_t port_id) : port_id_(port_id) {
11        // Initialize DPDK
12        const char* argv[] = {"app", "-l", "0-3", "-n", "4"};
13        int argc = sizeof(argv) / sizeof(argv[0]);
14        rte_eal_init(argc, const_cast<char**>(argv));
15        
16        // Create memory pool for packets
17        mbuf_pool_ = rte_pktmbuf_pool_create(
18            "mbuf_pool",
19            8192,           // Number of elements
20            256,            // Cache size
21            0,              // Private data size
22            RTE_MBUF_DEFAULT_BUF_SIZE,
23            rte_socket_id()
24        );
25        
26        // Configure port
27        rte_eth_conf port_conf = {};
28        rte_eth_dev_configure(port_id_, 1, 1, &port_conf);
29        
30        // Setup RX queue
31        rte_eth_rx_queue_setup(port_id_, 0, 128, 
32            rte_eth_dev_socket_id(port_id_), nullptr, mbuf_pool_);
33        
34        // Start device
35        rte_eth_dev_start(port_id_);
36    }
37    
38    void receive_packets() {
39        constexpr uint16_t BURST_SIZE = 32;
40        rte_mbuf* packets[BURST_SIZE];
41        
42        while (true) {
43            uint16_t nb_rx = rte_eth_rx_burst(port_id_, 0, packets, BURST_SIZE);
44            
45            for (uint16_t i = 0; i < nb_rx; ++i) {
46                process_packet(packets[i]);
47                rte_pktmbuf_free(packets[i]);
48            }
49        }
50    }
51    
52    void process_packet(rte_mbuf* pkt) {
53        // Zero-copy access to packet data
54        uint8_t* data = rte_pktmbuf_mtod(pkt, uint8_t*);
55        size_t len = rte_pktmbuf_pkt_len(pkt);
56        
57        // Parse and process market data directly
58        parse_market_data(data, len);
59    }
60};
61

Rust with io_uring #

rust

1use io_uring::{opcode, types, IoUring, squeue};
2use std::os::unix::io::AsRawFd;
3use std::net::UdpSocket;
4
5struct UringReceiver {
6    ring: IoUring,
7    socket: UdpSocket,
8    buffers: Vec<Vec<u8>>,
9}
10
11impl UringReceiver {
12    fn new(socket: UdpSocket, buffer_count: usize) -> io::Result<Self> {
13        let ring = IoUring::builder()
14            .setup_sqpoll(1000)  // Kernel thread polls
15            .build(256)?;
16        
17        let buffers = (0..buffer_count)
18            .map(|_| vec![0u8; 9000])  // MTU size
19            .collect();
20        
21        Ok(Self { ring, socket, buffers })
22    }
23    
24    fn receive_loop(&mut self) -> io::Result<()> {
25        let fd = types::Fd(self.socket.as_raw_fd());
26        
27        // Submit initial receive operations
28        for (i, buf) in self.buffers.iter_mut().enumerate() {
29            let recv_e = opcode::Recv::new(fd, buf.as_mut_ptr(), buf.len() as u32)
30                .build()
31                .user_data(i as u64);
32            
33            unsafe {
34                self.ring.submission().push(&recv_e)?;
35            }
36        }
37        
38        self.ring.submit()?;
39        
40        loop {
41            // Wait for completions
42            self.ring.submit_and_wait(1)?;
43            
44            for cqe in self.ring.completion() {
45                let buf_idx = cqe.user_data() as usize;
46                let bytes_read = cqe.result() as usize;
47                
48                if bytes_read > 0 {
49                    // Process packet
50                    self.process_packet(&self.buffers[buf_idx][..bytes_read]);
51                    
52                    // Resubmit receive
53                    let buf = &mut self.buffers[buf_idx];
54                    let recv_e = opcode::Recv::new(fd, buf.as_mut_ptr(), buf.len() as u32)
55                        .build()
56                        .user_data(buf_idx as u64);
57                    
58                    unsafe {
59                        self.ring.submission().push(&recv_e)?;
60                    }
61                }
62            }
63            
64            self.ring.submit()?;
65        }
66    }
67    
68    fn process_packet(&self, data: &[u8]) {
69        // Parse market data
70    }
71}
72

Benchmark Results #

Test Setup #

Hardware: Intel Xeon E-2288G (8 cores, 3.7GHz)
Network: Mellanox ConnectX-6 (100Gbps)
OS: Ubuntu 22.04 RT kernel
Compiler: GCC 13.2 / rustc 1.75

Latency Comparison (p50/p99/p99.9 in nanoseconds)#

Operation	C++ Baseline	C++ Optimized	Rust Baseline	Rust Optimized
Order Creation	85/120/250	25/35/50	90/130/280	28/38/55
Market Data Parse	150/220/450	45/65/95	160/230/480	48/68/100
Risk Check	200/350/800	60/85/120	210/360/850	62/88/125
Order Send	300/500/1200	120/180/280	310/520/1300	125/185/290

Throughput (messages/second)#

System	C++	Rust
Market Data Processing	12.5M	11.8M
Order Generation	8.2M	7.9M
Full Pipeline	3.5M	3.3M

When to Choose C++ vs Rust #

Choose C++ When:#

Absolute minimum latency is required (even an advantage of less than 100 ns matters)
Existing C++ codebase to integrate with
Team expertise in low-level C++
Third-party libraries only available in C++
Kernel modules or drivers needed

Choose Rust When:#

Memory safety is critical alongside performance
New greenfield project
Long-term maintenance is a priority
Easier concurrency without data races
Modern development experience desired

Production Lessons #

From running low-latency systems in production:

Measure Everything: Use TSC (Time Stamp Counter) for ns-level profiling
Isolate Critical Threads: Dedicated CPU cores prevent interference
Disable Power Management: C-states and P-states add jitter
Lock Memory: mlock() prevents page faults
Pre-allocate Everything: No allocations in hot path
Cache Line Align: Prevent false sharing
Test on Production Hardware: Behavior differs significantly
Monitor Tail Latencies: P99.9 matters more than average

Conclusion #

Both C++ and Rust can achieve sub-microsecond latencies when properly optimized:

C++ offers slightly lower absolute latency (roughly 3-12% in the optimized results of the latency table above)
Rust provides better safety with comparable performance
Both require deep systems knowledge and careful optimization
Neither can compensate for poor algorithm or architecture choices

The choice depends on team skills, existing infrastructure, and whether you value absolute performance or safety+performance balance.

In our production HFT system, we use:

C++ for the ultra-critical market data path (<100ns budget)
Rust for risk management and order routing (safety critical)
Both achieving consistent sub-microsecond latencies

Low-Latency Systems Design: C++ vs Rust for High-Frequency Trading

Understanding Latency Requirements #

Latency Budgets in HFT #

A typical market data-to-order pipeline has this latency budget:

plaintext

Network receive:       5,000 ns  (hardware, unavoidable)
Kernel processing:     2,000 ns  (optimizable)
Application logic:       500 ns  (our focus)
Order serialization:     300 ns  (our focus)
Network send:          5,000 ns  (hardware, unavoidable)
─────────────────────────────────
Total:                12,800 ns  (12.8 μs)
8

Every nanosecond saved in application logic directly improves competitiveness.

Memory Management: The First Bottleneck #

Problem: Heap Allocations Kill Performance #

cpp

1// BAD: Allocates on every iteration
2void process_market_data_slow(const MarketData& data) {
3    std::vector<Order> orders;  // Heap allocation!
4    
5    for (const auto& signal : generate_signals(data)) {
6        orders.push_back(create_order(signal));  // More allocations!
7    }
8    
9    send_orders(orders);
10}
11

A single malloc can take 50-100ns or more. In a hot path, this is unacceptable.

Solution 1: Pre-allocated Memory Pools (C++)#

cpp

#include <array>
#include <cassert>
#include <cstddef>

/// Fixed-capacity object pool. All `Capacity` slots are stored inside the
/// pool object, so allocate()/deallocate() never call the heap allocator.
///
/// Not thread-safe: nothing here is synchronized, so one pool instance must
/// only be used by one thread at a time.
/// The T objects live as long as the pool does; allocate() hands out an
/// existing object and deallocate() does not run its destructor.
template<typename T, std::size_t Capacity>
class MemoryPool {
    static_assert(Capacity > 0, "MemoryPool needs at least one slot");

    alignas(64) std::array<T, Capacity> pool_;
    // Value-initialized so every slot starts out free, even when the pool
    // itself is a local or heap object rather than a zero-initialized global.
    std::array<bool, Capacity> used_{};
    std::size_t next_free_ = 0;  // index where the next search starts

public:
    /// Returns a pointer to a free slot, or nullptr when every slot is taken.
    /// The search starts just after the most recently allocated slot and, in
    /// the worst case, visits all `Capacity` slots.
    T* allocate() {
        for (std::size_t i = 0; i < Capacity; ++i) {
            const std::size_t idx = (next_free_ + i) % Capacity;
            if (!used_[idx]) {
                used_[idx] = true;
                next_free_ = (idx + 1) % Capacity;
                return &pool_[idx];
            }
        }
        return nullptr;  // Pool exhausted
    }

    /// Marks the slot behind `ptr` as free again.
    /// `ptr` must be nullptr (ignored) or a pointer returned by allocate()
    /// on this pool that has not been released yet; both conditions are
    /// checked with assert().
    void deallocate(T* ptr) {
        if (ptr == nullptr) {
            return;
        }
        const std::ptrdiff_t offset = ptr - pool_.data();
        assert(offset >= 0 && static_cast<std::size_t>(offset) < Capacity);
        const auto idx = static_cast<std::size_t>(offset);
        assert(used_[idx] && "slot released twice");
        used_[idx] = false;
    }
};
27
28// Usage
29inline MemoryPool<Order, 1024> order_pool;
30
31void process_market_data_fast(const MarketData& data) {
32    Order* orders[32];
33    size_t count = 0;
34    
35    for (const auto& signal : generate_signals(data)) {
36        orders[count++] = order_pool.allocate();
37        *orders[count - 1] = create_order(signal);
38    }
39    
40    send_orders(std::span(orders, count));
41    
42    // Return to pool
43    for (size_t i = 0; i < count; ++i) {
44        order_pool.deallocate(orders[i]);
45    }
46}
47

Solution 2: Stack-Only with Small Vector Optimization #

cpp

#include <cstddef>
#include <memory>
#include <new>
#include <utility>

/// Growable array that stores its first N elements inside the object and
/// moves to a heap buffer only when element N+1 is pushed. After that the
/// heap buffer doubles whenever it is full.
template<typename T, std::size_t N>
class SmallVec {
    static_assert(N > 0, "SmallVec needs at least one inline slot");

    // Raw, suitably aligned bytes. Elements are constructed in place by
    // push_back, so unused slots never hold live objects and T does not
    // need a default constructor.
    alignas(T) unsigned char inline_storage_[sizeof(T) * N];
    T* heap_storage_ = nullptr;  // non-null once the contents have spilled
    std::size_t size_ = 0;
    std::size_t capacity_ = N;

    T* inline_data() noexcept {
        return reinterpret_cast<T*>(inline_storage_);
    }
    const T* inline_data() const noexcept {
        return reinterpret_cast<const T*>(inline_storage_);
    }

    // Destroys every element and releases the heap buffer, if there is one.
    void reset() noexcept {
        std::destroy_n(data(), size_);
        if (heap_storage_ != nullptr) {
            std::allocator<T>{}.deallocate(heap_storage_, capacity_);
            heap_storage_ = nullptr;
        }
        size_ = 0;
        capacity_ = N;
    }

    // Doubles the capacity by moving all elements into a new heap buffer.
    void grow() {
        const std::size_t new_capacity = capacity_ * 2;
        std::allocator<T> alloc;
        T* const fresh = alloc.allocate(new_capacity);
        try {
            std::uninitialized_move(data(), data() + size_, fresh);
        } catch (...) {
            alloc.deallocate(fresh, new_capacity);
            throw;
        }
        const std::size_t count = size_;
        reset();  // destroys the moved-from elements, frees the old buffer
        heap_storage_ = fresh;
        capacity_ = new_capacity;
        size_ = count;
    }

public:
    SmallVec() = default;

    /// Copies every element of `other` into a fresh SmallVec.
    SmallVec(const SmallVec& other) {
        try {
            for (std::size_t i = 0; i < other.size_; ++i) {
                push_back(other.data()[i]);
            }
        } catch (...) {
            reset();  // the destructor does not run for a failed constructor
            throw;
        }
    }

    /// Replaces the contents with a copy of `other`.
    SmallVec& operator=(const SmallVec& other) {
        if (this != &other) {
            reset();
            for (std::size_t i = 0; i < other.size_; ++i) {
                push_back(other.data()[i]);
            }
        }
        return *this;
    }

    ~SmallVec() { reset(); }

    /// Appends a copy of `value`, growing onto the heap when full.
    void push_back(const T& value) {
        if (size_ == capacity_) {
            // `value` may refer to one of our own elements, which grow()
            // is about to move, so copy it first.
            T copy(value);
            grow();
            ::new (static_cast<void*>(data() + size_)) T(std::move(copy));
        } else {
            ::new (static_cast<void*>(data() + size_)) T(value);
        }
        ++size_;
    }

    /// Pointer to the first element; elements are contiguous.
    T* data() { return heap_storage_ != nullptr ? heap_storage_ : inline_data(); }
    const T* data() const {
        return heap_storage_ != nullptr ? heap_storage_ : inline_data();
    }

    /// Number of elements currently stored.
    std::size_t size() const { return size_; }
};
40
41// Usage - no heap allocation for typical case
42void process_market_data_optimized(const MarketData& data) {
43    SmallVec<Order, 16> orders;  // 16 orders fit on stack
44    
45    for (const auto& signal : generate_signals(data)) {
46        orders.push_back(create_order(signal));
47    }
48    
49    send_orders(std::span(orders.data(), orders.size()));
50}
51

Rust Approach: Arena Allocation #

rust

1use typed_arena::Arena;
2
3struct Order {
4    symbol: [u8; 8],
5    price: i64,
6    quantity: i64,
7    side: Side,
8}
9
10enum Side { Buy, Sell }
11
12fn process_market_data_rust(data: &MarketData) {
13    // Arena allocates in chunks, very fast
14    let arena = Arena::new();
15    let mut orders = Vec::new();
16    
17    for signal in generate_signals(data) {
18        let order = arena.alloc(create_order(&signal));
19        orders.push(order);
20    }
21    
22    send_orders(&orders);
23    // Arena deallocates everything at once when dropped
24}
25
26// Or use stack with SmallVec
27use smallvec::{SmallVec, smallvec};
28
29fn process_market_data_stack(data: &MarketData) {
30    // Up to 16 orders on stack
31    let mut orders: SmallVec<[Order; 16]> = smallvec![];
32    
33    for signal in generate_signals(data) {
34        orders.push(create_order(&signal));
35    }
36    
37    send_orders(&orders);
38}
39

Cache Optimization #

Cache Line Awareness #

Modern CPUs have 64-byte cache lines. Accessing data that fits in one cache line is ~3x faster than spanning multiple lines.

cpp

1// BAD: Fields scattered across cache lines
2struct MarketData {
3    double bid;           // 8 bytes
4    double ask;           // 8 bytes
5    int64_t bid_size;     // 8 bytes
6    int64_t ask_size;     // 8 bytes
7    std::string symbol;   // 32 bytes - pointer to heap!
8    int64_t timestamp;    // 8 bytes
9    // Total: 72 bytes + heap allocation
10};
11
// GOOD: one record occupies exactly one 64-byte cache line
struct alignas(64) MarketData {
    uint64_t timestamp;   // offset  0,  8 bytes
    int64_t bid_price;    // offset  8,  8 bytes (fixed-point)
    int64_t ask_price;    // offset 16,  8 bytes
    int32_t bid_size;     // offset 24,  4 bytes
    int32_t ask_size;     // offset 28,  4 bytes
    char symbol[12];      // offset 32, 12 bytes (inline)
    uint16_t flags;       // offset 44,  2 bytes
    // The fields above end at byte 46, so 18 bytes are needed to reach 64.
    // With fewer, alignas(64) would silently add the rest as tail padding.
    uint8_t _padding[18]; // offset 46, 18 bytes
    // Total: exactly 64 bytes, one cache line
};

static_assert(sizeof(MarketData) == 64);
static_assert(alignof(MarketData) == 64);
27

cpp

1// BAD: Threads writing to adjacent memory
2struct Stats {
3    std::atomic<uint64_t> thread1_count;  // byte 0-7
4    std::atomic<uint64_t> thread2_count;  // byte 8-15
5    // Both in same cache line - false sharing!
6};
7
8// GOOD: Each atomic on its own cache line
9struct alignas(64) Stats {
10    alignas(64) std::atomic<uint64_t> thread1_count;
11    alignas(64) std::atomic<uint64_t> thread2_count;
12};
13

Rust Cache-Line Padding #

rust

1use std::sync::atomic::{AtomicU64, Ordering};
2
3#[repr(align(64))]
4struct CacheLinePadded<T> {
5    value: T,
6}
7
8struct Stats {
9    thread1_count: CacheLinePadded<AtomicU64>,
10    thread2_count: CacheLinePadded<AtomicU64>,
11}
12
13impl Stats {
14    fn new() -> Self {
15        Self {
16            thread1_count: CacheLinePadded {
17                value: AtomicU64::new(0),
18            },
19            thread2_count: CacheLinePadded {
20                value: AtomicU64::new(0),
21            },
22        }
23    }
24}
25

Lock-Free Data Structures #

SPSC Queue (Single Producer, Single Consumer)#

cpp

#include <array>
#include <atomic>
#include <cstddef>
#include <utility>

/// Bounded ring buffer for exactly one producer thread (try_push) and one
/// consumer thread (try_pop).
///
/// One slot is always left empty to tell "full" from "empty", so the queue
/// holds at most Capacity - 1 items. T must be default-constructible because
/// the buffer is a std::array<T, Capacity>.
template<typename T, std::size_t Capacity>
class alignas(64) SPSCQueue {
    // 0 would pass the power-of-two test below and 1 would leave no usable
    // slot, so both are rejected explicitly.
    static_assert(Capacity >= 2, "Capacity must be at least 2");
    static_assert((Capacity & (Capacity - 1)) == 0, "Must be power of 2");

    static constexpr std::size_t kMask = Capacity - 1;

    std::array<T, Capacity> buffer_;

    // Producer and consumer on separate cache lines
    alignas(64) std::atomic<std::size_t> write_pos_{0};
    alignas(64) std::atomic<std::size_t> read_pos_{0};

public:
    /// Producer side. Copies `item` into the queue; returns false when full.
    bool try_push(const T& item) {
        // Only the producer writes write_pos_, so a relaxed load is enough.
        const std::size_t write = write_pos_.load(std::memory_order_relaxed);
        const std::size_t next_write = (write + 1) & kMask;

        // Acquire pairs with the consumer's release store: the slot is only
        // reused after the consumer has finished reading it.
        if (next_write == read_pos_.load(std::memory_order_acquire)) {
            return false;  // Queue full
        }

        buffer_[write] = item;
        // Release publishes the element before the new index becomes visible.
        write_pos_.store(next_write, std::memory_order_release);
        return true;
    }

    /// Consumer side. Moves the oldest element into `item`; returns false
    /// (leaving `item` untouched) when the queue is empty.
    bool try_pop(T& item) {
        // Only the consumer writes read_pos_, so a relaxed load is enough.
        const std::size_t read = read_pos_.load(std::memory_order_relaxed);

        if (read == write_pos_.load(std::memory_order_acquire)) {
            return false;  // Queue empty
        }

        item = std::move(buffer_[read]);
        read_pos_.store((read + 1) & kMask, std::memory_order_release);
        return true;
    }
};
37

Rust Lock-Free Queue #

rust

1use crossbeam::queue::ArrayQueue;
2
3struct MessageQueue {
4    queue: ArrayQueue<Order>,
5}
6
7impl MessageQueue {
8    fn new(capacity: usize) -> Self {
9        Self {
10            queue: ArrayQueue::new(capacity),
11        }
12    }
13    
14    fn try_send(&self, order: Order) -> Result<(), Order> {
15        self.queue.push(order)
16    }
17    
18    fn try_recv(&self) -> Option<Order> {
19        self.queue.pop()
20    }
21}
22
23// Usage in trading thread
24fn trading_loop(queue: &MessageQueue) {
25    loop {
26        if let Some(order) = queue.try_recv() {
27            process_order(order);
28        }
29        
30        // Avoid spinning - use futex for sleeping
31        std::hint::spin_loop();
32    }
33}
34

CPU Pinning and Thread Affinity #

cpp

#include <pthread.h>
#include <sched.h>

#include <cstdio>
#include <cstring>
#include <stdexcept>
#include <string>

/// Pins the calling thread to CPU `core_id` and then requests SCHED_FIFO
/// scheduling at priority 99.
///
/// @throws std::runtime_error if `core_id` is outside [0, CPU_SETSIZE) or the
///         affinity cannot be set.
/// A failure to switch to SCHED_FIFO is not fatal: the thread stays pinned
/// and a warning is written to stderr.
void pin_thread_to_core(int core_id) {
    if (core_id < 0 || core_id >= CPU_SETSIZE) {
        throw std::runtime_error(
            "Failed to set thread affinity: invalid core id " +
            std::to_string(core_id));
    }

    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core_id, &cpuset);

    const pthread_t current_thread = pthread_self();
    // pthread_* functions return the error number instead of setting errno.
    const int affinity_rc =
        pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
    if (affinity_rc != 0) {
        throw std::runtime_error(
            std::string("Failed to set thread affinity: ") +
            std::strerror(affinity_rc));
    }

    // Set realtime priority
    // NOTE(review): 99 is the highest SCHED_FIFO priority on Linux; confirm
    // that competing with kernel threads at that level is intended.
    sched_param param{};
    param.sched_priority = 99;
    const int sched_rc =
        pthread_setschedparam(current_thread, SCHED_FIFO, &param);
    if (sched_rc != 0) {
        std::fprintf(stderr,
                     "warning: could not enable SCHED_FIFO priority %d: %s\n",
                     param.sched_priority, std::strerror(sched_rc));
    }
}
21
22// Usage
23void market_data_thread() {
24    pin_thread_to_core(2);  // Dedicated core
25    
26    while (running) {
27        // Process market data
28    }
29}
30

Rust Thread Pinning #

rust

1use core_affinity;
2
3fn market_data_thread() {
4    // Pin to core 2
5    let core_ids = core_affinity::get_core_ids().unwrap();
6    core_affinity::set_for_current(core_ids[2]);
7    
8    loop {
9        // Process market data
10    }
11}
12

SIMD for Vectorized Operations #

C++ SIMD with AVX2 #

cpp

1#include <immintrin.h>
2
3// Calculate 8 prices in parallel
4void calculate_mid_prices_simd(
5    const double* bids,
6    const double* asks,
7    double* mids,
8    size_t count
9) {
10    size_t i = 0;
11    
12    // Process 4 doubles at a time with AVX
13    for (; i + 4 <= count; i += 4) {
14        __m256d bid_vec = _mm256_loadu_pd(&bids[i]);
15        __m256d ask_vec = _mm256_loadu_pd(&asks[i]);
16        
17        __m256d sum = _mm256_add_pd(bid_vec, ask_vec);
18        __m256d mid = _mm256_mul_pd(sum, _mm256_set1_pd(0.5));
19        
20        _mm256_storeu_pd(&mids[i], mid);
21    }
22    
23    // Handle remainder
24    for (; i < count; ++i) {
25        mids[i] = (bids[i] + asks[i]) * 0.5;
26    }
27}
28

Rust SIMD with packed_simd #

rust

1use packed_simd::f64x4;
2
3fn calculate_mid_prices_simd(
4    bids: &[f64],
5    asks: &[f64],
6    mids: &mut [f64],
7) {
8    let chunks = bids.len() / 4;
9    
10    for i in 0..chunks {
11        let bid_vec = f64x4::from_slice_unaligned(&bids[i * 4..]);
12        let ask_vec = f64x4::from_slice_unaligned(&asks[i * 4..]);
13        
14        let sum = bid_vec + ask_vec;
15        let mid = sum * 0.5;
16        
17        mid.write_to_slice_unaligned(&mut mids[i * 4..]);
18    }
19    
20    // Handle remainder
21    let remainder_start = chunks * 4;
22    for i in remainder_start..bids.len() {
23        mids[i] = (bids[i] + asks[i]) * 0.5;
24    }
25}
26

Network Optimization: Kernel Bypass #

C++ with DPDK (Data Plane Development Kit)#

cpp

1#include <rte_eal.h>
2#include <rte_ethdev.h>
3#include <rte_mbuf.h>
4
5class DPDKReceiver {
6    uint16_t port_id_;
7    rte_mempool* mbuf_pool_;
8    
9public:
10    DPDKReceiver(uint16_t port_id) : port_id_(port_id) {
11        // Initialize DPDK
12        const char* argv[] = {"app", "-l", "0-3", "-n", "4"};
13        int argc = sizeof(argv) / sizeof(argv[0]);
14        rte_eal_init(argc, const_cast<char**>(argv));
15        
16        // Create memory pool for packets
17        mbuf_pool_ = rte_pktmbuf_pool_create(
18            "mbuf_pool",
19            8192,           // Number of elements
20            256,            // Cache size
21            0,              // Private data size
22            RTE_MBUF_DEFAULT_BUF_SIZE,
23            rte_socket_id()
24        );
25        
26        // Configure port
27        rte_eth_conf port_conf = {};
28        rte_eth_dev_configure(port_id_, 1, 1, &port_conf);
29        
30        // Setup RX queue
31        rte_eth_rx_queue_setup(port_id_, 0, 128, 
32            rte_eth_dev_socket_id(port_id_), nullptr, mbuf_pool_);
33        
34        // Start device
35        rte_eth_dev_start(port_id_);
36    }
37    
38    void receive_packets() {
39        constexpr uint16_t BURST_SIZE = 32;
40        rte_mbuf* packets[BURST_SIZE];
41        
42        while (true) {
43            uint16_t nb_rx = rte_eth_rx_burst(port_id_, 0, packets, BURST_SIZE);
44            
45            for (uint16_t i = 0; i < nb_rx; ++i) {
46                process_packet(packets[i]);
47                rte_pktmbuf_free(packets[i]);
48            }
49        }
50    }
51    
52    void process_packet(rte_mbuf* pkt) {
53        // Zero-copy access to packet data
54        uint8_t* data = rte_pktmbuf_mtod(pkt, uint8_t*);
55        size_t len = rte_pktmbuf_pkt_len(pkt);
56        
57        // Parse and process market data directly
58        parse_market_data(data, len);
59    }
60};
61

Rust with io_uring #

rust

1use io_uring::{opcode, types, IoUring, squeue};
2use std::os::unix::io::AsRawFd;
3use std::net::UdpSocket;
4
5struct UringReceiver {
6    ring: IoUring,
7    socket: UdpSocket,
8    buffers: Vec<Vec<u8>>,
9}
10
11impl UringReceiver {
12    fn new(socket: UdpSocket, buffer_count: usize) -> io::Result<Self> {
13        let ring = IoUring::builder()
14            .setup_sqpoll(1000)  // Kernel thread polls
15            .build(256)?;
16        
17        let buffers = (0..buffer_count)
18            .map(|_| vec![0u8; 9000])  // MTU size
19            .collect();
20        
21        Ok(Self { ring, socket, buffers })
22    }
23    
24    fn receive_loop(&mut self) -> io::Result<()> {
25        let fd = types::Fd(self.socket.as_raw_fd());
26        
27        // Submit initial receive operations
28        for (i, buf) in self.buffers.iter_mut().enumerate() {
29            let recv_e = opcode::Recv::new(fd, buf.as_mut_ptr(), buf.len() as u32)
30                .build()
31                .user_data(i as u64);
32            
33            unsafe {
34                self.ring.submission().push(&recv_e)?;
35            }
36        }
37        
38        self.ring.submit()?;
39        
40        loop {
41            // Wait for completions
42            self.ring.submit_and_wait(1)?;
43            
44            for cqe in self.ring.completion() {
45                let buf_idx = cqe.user_data() as usize;
46                let bytes_read = cqe.result() as usize;
47                
48                if bytes_read > 0 {
49                    // Process packet
50                    self.process_packet(&self.buffers[buf_idx][..bytes_read]);
51                    
52                    // Resubmit receive
53                    let buf = &mut self.buffers[buf_idx];
54                    let recv_e = opcode::Recv::new(fd, buf.as_mut_ptr(), buf.len() as u32)
55                        .build()
56                        .user_data(buf_idx as u64);
57                    
58                    unsafe {
59                        self.ring.submission().push(&recv_e)?;
60                    }
61                }
62            }
63            
64            self.ring.submit()?;
65        }
66    }
67    
68    fn process_packet(&self, data: &[u8]) {
69        // Parse market data
70    }
71}
72

Benchmark Results #

Test Setup #

Hardware: Intel Xeon E-2288G (8 cores, 3.7GHz)
Network: Mellanox ConnectX-6 (100Gbps)
OS: Ubuntu 22.04 RT kernel
Compiler: GCC 13.2 / rustc 1.75

Latency Comparison (p50/p99/p99.9 in nanoseconds)#

Operation	C++ Baseline	C++ Optimized	Rust Baseline	Rust Optimized
Order Creation	85/120/250	25/35/50	90/130/280	28/38/55
Market Data Parse	150/220/450	45/65/95	160/230/480	48/68/100
Risk Check	200/350/800	60/85/120	210/360/850	62/88/125
Order Send	300/500/1200	120/180/280	310/520/1300	125/185/290