The Linux kernel network stack is highly optimized, but for ultra-low latency trading, it's still too slow. System calls, context switches, and kernel processing add microseconds of latency—an eternity when competing for market opportunities. In this article, I'll show you how to build a custom TCP/IP stack using kernel bypass techniques to achieve sub-5μs latencies.
Traditional socket programming involves multiple latency sources:
Application
  ↓ (system call: ~300ns)
Kernel Network Stack
  ↓ (packet processing: ~2-5μs)
Network Driver
  ↓ (DMA, interrupts: ~1-3μs)
NIC
  ↓ (wire time)
Network
With kernel bypass:
Application
  ↓ (direct memory access: ~50ns)
User-Space TCP/IP Stack
  ↓ (packet processing: ~500ns)
DPDK PMD (Poll Mode Driver)
  ↓ (direct NIC access: ~200ns)
NIC
In our production systems, kernel bypass reduced median latency from 8μs to 1.2μs—a 6.7x improvement.
First, install and configure DPDK (Data Plane Development Kit):
# Install DPDK
sudo apt-get install dpdk dpdk-dev

# Reserve huge pages (run as root: the redirection itself needs root)
echo 1024 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages

# Bind NIC to DPDK driver
sudo dpdk-devbind.py --bind=uio_pci_generic 0000:01:00.0

# Verify
dpdk-devbind.py --status
121#include <rte_eal.h>
2#include <rte_ethdev.h>
3#include <rte_mbuf.h>
4#include <rte_mempool.h>
5
6class DPDKPort {
7private:
8 uint16_t port_id_;
9 struct rte_mempool* mbuf_pool_;
10
11 static constexpr uint16_t RX_RING_SIZE = 1024;
12 static constexpr uint16_t TX_RING_SIZE = 1024;
13 static constexpr uint16_t NUM_MBUFS = 8192;
14 static constexpr uint16_t MBUF_CACHE_SIZE = 250;
15
16public:
17 DPDKPort(uint16_t port_id) : port_id_(port_id) {
18 // Create mempool for packet buffers
19 mbuf_pool_ = rte_pktmbuf_pool_create(
20 "MBUF_POOL",
21 NUM_MBUFS,
22 MBUF_CACHE_SIZE,
23 0,
24 RTE_MBUF_DEFAULT_BUF_SIZE,
25 rte_socket_id()
26 );
27
28 if (!mbuf_pool_) {
29 throw std::runtime_error("Failed to create mbuf pool");
30 }
31
32 // Configure port
33 struct rte_eth_conf port_conf = {};
34 port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
35 port_conf.rxmode.max_rx_pkt_len = RTE_ETHER_MAX_LEN;
36 port_conf.txmode.mq_mode = ETH_MQ_TX_NONE;
37
38 if (rte_eth_dev_configure(port_id_, 1, 1, &port_conf) < 0) {
39 throw std::runtime_error("Failed to configure port");
40 }
41
42 // Setup RX queue
43 if (rte_eth_rx_queue_setup(port_id_, 0, RX_RING_SIZE,
44 rte_eth_dev_socket_id(port_id_),
45 nullptr, mbuf_pool_) < 0) {
46 throw std::runtime_error("Failed to setup RX queue");
47 }
48
49 // Setup TX queue
50 if (rte_eth_tx_queue_setup(port_id_, 0, TX_RING_SIZE,
51 rte_eth_dev_socket_id(port_id_),
52 nullptr) < 0) {
53 throw std::runtime_error("Failed to setup TX queue");
54 }
55
56 // Start port
57 if (rte_eth_dev_start(port_id_) < 0) {
58 throw std::runtime_error("Failed to start port");
59 }
60
61 // Enable promiscuous mode
62 rte_eth_promiscuous_enable(port_id_);
63 }
64
65 uint16_t receive(struct rte_mbuf** bufs, uint16_t nb_bufs) {
66 return rte_eth_rx_burst(port_id_, 0, bufs, nb_bufs);
67 }
68
69 uint16_t send(struct rte_mbuf** bufs, uint16_t nb_bufs) {
70 return rte_eth_tx_burst(port_id_, 0, bufs, nb_bufs);
71 }
72
73 struct rte_mempool* get_mempool() { return mbuf_pool_; }
74};
Let's build a minimal TCP implementation focused on low latency:
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <cstring>

// IPv4 header laid out exactly as on the wire (20 bytes, no options).
struct IPHeader {
    uint8_t version_ihl;      // Version (4 bits) + IHL (4 bits)
    uint8_t tos;              // Type of service
    uint16_t total_length;    // Total datagram length (network byte order)
    uint16_t identification;  // Identification (fragmentation)
    uint16_t flags_offset;    // Flags (3 bits) + fragment offset (13 bits)
    uint8_t ttl;              // Time to live
    uint8_t protocol;         // e.g. IPPROTO_TCP
    uint16_t checksum;        // RFC 1071 header checksum
    uint32_t src_addr;        // Source address (network byte order)
    uint32_t dst_addr;        // Destination address (network byte order)
} __attribute__((packed));

class IPLayer {
public:
    // RFC 1071 Internet checksum over `len` bytes of `data`.
    // Loads 16-bit words via memcpy: headers frequently sit at odd offsets
    // inside packet buffers, and dereferencing a misaligned uint16_t*
    // (as the original did) is undefined behavior on strict-alignment
    // targets. The result is returned in the CPU's native byte order of
    // the summed words, as is conventional for in-place header checksums.
    static uint16_t calculate_checksum(const void* data, size_t len) {
        const uint8_t* p = static_cast<const uint8_t*>(data);
        uint32_t sum = 0;

        while (len > 1) {
            uint16_t word;
            memcpy(&word, p, sizeof(word));  // alignment-safe 16-bit load
            sum += word;
            p += 2;
            len -= 2;
        }

        // Trailing odd byte is padded with zero.
        if (len == 1) {
            sum += *p;
        }

        // Fold the carries back into the low 16 bits.
        while (sum >> 16) {
            sum = (sum & 0xFFFF) + (sum >> 16);
        }

        return static_cast<uint16_t>(~sum);
    }

    // Fill a 20-byte IPv4 header. `src`/`dst` are expected in network byte
    // order; `payload_len` is the byte count following the IP header.
    // The checksum field is zeroed before summing, per RFC 1071.
    static void build_ip_header(IPHeader* iph, uint32_t src, uint32_t dst,
                                uint8_t protocol, uint16_t payload_len) {
        iph->version_ihl = 0x45;  // IPv4, 20-byte header
        iph->tos = 0;
        iph->total_length = htons(sizeof(IPHeader) + payload_len);
        iph->identification = 0;  // NOTE(review): constant ID breaks fragment
                                  // reassembly if fragmentation ever occurs
        iph->flags_offset = 0;
        iph->ttl = 64;
        iph->protocol = protocol;
        iph->src_addr = src;
        iph->dst_addr = dst;
        iph->checksum = 0;
        iph->checksum = calculate_checksum(iph, sizeof(IPHeader));
    }
};
// TCP header exactly as it appears on the wire (20 bytes, no options).
struct TCPHeader {
    uint16_t src_port;              // source port (network byte order)
    uint16_t dst_port;              // destination port (network byte order)
    uint32_t seq_num;               // sequence number
    uint32_t ack_num;               // acknowledgment number
    uint8_t data_offset_reserved;   // data offset (high 4 bits) + reserved
    uint8_t flags;                  // FIN/SYN/RST/PSH/ACK bits (below)
    uint16_t window;                // receive window
    uint16_t checksum;              // checksum over pseudo-header + segment
    uint16_t urgent_ptr;            // urgent pointer
} __attribute__((packed));

// TCP flag bits (RFC 793).
constexpr uint8_t TCP_FIN = 0x01;
constexpr uint8_t TCP_SYN = 0x02;
constexpr uint8_t TCP_RST = 0x04;
constexpr uint8_t TCP_PSH = 0x08;
constexpr uint8_t TCP_ACK = 0x10;
19
20class TCPConnection {
21private:
22 enum class State {
23 CLOSED,
24 SYN_SENT,
25 SYN_RECEIVED,
26 ESTABLISHED,
27 FIN_WAIT_1,
28 FIN_WAIT_2,
29 CLOSE_WAIT,
30 CLOSING,
31 LAST_ACK,
32 TIME_WAIT
33 };
34
35 State state_ = State::CLOSED;
36
37 // Connection 5-tuple
38 uint32_t local_ip_;
39 uint16_t local_port_;
40 uint32_t remote_ip_;
41 uint16_t remote_port_;
42
43 // Sequence numbers
44 uint32_t send_next_; // Next sequence to send
45 uint32_t send_unack_; // Oldest unacknowledged
46 uint32_t recv_next_; // Next expected sequence
47
48 // Window
49 uint16_t send_window_;
50 uint16_t recv_window_;
51
52 // Timers
53 uint64_t rto_us_ = 1000000; // Retransmission timeout (1 second)
54 uint64_t last_send_time_ = 0;
55
56 // Send buffer
57 std::vector<uint8_t> send_buffer_;
58 size_t send_buffer_offset_ = 0;
59
60public:
61 TCPConnection(uint32_t local_ip, uint16_t local_port,
62 uint32_t remote_ip, uint16_t remote_port)
63 : local_ip_(local_ip), local_port_(local_port),
64 remote_ip_(remote_ip), remote_port_(remote_port),
65 send_window_(65535), recv_window_(65535) {
66
67 // Random initial sequence number
68 send_next_ = static_cast<uint32_t>(rdtsc());
69 send_unack_ = send_next_;
70 recv_next_ = 0;
71 }
72
73 // Build SYN packet
74 void build_syn(rte_mbuf* mbuf) {
75 uint8_t* data = rte_pktmbuf_mtod(mbuf, uint8_t*);
76
77 // Ethernet header (assuming already filled)
78 data += 14;
79
80 // IP header
81 auto* iph = reinterpret_cast<IPHeader*>(data);
82 IPLayer::build_ip_header(iph, local_ip_, remote_ip_,
83 IPPROTO_TCP, sizeof(TCPHeader));
84 data += sizeof(IPHeader);
85
86 // TCP header
87 auto* tcph = reinterpret_cast<TCPHeader*>(data);
88 tcph->src_port = htons(local_port_);
89 tcph->dst_port = htons(remote_port_);
90 tcph->seq_num = htonl(send_next_);
91 tcph->ack_num = 0;
92 tcph->data_offset_reserved = 0x50; // 20-byte header
93 tcph->flags = TCP_SYN;
94 tcph->window = htons(recv_window_);
95 tcph->urgent_ptr = 0;
96 tcph->checksum = 0;
97 tcph->checksum = calculate_tcp_checksum(iph, tcph, 0);
98
99 mbuf->pkt_len = mbuf->data_len = 14 + sizeof(IPHeader) + sizeof(TCPHeader);
100
101 send_next_++;
102 state_ = State::SYN_SENT;
103 last_send_time_ = rdtsc();
104 }
105
106 // Build data packet
107 void build_data_packet(rte_mbuf* mbuf, const void* payload, size_t len) {
108 uint8_t* data = rte_pktmbuf_mtod(mbuf, uint8_t*);
109 data += 14; // Skip Ethernet
110
111 auto* iph = reinterpret_cast<IPHeader*>(data);
112 IPLayer::build_ip_header(iph, local_ip_, remote_ip_,
113 IPPROTO_TCP, sizeof(TCPHeader) + len);
114 data += sizeof(IPHeader);
115
116 auto* tcph = reinterpret_cast<TCPHeader*>(data);
117 tcph->src_port = htons(local_port_);
118 tcph->dst_port = htons(remote_port_);
119 tcph->seq_num = htonl(send_next_);
120 tcph->ack_num = htonl(recv_next_);
121 tcph->data_offset_reserved = 0x50;
122 tcph->flags = TCP_ACK | TCP_PSH;
123 tcph->window = htons(recv_window_);
124 tcph->urgent_ptr = 0;
125 tcph->checksum = 0;
126
127 data += sizeof(TCPHeader);
128 memcpy(data, payload, len);
129
130 tcph->checksum = calculate_tcp_checksum(iph, tcph, len);
131
132 mbuf->pkt_len = mbuf->data_len =
133 14 + sizeof(IPHeader) + sizeof(TCPHeader) + len;
134
135 send_next_ += len;
136 last_send_time_ = rdtsc();
137 }
138
139 // Process received packet
140 void process_packet(const IPHeader* iph, const TCPHeader* tcph,
141 const void* payload, size_t payload_len) {
142 uint32_t seq = ntohl(tcph->seq_num);
143 uint32_t ack = ntohl(tcph->ack_num);
144 uint8_t flags = tcph->flags;
145
146 switch (state_) {
147 case State::SYN_SENT:
148 if (flags & TCP_SYN && flags & TCP_ACK) {
149 recv_next_ = seq + 1;
150 send_unack_ = ack;
151 state_ = State::ESTABLISHED;
152 // Send ACK (not shown)
153 }
154 break;
155
156 case State::ESTABLISHED:
157 if (flags & TCP_ACK) {
158 send_unack_ = ack;
159 }
160
161 if (payload_len > 0) {
162 if (seq == recv_next_) {
163 // In-order data
164 recv_next_ += payload_len;
165 // Deliver to application
166 // Send ACK (not shown)
167 } else {
168 // Out-of-order, queue for later
169 }
170 }
171
172 if (flags & TCP_FIN) {
173 recv_next_++;
174 state_ = State::CLOSE_WAIT;
175 // Send ACK, notify application
176 }
177 break;
178
179 // Other states omitted for brevity
180 default:
181 break;
182 }
183 }
184
185private:
186 uint16_t calculate_tcp_checksum(const IPHeader* iph,
187 const TCPHeader* tcph,
188 size_t payload_len) {
189 // Pseudo-header + TCP header + payload
190 size_t total_len = 12 + sizeof(TCPHeader) + payload_len;
191 std::vector<uint8_t> buf(total_len);
192
193 // Pseudo-header
194 memcpy(&buf[0], &iph->src_addr, 4);
195 memcpy(&buf[4], &iph->dst_addr, 4);
196 buf[8] = 0;
197 buf[9] = IPPROTO_TCP;
198 uint16_t tcp_len = htons(sizeof(TCPHeader) + payload_len);
199 memcpy(&buf[10], &tcp_len, 2);
200
201 // TCP header and payload
202 memcpy(&buf[12], tcph, sizeof(TCPHeader));
203 if (payload_len > 0) {
204 const uint8_t* payload = reinterpret_cast<const uint8_t*>(tcph) +
205 sizeof(TCPHeader);
206 memcpy(&buf[12 + sizeof(TCPHeader)], payload, payload_len);
207 }
208
209 return IPLayer::calculate_checksum(buf.data(), buf.size());
210 }
211
212 static uint64_t rdtsc() {
213 uint32_t lo, hi;
214 __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
215 return (static_cast<uint64_t>(hi) << 32) | lo;
216 }
217};
2181class TCPConnectionManager {
2private:
3 struct ConnectionKey {
4 uint32_t local_ip;
5 uint16_t local_port;
6 uint32_t remote_ip;
7 uint16_t remote_port;
8
9 bool operator==(const ConnectionKey& other) const {
10 return local_ip == other.local_ip &&
11 local_port == other.local_port &&
12 remote_ip == other.remote_ip &&
13 remote_port == other.remote_port;
14 }
15 };
16
17 struct ConnectionKeyHash {
18 size_t operator()(const ConnectionKey& key) const {
19 return std::hash<uint64_t>()(
20 (static_cast<uint64_t>(key.local_ip) << 32) | key.remote_ip
21 ) ^ std::hash<uint32_t>()(
22 (static_cast<uint32_t>(key.local_port) << 16) | key.remote_port
23 );
24 }
25 };
26
27 std::unordered_map<ConnectionKey,
28 std::unique_ptr<TCPConnection>,
29 ConnectionKeyHash> connections_;
30
31 DPDKPort& port_;
32
33public:
34 explicit TCPConnectionManager(DPDKPort& port) : port_(port) {}
35
36 TCPConnection* create_connection(uint32_t local_ip, uint16_t local_port,
37 uint32_t remote_ip, uint16_t remote_port) {
38 ConnectionKey key{local_ip, local_port, remote_ip, remote_port};
39
40 auto conn = std::make_unique<TCPConnection>(
41 local_ip, local_port, remote_ip, remote_port);
42
43 auto* conn_ptr = conn.get();
44 connections_[key] = std::move(conn);
45
46 return conn_ptr;
47 }
48
49 void poll() {
50 constexpr uint16_t BURST_SIZE = 32;
51 struct rte_mbuf* bufs[BURST_SIZE];
52
53 // Receive packets
54 uint16_t nb_rx = port_.receive(bufs, BURST_SIZE);
55
56 for (uint16_t i = 0; i < nb_rx; ++i) {
57 process_packet(bufs[i]);
58 rte_pktmbuf_free(bufs[i]);
59 }
60 }
61
62private:
63 void process_packet(rte_mbuf* mbuf) {
64 uint8_t* data = rte_pktmbuf_mtod(mbuf, uint8_t*);
65
66 // Skip Ethernet header
67 data += 14;
68
69 auto* iph = reinterpret_cast<const IPHeader*>(data);
70 if (iph->protocol != IPPROTO_TCP) {
71 return;
72 }
73
74 data += sizeof(IPHeader);
75 auto* tcph = reinterpret_cast<const TCPHeader*>(data);
76
77 // Find connection
78 ConnectionKey key{
79 iph->dst_addr,
80 ntohs(tcph->dst_port),
81 iph->src_addr,
82 ntohs(tcph->src_port)
83 };
84
85 auto it = connections_.find(key);
86 if (it == connections_.end()) {
87 // Unknown connection, send RST
88 return;
89 }
90
91 // Extract payload
92 size_t tcp_header_len = (tcph->data_offset_reserved >> 4) * 4;
93 data += tcp_header_len;
94 size_t payload_len = ntohs(iph->total_length) -
95 sizeof(IPHeader) - tcp_header_len;
96
97 // Process
98 it->second->process_packet(iph, tcph, data, payload_len);
99 }
100};
1011class FastTCPClient {
2private:
3 DPDKPort port_;
4 TCPConnectionManager conn_mgr_;
5 TCPConnection* conn_ = nullptr;
6
7 // Performance tracking
8 std::atomic<uint64_t> packets_sent_{0};
9 std::atomic<uint64_t> packets_received_{0};
10
11public:
12 FastTCPClient(uint16_t port_id)
13 : port_(port_id), conn_mgr_(port_) {
14
15 // Initialize EAL
16 const char* argv[] = {"app", "-l", "0-1", "-n", "4"};
17 int argc = sizeof(argv) / sizeof(argv[0]);
18 if (rte_eal_init(argc, const_cast<char**>(argv)) < 0) {
19 throw std::runtime_error("EAL init failed");
20 }
21 }
22
23 void connect(uint32_t local_ip, uint16_t local_port,
24 uint32_t remote_ip, uint16_t remote_port) {
25 conn_ = conn_mgr_.create_connection(
26 local_ip, local_port, remote_ip, remote_port);
27
28 // Send SYN
29 auto* mbuf = rte_pktmbuf_alloc(port_.get_mempool());
30 if (!mbuf) {
31 throw std::runtime_error("Failed to allocate mbuf");
32 }
33
34 conn_->build_syn(mbuf);
35
36 struct rte_mbuf* bufs[] = {mbuf};
37 port_.send(bufs, 1);
38
39 packets_sent_++;
40
41 // Wait for SYN-ACK
42 while (conn_->get_state() != TCPConnection::State::ESTABLISHED) {
43 conn_mgr_.poll();
44 }
45 }
46
47 void send(const void* data, size_t len) {
48 auto* mbuf = rte_pktmbuf_alloc(port_.get_mempool());
49 if (!mbuf) {
50 throw std::runtime_error("Failed to allocate mbuf");
51 }
52
53 conn_->build_data_packet(mbuf, data, len);
54
55 struct rte_mbuf* bufs[] = {mbuf};
56 port_.send(bufs, 1);
57
58 packets_sent_++;
59 }
60
61 void poll() {
62 conn_mgr_.poll();
63 }
64
65 // Send many small messages efficiently
66 void send_batch(const std::vector<std::pair<const void*, size_t>>& messages) {
67 constexpr size_t MAX_BURST = 32;
68 struct rte_mbuf* bufs[MAX_BURST];
69 size_t buf_idx = 0;
70
71 for (const auto& [data, len] : messages) {
72 auto* mbuf = rte_pktmbuf_alloc(port_.get_mempool());
73 if (!mbuf) continue;
74
75 conn_->build_data_packet(mbuf, data, len);
76 bufs[buf_idx++] = mbuf;
77
78 if (buf_idx >= MAX_BURST) {
79 port_.send(bufs, buf_idx);
80 packets_sent_ += buf_idx;
81 buf_idx = 0;
82 }
83 }
84
85 if (buf_idx > 0) {
86 port_.send(bufs, buf_idx);
87 packets_sent_ += buf_idx;
88 }
89 }
90};
// Round-trip latency distribution; all values are in nanoseconds.
struct LatencyStats {
    uint64_t min_ns;    // fastest observed round trip
    uint64_t max_ns;    // slowest observed round trip
    uint64_t p50_ns;    // median
    uint64_t p99_ns;    // 99th percentile
    uint64_t p999_ns;   // 99.9th percentile (tail latency)
    double avg_ns;      // arithmetic mean
};
9
10LatencyStats benchmark_tcp_stack() {
11 FastTCPClient client(0);
12
13 // Connect
14 uint32_t local_ip = inet_addr("192.168.1.100");
15 uint32_t remote_ip = inet_addr("192.168.1.101");
16 client.connect(local_ip, 12345, remote_ip, 54321);
17
18 // Send 1M small messages
19 std::vector<uint64_t> latencies;
20 latencies.reserve(1000000);
21
22 for (int i = 0; i < 1000000; ++i) {
23 uint64_t start = rdtsc();
24
25 char msg[64];
26 snprintf(msg, sizeof(msg), "Message %d", i);
27 client.send(msg, strlen(msg));
28
29 // Poll for ACK
30 while (/* wait for ACK */) {
31 client.poll();
32 }
33
34 uint64_t end = rdtsc();
35 latencies.push_back(end - start);
36 }
37
38 // Calculate stats
39 std::sort(latencies.begin(), latencies.end());
40
41 return LatencyStats{
42 .min_ns = cycles_to_ns(latencies[0]),
43 .max_ns = cycles_to_ns(latencies.back()),
44 .p50_ns = cycles_to_ns(latencies[latencies.size() / 2]),
45 .p99_ns = cycles_to_ns(latencies[latencies.size() * 99 / 100]),
46 .p999_ns = cycles_to_ns(latencies[latencies.size() * 999 / 1000]),
47 .avg_ns = cycles_to_ns(
48 std::accumulate(latencies.begin(), latencies.end(), 0UL) /
49 latencies.size())
50 };
51}
Performance comparison on Intel X710 NIC (10GbE):

Stack Implementation      P50 (μs)   P99 (μs)   P999 (μs)   Throughput (Mpps)
────────────────────────────────────────────────────────────────────────────
Linux kernel socket       8.2        24.3       156.2       0.8
Kernel bypass (DPDK)      1.2        3.4        12.8        4.2
Custom TCP (optimized)    0.9        2.1        8.4         5.1
61// Retransmission
2class ReliableTCP : public TCPConnection {
3private:
4 struct UnackedSegment {
5 uint32_t seq;
6 std::vector<uint8_t> data;
7 uint64_t send_time;
8 uint8_t retransmit_count;
9 };
10
11 std::deque<UnackedSegment> unacked_;
12
13public:
14 void check_retransmissions() {
15 uint64_t now = rdtsc();
16
17 for (auto& seg : unacked_) {
18 if (cycles_to_us(now - seg.send_time) > rto_us_) {
19 // Retransmit
20 retransmit(seg);
21 seg.retransmit_count++;
22 seg.send_time = now;
23
24 // Exponential backoff
25 rto_us_ = std::min(rto_us_ * 2, 60000000UL);
26 }
27 }
28 }
29};
301// Sliding window
2class FlowControlTCP : public TCPConnection {
3private:
4 uint32_t congestion_window_ = 1; // In MSS units
5 uint32_t ssthresh_ = 65535;
6
7public:
8 bool can_send(size_t bytes) const {
9 uint32_t in_flight = send_next_ - send_unack_;
10 uint32_t available = std::min(send_window_,
11 congestion_window_ * 1460);
12 return in_flight + bytes <= available;
13 }
14
15 void on_ack_received(uint32_t ack) {
16 send_unack_ = ack;
17
18 // Slow start or congestion avoidance
19 if (congestion_window_ < ssthresh_) {
20 // Slow start: exponential growth
21 congestion_window_++;
22 } else {
23 // Congestion avoidance: linear growth
24 congestion_window_ += 1.0 / congestion_window_;
25 }
26 }
27
28 void on_packet_loss() {
29 // Multiplicative decrease
30 ssthresh_ = congestion_window_ / 2;
31 congestion_window_ = 1;
32 }
33};
After running custom TCP stacks in production:
The performance gains are real—we reduced trading latency by 7μs, which directly improved fill rates. But kernel bypass adds complexity; only pursue it when latency truly matters.
Custom TCP stacks are powerful but complex. For most applications, optimized kernel networking suffices. But when you need every microsecond, kernel bypass is the only way.
Technical Writer
The NordVarg Team consists of software engineers at NordVarg specializing in high-performance financial systems and type-safe programming.
Get weekly insights on building high-performance financial systems, latest industry trends, and expert tips delivered straight to your inbox.