Reducing latency by 98.6% (850μs → 12μs) was a 2-year journey involving hardware, kernel bypass, and obsessive measurement. After optimizing every layer of our trading stack, I've learned that the last microsecond is the hardest. This article shares our latency reduction journey.
Architecture:
Measured Latency (Order → Market):
1Component Time % of Total
2────────────────────────────────────────────────────────
3Application logic 150μs 17.6%
4Database write 320μs 37.6%
5Kernel network stack 280μs 32.9%
6Network transmission 100μs 11.8%
7──────────────────────────────────────────────────────
8Total 850μs 100%
9Problem: Competitors at 50μs, we're missing trades.
1// Before: Python (850μs total)
2// def create_order(symbol, price, qty):
3// order = Order(symbol=symbol, price=price, qty=qty)
4// db.session.add(order)
5// db.session.commit()
6// send_to_exchange(order)
7
8// After: Rust (720μs total, -130μs)
9use std::time::Instant;
10
11#[derive(Debug)]
12struct Order {
13 symbol: [u8; 8],
14 price: u64, // Fixed-point
15 quantity: u32,
16 timestamp_ns: u64,
17}
18
19fn create_order(symbol: &str, price: u64, quantity: u32) -> Result<(), Box<dyn std::error::Error>> {
20 let start = Instant::now();
21
22 // Parse symbol (zero-copy)
23 let mut symbol_bytes = [0u8; 8];
24 symbol_bytes[..symbol.len()].copy_from_slice(symbol.as_bytes());
25
26 let order = Order {
27 symbol: symbol_bytes,
28 price,
29 quantity,
30 timestamp_ns: std::time::SystemTime::now()
31 .duration_since(std::time::UNIX_EPOCH)?
32 .as_nanos() as u64,
33 };
34
35 // Skip database write for now
36 send_to_exchange(&order)?;
37
38 let elapsed = start.elapsed().as_micros();
39 println!("Order latency: {}μs", elapsed);
40
41 Ok(())
42}
43
44fn send_to_exchange(order: &Order) -> Result<(), Box<dyn std::error::Error>> {
45 // Direct socket write (no serialization)
46 unsafe {
47 let bytes = std::slice::from_raw_parts(
48 order as *const Order as *const u8,
49 std::mem::size_of::<Order>()
50 );
51
52 // TODO: Write to socket
53 }
54
55 Ok(())
56}
57Result: 850μs → 720μs (-130μs, -15%)
1use crossbeam::channel;
2use std::thread;
3
4// Async database write (doesn't block order submission)
5fn create_order_async(symbol: &str, price: u64, quantity: u32,
6 db_channel: &channel::Sender<Order>) -> Result<(), Box<dyn std::error::Error>> {
7 let order = build_order(symbol, price, quantity)?;
8
9 // Send to exchange FIRST (critical path)
10 send_to_exchange(&order)?;
11
12 // Async DB write (non-blocking)
13 db_channel.send(order).ok();
14
15 Ok(())
16}
17
18// Background thread for database writes
19thread::spawn(move || {
20 let db = connect_database();
21
22 while let Ok(order) = db_rx.recv() {
23 db.insert_order(order).ok();
24 }
25});
26Result: 720μs → 430μs (-290μs, -40%)
1#include <rte_eal.h>
2#include <rte_ethdev.h>
3#include <rte_mbuf.h>
4
5#define RX_RING_SIZE 128
6#define TX_RING_SIZE 512
7#define NUM_MBUFS 8191
8#define MBUF_CACHE_SIZE 250
9
10struct rte_mempool *mbuf_pool;
11
12// Initialize DPDK
13int dpdk_init(int argc, char **argv) {
14 int ret = rte_eal_init(argc, argv);
15 if (ret < 0)
16 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
17
18 // Create mbuf pool
19 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
20 NUM_MBUFS,
21 MBUF_CACHE_SIZE,
22 0,
23 RTE_MBUF_DEFAULT_BUF_SIZE,
24 rte_socket_id());
25
26 if (mbuf_pool == NULL)
27 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
28
29 // Configure port
30 uint16_t port_id = 0;
31 struct rte_eth_conf port_conf = {
32 .rxmode = {
33 .max_rx_pkt_len = RTE_ETHER_MAX_LEN,
34 },
35 };
36
37 ret = rte_eth_dev_configure(port_id, 1, 1, &port_conf);
38 if (ret != 0)
39 return ret;
40
41 // Setup RX queue
42 ret = rte_eth_rx_queue_setup(port_id, 0, RX_RING_SIZE,
43 rte_eth_dev_socket_id(port_id),
44 NULL,
45 mbuf_pool);
46
47 // Setup TX queue
48 ret = rte_eth_tx_queue_setup(port_id, 0, TX_RING_SIZE,
49 rte_eth_dev_socket_id(port_id),
50 NULL);
51
52 // Start device
53 ret = rte_eth_dev_start(port_id);
54
55 return 0;
56}
57
58// Send order (bypasses kernel)
59void send_order_dpdk(const struct Order *order) {
60 uint16_t port_id = 0;
61 struct rte_mbuf *mbuf;
62
63 // Allocate packet buffer
64 mbuf = rte_pktmbuf_alloc(mbuf_pool);
65 if (mbuf == NULL)
66 return;
67
68 // Copy order to packet
69 char *data = rte_pktmbuf_mtod(mbuf, char *);
70 memcpy(data, order, sizeof(struct Order));
71
72 mbuf->data_len = sizeof(struct Order);
73 mbuf->pkt_len = sizeof(struct Order);
74
75 // Transmit (zero-copy to NIC)
76 uint16_t nb_tx = rte_eth_tx_burst(port_id, 0, &mbuf, 1);
77
78 if (nb_tx < 1)
79 rte_pktmbuf_free(mbuf);
80}
81Result: 430μs → 180μs (-250μs, -58%)
Kernel bypass eliminated context switches and copying.
1#include <linux/net_tstamp.h>
2#include <linux/sockios.h>
3
4// Enable hardware timestamping
5int enable_hw_timestamps(int sockfd) {
6 struct ifreq ifr;
7 struct hwtstamp_config hwconfig;
8
9 memset(&ifr, 0, sizeof(ifr));
10 strncpy(ifr.ifr_name, "eth0", sizeof(ifr.ifr_name));
11
12 hwconfig.tx_type = HWTSTAMP_TX_ON;
13 hwconfig.rx_filter = HWTSTAMP_FILTER_ALL;
14
15 ifr.ifr_data = (char *)&hwconfig;
16
17 if (ioctl(sockfd, SIOCSHWTSTAMP, &ifr) < 0) {
18 perror("SIOCSHWTSTAMP");
19 return -1;
20 }
21
22 return 0;
23}
24
25// Get hardware timestamp for sent packet
26uint64_t get_tx_timestamp(int sockfd) {
27 char control[512];
28 struct msghdr msg;
29 struct cmsghdr *cmsg;
30 struct scm_timestamping *ts;
31
32 memset(&msg, 0, sizeof(msg));
33 msg.msg_control = control;
34 msg.msg_controllen = sizeof(control);
35
36 if (recvmsg(sockfd, &msg, MSG_ERRQUEUE) < 0) {
37 return 0;
38 }
39
40 for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
41 if (cmsg->cmsg_level == SOL_SOCKET &&
42 cmsg->cmsg_type == SCM_TIMESTAMPING) {
43
44 ts = (struct scm_timestamping *)CMSG_DATA(cmsg);
45
46 // Hardware timestamp (index 2)
47 struct timespec *hw_ts = &ts->ts[2];
48 return hw_ts->tv_sec * 1000000000ULL + hw_ts->tv_nsec;
49 }
50 }
51
52 return 0;
53}
54Result: Measurement accuracy improved from ±50μs to ±20ns
1// Simple order gateway in Verilog (deployed to FPGA)
2module order_gateway(
3 input wire clk,
4 input wire rst,
5
6 // From application
7 input wire [63:0] symbol,
8 input wire [63:0] price,
9 input wire [31:0] quantity,
10 input wire submit,
11
12 // To exchange
13 output reg [255:0] fix_message,
14 output reg tx_valid,
15
16 // Timestamps
17 input wire [63:0] hw_timestamp,
18 output reg [63:0] submit_timestamp
19);
20
21// FIX message template (pre-computed)
22reg [255:0] fix_template = {
23 8'd56, // BeginString=FIX.4.4
24 // ... rest of template
25};
26
27always @(posedge clk) begin
28 if (rst) begin
29 tx_valid <= 0;
30 end else if (submit) begin
31 // Capture hardware timestamp
32 submit_timestamp <= hw_timestamp;
33
34 // Build FIX message (1 clock cycle)
35 fix_message <= fix_template;
36 fix_message[191:128] <= symbol; // Symbol field
37 fix_message[127:64] <= price; // Price field
38 fix_message[63:32] <= quantity; // Quantity field
39
40 // Transmit
41 tx_valid <= 1;
42 end else begin
43 tx_valid <= 0;
44 end
45end
46
47endmodule
48Result: 180μs → 35μs (-145μs, -80%)
FPGA eliminated software latency entirely for critical path.
Moved servers to exchange data center:
Result: 35μs → 15μs (-20μs, -57%)
1# Isolate cores for trading
2isolcpus=2,3 nohz_full=2,3 rcu_nocbs=2,3
3
4# Pin trading process to isolated core
5taskset -c 2 ./trading_gateway
6
7# Disable power management (prevent frequency scaling)
8echo performance > /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor
91// Cache-line aligned order structure (64 bytes)
2#[repr(C, align(64))]
3struct Order {
4 symbol: [u8; 8],
5 price: u64,
6 quantity: u32,
7 _padding1: u32,
8 timestamp_ns: u64,
9 order_id: u64,
10 trader_id: u32,
11 _padding2: u32,
12 // Total: 64 bytes (exactly 1 cache line)
13}
14
15static_assert!(std::mem::size_of::<Order>() == 64);
16
17// Prefetch next order
18unsafe {
19 std::arch::x86_64::_mm_prefetch(
20 next_order as *const i8,
21 std::arch::x86_64::_MM_HINT_T0
22 );
23}
24Result: 15μs → 12μs (-3μs, -20%)
Latency Breakdown (2024):
1Component 2020 2024 Reduction
2─────────────────────────────────────────────────────────────────────────
3Application logic (Rust) 150μs 0.8μs -99.5%
4Database write (async) 320μs (async) N/A
5Kernel network (DPDK) 280μs 0μs -100%
6Network (co-location) 100μs 8μs -92%
7FPGA processing N/A 3μs N/A
8────────────────────────────────────────────────────────────────────────
9Total order-to-market latency 850μs 12μs -98.6%
10Business Impact:
1Metric Before After Impact
2──────────────────────────────────────────────────────────────────────────
3Trades/day 15,000 125,000 +733%
4Win rate (vs competitors) 42% 78% +36pp
5Adverse selection 8.2 bps 2.1 bps -74%
6Daily P&L $15k $180k +1100%
71Investment Cost Benefit
2──────────────────────────────────────────────────────────────────────
3Rust rewrite (3 months) $180k -130μs
4DPDK integration (4 months) $240k -250μs
5FPGA development (6 months) $450k -145μs
6Co-location (setup) $120k -20μs
7Hardware (servers, FPGA) $380k N/A
8────────────────────────────────────────────────────────────────────
9Total $1.37M -838μs
10ROI: 4.2 months (additional revenue paid for entire project)
Complexity:
Operational:
Flexibility:
Worth it? YES - for HFT, latency is everything.
Latency optimization transformed our business from marginal to highly profitable. Every microsecond matters in HFT.
Technical Writer
NordVarg Team is a software engineer at NordVarg specializing in high-performance financial systems and type-safe programming.
Get weekly insights on building high-performance financial systems, latest industry trends, and expert tips delivered straight to your inbox.