After implementing kernel bypass for a trading system processing 2.4M packets/sec with 850ns median latency, I've learned that eliminating kernel overhead is critical for sub-microsecond performance—but complexity increases dramatically. This article compares DPDK, io_uring, and XDP with production metrics.
Traditional kernel networking:
Kernel bypass benefits:
Our latency metrics (production):
Data Plane Development Kit provides complete userspace networking.
// dpdk_market_data.c
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_ring.h>

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>     /* strerror() in port_init() */
#include <inttypes.h>   /* PRIu64 for portable uint64_t printing */
#include <unistd.h>     /* sleep() in main() */
#include <netinet/in.h> /* IPPROTO_UDP */
#include <arpa/inet.h>
/* Ring/pool sizing for a single RX/TX queue pair. */
#define RX_RING_SIZE 1024
#define TX_RING_SIZE 1024
#define NUM_MBUFS 8191          /* mempool size (2^n - 1 per DPDK guidance) */
#define MBUF_CACHE_SIZE 250     /* per-lcore mbuf cache */
#define BURST_SIZE 32           /* packets per rte_eth_rx_burst() call */

// Market data packet structure -- wire format, hence packed.
struct __attribute__((packed)) market_data_msg {
	uint32_t sequence;   // feed sequence number
	uint64_t timestamp;  // exchange timestamp -- NOTE(review): units/epoch not shown here; confirm against feed spec
	char symbol[8];      // instrument symbol; NOT guaranteed NUL-terminated when exactly 8 chars
	uint64_t price;      // Fixed point: price * 10000
	uint64_t quantity;
	uint8_t side;        // 0=bid, 1=ask
};
29
// Statistics: written by the RX lcore (lcore_main/process_market_data),
// read by main()'s print loop.
// NOTE(review): plain non-atomic fields shared across cores -- print_stats()
// may observe stale or torn values. Acceptable for periodic monitoring only.
struct port_stats {
	uint64_t rx_packets;         // market-data packets accepted
	uint64_t rx_bytes;           // sum of pkt_len for accepted packets
	uint64_t rx_drops;           // declared but never updated in this demo
	uint64_t processing_cycles;  // sum of TSC deltas (rx burst -> processed)
	uint64_t min_latency_ns;     // 0 means "not yet set"
	uint64_t max_latency_ns;
	uint64_t total_latency_ns;   // sum; average derived in print_stats()
};

static struct port_stats stats = {0};
42
// Default port configuration: RSS over IP/UDP on RX, no TX multiqueue.
// NOTE(review): uses pre-21.11 DPDK macro spellings (ETH_MQ_RX_RSS,
// ETH_RSS_*, max_rx_pkt_len); newer releases renamed these to RTE_ETH_*.
static const struct rte_eth_conf port_conf_default = {
    .rxmode = {
        .max_rx_pkt_len = RTE_ETHER_MAX_LEN,
        .mq_mode = ETH_MQ_RX_RSS,
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = NULL,                     // driver-default RSS key
            .rss_hf = ETH_RSS_IP | ETH_RSS_UDP,  // hash on IP + UDP fields
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};
59
/**
 * Bring up one Ethernet port: configure a single RX and TX queue, start
 * the device, and enable promiscuous mode.
 *
 * The call order below (configure -> adjust descriptors -> queue setup ->
 * start -> promiscuous) is the sequence required by the ethdev API.
 *
 * @param port       DPDK port id to initialize.
 * @param mbuf_pool  Mempool backing the RX queue's packet buffers.
 * @return 0 on success; -1 for an invalid port, otherwise the negative
 *         errno-style code from the failing ethdev call.
 */
static int port_init(uint16_t port, struct rte_mempool *mbuf_pool)
{
    struct rte_eth_conf port_conf = port_conf_default;
    const uint16_t rx_rings = 1, tx_rings = 1;
    uint16_t nb_rxd = RX_RING_SIZE;
    uint16_t nb_txd = TX_RING_SIZE;
    int retval;
    struct rte_eth_dev_info dev_info;
    struct rte_eth_txconf txconf;

    if (!rte_eth_dev_is_valid_port(port))
        return -1;

    retval = rte_eth_dev_info_get(port, &dev_info);
    if (retval != 0) {
        printf("Error getting device info: %s\n", strerror(-retval));
        return retval;
    }

    // Configure device with our RSS/TX settings
    retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
    if (retval != 0)
        return retval;

    // Clamp ring sizes to what the device actually supports
    retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &nb_rxd, &nb_txd);
    if (retval != 0)
        return retval;

    // Setup RX queue on the port's own NUMA socket (NULL = default rxconf)
    retval = rte_eth_rx_queue_setup(port, 0, nb_rxd,
            rte_eth_dev_socket_id(port), NULL, mbuf_pool);
    if (retval < 0)
        return retval;

    // Setup TX queue, inheriting the device's default txconf + offloads
    txconf = dev_info.default_txconf;
    txconf.offloads = port_conf.txmode.offloads;
    retval = rte_eth_tx_queue_setup(port, 0, nb_txd,
            rte_eth_dev_socket_id(port), &txconf);
    if (retval < 0)
        return retval;

    // Start device
    retval = rte_eth_dev_start(port);
    if (retval < 0)
        return retval;

    // Promiscuous mode: receive multicast feed traffic regardless of MAC
    retval = rte_eth_promiscuous_enable(port);
    if (retval != 0)
        return retval;

    return 0;
}
118
119/**
120 * Process market data packet
121 */
122static inline void process_market_data(
123 struct market_data_msg *msg,
124 uint64_t arrival_time
125)
126{
127 uint64_t latency_cycles = rte_get_tsc_cycles() - arrival_time;
128 uint64_t latency_ns = (latency_cycles * 1000000000ULL) / rte_get_tsc_hz();
129
130 // Update statistics
131 stats.processing_cycles += latency_cycles;
132 stats.total_latency_ns += latency_ns;
133
134 if (latency_ns < stats.min_latency_ns || stats.min_latency_ns == 0)
135 stats.min_latency_ns = latency_ns;
136 if (latency_ns > stats.max_latency_ns)
137 stats.max_latency_ns = latency_ns;
138
139 // Process message (e.g., update order book)
140 // In production: lock-free order book update
141 double price = (double)msg->price / 10000.0;
142
143 // Example: just print for demo
144 if (stats.rx_packets % 100000 == 0) {
145 printf("Seq %u: %s %.4f x %lu [latency: %lu ns]\n",
146 msg->sequence, msg->symbol, price,
147 msg->quantity, latency_ns);
148 }
149}
150
151/**
152 * Main packet processing loop
153 */
154static int lcore_main(void *arg)
155{
156 uint16_t port = *(uint16_t *)arg;
157
158 printf("Core %u processing packets from port %u\n",
159 rte_lcore_id(), port);
160
161 struct rte_mbuf *bufs[BURST_SIZE];
162
163 while (true) {
164 // Receive burst of packets
165 const uint16_t nb_rx = rte_eth_rx_burst(port, 0, bufs, BURST_SIZE);
166
167 if (unlikely(nb_rx == 0))
168 continue;
169
170 // Process each packet
171 for (uint16_t i = 0; i < nb_rx; i++) {
172 struct rte_mbuf *m = bufs[i];
173
174 // Get timestamp immediately
175 uint64_t arrival_time = rte_get_tsc_cycles();
176
177 // Parse Ethernet header
178 struct rte_ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
179
180 // Check for IPv4
181 if (eth_hdr->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) {
182 struct rte_ipv4_hdr *ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
183
184 // Check for UDP
185 if (ip_hdr->next_proto_id == IPPROTO_UDP) {
186 struct rte_udp_hdr *udp_hdr = (struct rte_udp_hdr *)(
187 (unsigned char *)ip_hdr + sizeof(struct rte_ipv4_hdr)
188 );
189
190 // Extract payload
191 struct market_data_msg *msg = (struct market_data_msg *)(udp_hdr + 1);
192
193 // Process message
194 process_market_data(msg, arrival_time);
195
196 stats.rx_packets++;
197 stats.rx_bytes += m->pkt_len;
198 }
199 }
200
201 // Free packet buffer
202 rte_pktmbuf_free(m);
203 }
204 }
205
206 return 0;
207}
208
209/**
210 * Print statistics
211 */
212static void print_stats(void)
213{
214 printf("\n=== DPDK Statistics ===\n");
215 printf("RX Packets: %lu\n", stats.rx_packets);
216 printf("RX Bytes: %lu\n", stats.rx_bytes);
217 printf("RX Drops: %lu\n", stats.rx_drops);
218
219 if (stats.rx_packets > 0) {
220 uint64_t avg_latency = stats.total_latency_ns / stats.rx_packets;
221 printf("Min Latency: %lu ns\n", stats.min_latency_ns);
222 printf("Avg Latency: %lu ns\n", avg_latency);
223 printf("Max Latency: %lu ns\n", stats.max_latency_ns);
224
225 double mpps = (double)stats.rx_packets / 1000000.0;
226 printf("Throughput: %.2f Mpps\n", mpps);
227 }
228}
229
/**
 * Entry point: initialize the EAL, bring up port 0, launch the RX loop on
 * the first worker lcore, and print statistics from this lcore forever.
 */
int main(int argc, char *argv[])
{
    struct rte_mempool *mbuf_pool;
    unsigned nb_ports;
    uint16_t portid = 0;   // single-port demo

    // Initialize EAL; consumes DPDK's own argv flags (-l, -n, ...)
    int ret = rte_eal_init(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");

    argc -= ret;
    argv += ret;

    nb_ports = rte_eth_dev_count_avail();
    if (nb_ports < 1)
        rte_exit(EXIT_FAILURE, "No Ethernet ports available\n");

    printf("Found %u ports\n", nb_ports);

    // Create mbuf pool on this lcore's NUMA socket
    mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", NUM_MBUFS,
        MBUF_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());

    if (mbuf_pool == NULL)
        rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

    // Initialize port
    if (port_init(portid, mbuf_pool) != 0)
        rte_exit(EXIT_FAILURE, "Cannot init port %u\n", portid);

    printf("Port %u initialized successfully\n", portid);

    // Launch processing on the first worker lcore. &portid stays valid
    // because this function never returns while the worker runs.
    rte_eal_remote_launch(lcore_main, &portid, rte_get_next_lcore(-1, 1, 0));

    // Print stats every 5 seconds
    while (true) {
        sleep(5);
        print_stats();
    }

    // NOTE(review): unreachable -- the loop above never exits; kept to
    // show the intended shutdown sequence.
    rte_eth_dev_stop(portid);
    rte_eth_dev_close(portid);
    rte_eal_cleanup();

    return 0;
}
# Makefile for DPDK application
# NOTE(review): DPDK_PATH is defined but never referenced; pkg-config
# resolves all DPDK compile/link paths below.
DPDK_PATH = /usr/local/share/dpdk

PKGCONF = pkg-config

# -march=native ties the binary to the build host's CPU features.
CFLAGS += -O3 -march=native
CFLAGS += $(shell $(PKGCONF) --cflags libdpdk)
LDFLAGS += $(shell $(PKGCONF) --libs libdpdk)

TARGET = dpdk_market_data

all: $(TARGET)

$(TARGET): dpdk_market_data.c
	$(CC) $(CFLAGS) $< -o $@ $(LDFLAGS)

clean:
	rm -f $(TARGET)

# Run with huge pages and isolated cores.
# EAL args: -l 2-3 = lcores 2-3, -n 4 = memory channels; app arg -p 0x1 = port mask.
run: $(TARGET)
	sudo ./$(TARGET) -l 2-3 -n 4 --proc-type=primary -- -p 0x1
io_uring provides Linux asynchronous I/O with drastically reduced system-call overhead.
// io_uring_receiver.c
#include <liburing.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>     /* uint64_t / uintptr_t -- was missing */
#include <inttypes.h>   /* PRIu64 for portable uint64_t printing */
#include <unistd.h>
#include <time.h>
#define QUEUE_DEPTH 256    /* SQ/CQ ring entries */
#define BUFFER_SIZE 2048   /* bytes per receive buffer */
#define NUM_BUFFERS 512    /* slots in the registered buffer pool */

struct app_context {
	struct io_uring ring;
	int sockfd;   // UDP socket all receives are posted against

	// Buffer pool: one contiguous allocation of NUM_BUFFERS slots
	void *buffers;
	int buffer_size;
	int num_buffers;

	// Statistics (single-threaded; updated in process_packet())
	uint64_t packets_received;
	uint64_t total_latency_ns;  // sum; average printed at exit
	uint64_t min_latency_ns;    // 0 means "not yet set"
	uint64_t max_latency_ns;
};
31
32static inline uint64_t get_time_ns(void)
33{
34 struct timespec ts;
35 clock_gettime(CLOCK_MONOTONIC, &ts);
36 return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
37}
38
39/**
40 * Initialize io_uring
41 */
42int init_io_uring(struct app_context *ctx)
43{
44 struct io_uring_params params;
45
46 memset(¶ms, 0, sizeof(params));
47
48 // Use SQ polling to avoid system calls
49 params.flags = IORING_SETUP_SQPOLL;
50 params.sq_thread_idle = 1000; // 1 second idle timeout
51
52 int ret = io_uring_queue_init_params(QUEUE_DEPTH, &ctx->ring, ¶ms);
53 if (ret < 0) {
54 fprintf(stderr, "io_uring_queue_init: %s\n", strerror(-ret));
55 return ret;
56 }
57
58 // Register buffer pool
59 struct iovec iov[NUM_BUFFERS];
60 ctx->buffers = malloc(NUM_BUFFERS * BUFFER_SIZE);
61 ctx->buffer_size = BUFFER_SIZE;
62 ctx->num_buffers = NUM_BUFFERS;
63
64 for (int i = 0; i < NUM_BUFFERS; i++) {
65 iov[i].iov_base = ctx->buffers + i * BUFFER_SIZE;
66 iov[i].iov_len = BUFFER_SIZE;
67 }
68
69 ret = io_uring_register_buffers(&ctx->ring, iov, NUM_BUFFERS);
70 if (ret) {
71 fprintf(stderr, "io_uring_register_buffers: %s\n", strerror(-ret));
72 return ret;
73 }
74
75 printf("io_uring initialized with %d buffers\n", NUM_BUFFERS);
76
77 return 0;
78}
79
/**
 * Create a UDP socket bound to INADDR_ANY on the given port, with
 * SO_REUSEADDR and an enlarged receive buffer.
 *
 * @return the socket descriptor, or -1 on error (perror already called).
 */
int setup_socket(int port)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        perror("socket");
        return -1;
    }

    // Best-effort tuning; results intentionally unchecked
    int one = 1;
    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

    int rcv_bytes = 8 * 1024 * 1024; // 8MB
    setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv_bytes, sizeof(rcv_bytes));

    struct sockaddr_in local = {
        .sin_family = AF_INET,
        .sin_port = htons(port),
    };
    local.sin_addr.s_addr = INADDR_ANY;   // remaining fields zeroed above

    if (bind(fd, (struct sockaddr *)&local, sizeof(local)) != 0) {
        perror("bind");
        close(fd);
        return -1;
    }

    printf("Socket bound to port %d\n", port);

    return fd;
}
115
116/**
117 * Submit receive request
118 */
119void submit_recv(struct app_context *ctx, int buffer_id)
120{
121 struct io_uring_sqe *sqe = io_uring_get_sqe(&ctx->ring);
122 if (!sqe) {
123 fprintf(stderr, "Could not get SQE\n");
124 return;
125 }
126
127 // Prepare recvmsg with registered buffer
128 io_uring_prep_recv(sqe, ctx->sockfd,
129 ctx->buffers + buffer_id * BUFFER_SIZE,
130 BUFFER_SIZE, 0);
131
132 // Set buffer ID
133 sqe->flags |= IOSQE_BUFFER_SELECT;
134 sqe->buf_group = 0;
135
136 // User data: buffer ID + timestamp
137 io_uring_sqe_set_data(sqe, (void *)(uintptr_t)buffer_id);
138
139 io_uring_submit(&ctx->ring);
140}
141
142/**
143 * Process received packet
144 */
145void process_packet(struct app_context *ctx, void *data, int len, uint64_t start_ns)
146{
147 uint64_t end_ns = get_time_ns();
148 uint64_t latency_ns = end_ns - start_ns;
149
150 ctx->packets_received++;
151 ctx->total_latency_ns += latency_ns;
152
153 if (latency_ns < ctx->min_latency_ns || ctx->min_latency_ns == 0)
154 ctx->min_latency_ns = latency_ns;
155 if (latency_ns > ctx->max_latency_ns)
156 ctx->max_latency_ns = latency_ns;
157
158 // Process data (simplified)
159 if (ctx->packets_received % 100000 == 0) {
160 printf("Received %lu packets, latency: %lu ns\n",
161 ctx->packets_received, latency_ns);
162 }
163}
164
/**
 * Main event loop: keep half of the buffer pool posted as outstanding
 * receives, harvest completions, and re-arm each buffer after its packet
 * is processed. Loops forever; returns only if io_uring_wait_cqe() fails.
 */
void event_loop(struct app_context *ctx)
{
    struct io_uring_cqe *cqe;

    // Submit initial receive requests (half the pool stays in flight,
    // matching QUEUE_DEPTH == NUM_BUFFERS / 2)
    for (int i = 0; i < NUM_BUFFERS / 2; i++) {
        submit_recv(ctx, i);
    }

    while (1) {
        // Block until at least one completion is available
        int ret = io_uring_wait_cqe(&ctx->ring, &cqe);
        if (ret < 0) {
            fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
            break;
        }

        // Timestamp as close to the completion as possible; start of the
        // "processing latency" window measured in process_packet()
        uint64_t arrival_time = get_time_ns();

        // Process completion
        if (cqe->res < 0) {
            fprintf(stderr, "Async recv failed: %s\n", strerror(-cqe->res));
        } else if (cqe->res > 0) {
            // user_data carries the buffer slot this recv was posted with
            int buffer_id = (int)(uintptr_t)io_uring_cqe_get_data(cqe);
            void *buffer = ctx->buffers + buffer_id * BUFFER_SIZE;

            process_packet(ctx, buffer, cqe->res, arrival_time);

            // Re-arm the same buffer for the next datagram
            submit_recv(ctx, buffer_id);
        }
        // NOTE(review): when cqe->res == 0 (zero-length datagram) or the
        // recv failed, the buffer is never resubmitted -- the in-flight
        // pool shrinks permanently. Worth fixing before production use.

        io_uring_cqe_seen(&ctx->ring, cqe);
    }
}
206
207int main(int argc, char *argv[])
208{
209 struct app_context ctx = {0};
210
211 int port = 12345;
212 if (argc > 1)
213 port = atoi(argv[1]);
214
215 // Setup socket
216 ctx.sockfd = setup_socket(port);
217 if (ctx.sockfd < 0)
218 return 1;
219
220 // Initialize io_uring
221 if (init_io_uring(&ctx) < 0) {
222 close(ctx.sockfd);
223 return 1;
224 }
225
226 printf("Starting event loop...\n");
227
228 // Run event loop
229 event_loop(&ctx);
230
231 // Cleanup
232 io_uring_queue_exit(&ctx.ring);
233 close(ctx.sockfd);
234 free(ctx.buffers);
235
236 // Print statistics
237 if (ctx.packets_received > 0) {
238 printf("\n=== Statistics ===\n");
239 printf("Packets: %lu\n", ctx.packets_received);
240 printf("Min latency: %lu ns\n", ctx.min_latency_ns);
241 printf("Avg latency: %lu ns\n",
242 ctx.total_latency_ns / ctx.packets_received);
243 printf("Max latency: %lu ns\n", ctx.max_latency_ns);
244 }
245
246 return 0;
247}
XDP (eXpress Data Path) provides eBPF-based packet processing at the driver level.
// xdp_filter.c -- XDP/eBPF program: classify and redirect market data.
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#define MARKET_DATA_PORT 12345   /* UDP destination port of the feed */

// AF_XDP redirect map: RX-queue index -> AF_XDP socket fd, populated by
// the userspace loader (xsk_socket__create binds an entry per queue).
struct {
    __uint(type, BPF_MAP_TYPE_XSKMAP);
    __uint(max_entries, 64);
    __uint(key_size, sizeof(int));
    __uint(value_size, sizeof(int));
} xsks_map SEC(".maps");

// Per-CPU packet counter (key 0); read and summed from userspace.
struct {
    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
    __uint(max_entries, 1);
    __type(key, __u32);
    __type(value, __u64);
} stats_map SEC(".maps");
26
27/**
28 * XDP program: filter market data packets
29 */
30SEC("xdp_sock")
31int xdp_market_data_filter(struct xdp_md *ctx)
32{
33 void *data_end = (void *)(long)ctx->data_end;
34 void *data = (void *)(long)ctx->data;
35
36 // Parse Ethernet header
37 struct ethhdr *eth = data;
38 if ((void *)(eth + 1) > data_end)
39 return XDP_DROP;
40
41 // Only IPv4
42 if (eth->h_proto != bpf_htons(ETH_P_IP))
43 return XDP_PASS;
44
45 // Parse IP header
46 struct iphdr *ip = (struct iphdr *)(eth + 1);
47 if ((void *)(ip + 1) > data_end)
48 return XDP_DROP;
49
50 // Only UDP
51 if (ip->protocol != IPPROTO_UDP)
52 return XDP_PASS;
53
54 // Parse UDP header
55 struct udphdr *udp = (struct udphdr *)((__u8 *)ip + (ip->ihl * 4));
56 if ((void *)(udp + 1) > data_end)
57 return XDP_DROP;
58
59 // Check destination port
60 if (bpf_ntohs(udp->dest) != MARKET_DATA_PORT)
61 return XDP_PASS;
62
63 // Update statistics
64 __u32 key = 0;
65 __u64 *count = bpf_map_lookup_elem(&stats_map, &key);
66 if (count)
67 __sync_fetch_and_add(count, 1);
68
69 // Redirect to AF_XDP socket
70 return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
71}
72
73char _license[] SEC("license") = "GPL";
// xdp_userspace.c
#include <linux/if_link.h>
#include <linux/if_xdp.h>
#include <bpf/libbpf.h>
#include <bpf/xsk.h>
#include <net/if.h>
#include <stdio.h>
#include <stdint.h>   /* uint64_t / uint32_t used throughout -- was missing */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
11
#define NUM_FRAMES 4096   /* UMEM frames (also fill/comp ring sizes) */
#define FRAME_SIZE 2048   /* bytes per frame */
#define BATCH_SIZE 64     /* max descriptors drained per poll */

// Application state: one AF_XDP socket plus its UMEM and rings.
struct xdp_app {
    struct xsk_socket_info {
        struct xsk_ring_cons rx;   // RX descriptor ring (kernel -> user)
        struct xsk_ring_prod tx;   // TX descriptor ring (user -> kernel)
        struct xsk_umem_info {
            struct xsk_ring_prod fq;   // fill queue: frames lent to kernel
            struct xsk_ring_cons cq;   // completion queue (TX reclaim)
            struct xsk_umem *umem;
            void *buffer;              // backing memory: NUM_FRAMES * FRAME_SIZE
        } umem;
        struct xsk_socket *xsk;
        int queue_id;   // NIC RX queue this socket is bound to
    } xsk_info;

    uint64_t packets_received;
    uint64_t total_latency_ns;   // declared but never updated in this demo
};
33
34/**
35 * Configure XDP socket
36 */
37int configure_xsk(struct xdp_app *app, const char *ifname, int queue_id)
38{
39 struct xsk_socket_config cfg;
40 struct xsk_umem_config umem_cfg = {
41 .fill_size = NUM_FRAMES,
42 .comp_size = NUM_FRAMES,
43 .frame_size = FRAME_SIZE,
44 .frame_headroom = 0,
45 .flags = 0
46 };
47
48 int ret;
49
50 // Allocate memory for UMEM
51 ret = posix_memalign(&app->xsk_info.umem.buffer,
52 getpagesize(),
53 NUM_FRAMES * FRAME_SIZE);
54 if (ret) {
55 fprintf(stderr, "Failed to allocate UMEM\n");
56 return ret;
57 }
58
59 // Create UMEM
60 ret = xsk_umem__create(&app->xsk_info.umem.umem,
61 app->xsk_info.umem.buffer,
62 NUM_FRAMES * FRAME_SIZE,
63 &app->xsk_info.umem.fq,
64 &app->xsk_info.umem.cq,
65 &umem_cfg);
66 if (ret) {
67 fprintf(stderr, "Failed to create UMEM: %d\n", ret);
68 return ret;
69 }
70
71 // Configure socket
72 cfg.rx_size = NUM_FRAMES;
73 cfg.tx_size = NUM_FRAMES;
74 cfg.libbpf_flags = 0;
75 cfg.xdp_flags = XDP_FLAGS_DRV_MODE; // Native XDP mode
76 cfg.bind_flags = XDP_ZEROCOPY; // Zero-copy mode
77
78 // Create XDP socket
79 ret = xsk_socket__create(&app->xsk_info.xsk,
80 ifname,
81 queue_id,
82 app->xsk_info.umem.umem,
83 &app->xsk_info.rx,
84 &app->xsk_info.tx,
85 &cfg);
86 if (ret) {
87 fprintf(stderr, "Failed to create XDP socket: %d\n", ret);
88 return ret;
89 }
90
91 // Populate fill queue
92 uint32_t idx;
93 ret = xsk_ring_prod__reserve(&app->xsk_info.umem.fq, NUM_FRAMES, &idx);
94 if (ret != NUM_FRAMES) {
95 fprintf(stderr, "Failed to reserve fill queue\n");
96 return -1;
97 }
98
99 for (int i = 0; i < NUM_FRAMES; i++)
100 *xsk_ring_prod__fill_addr(&app->xsk_info.umem.fq, idx++) = i * FRAME_SIZE;
101
102 xsk_ring_prod__submit(&app->xsk_info.umem.fq, NUM_FRAMES);
103
104 printf("XDP socket configured on %s queue %d\n", ifname, queue_id);
105
106 return 0;
107}
108
109/**
110 * Process packets from XDP socket
111 */
112void process_packets(struct xdp_app *app)
113{
114 struct xsk_ring_cons *rx = &app->xsk_info.rx;
115 struct xsk_ring_prod *fq = &app->xsk_info.umem.fq;
116 unsigned int rcvd;
117 uint32_t idx_rx = 0, idx_fq = 0;
118
119 rcvd = xsk_ring_cons__peek(rx, BATCH_SIZE, &idx_rx);
120 if (!rcvd)
121 return;
122
123 // Reserve slots in fill queue for recycling
124 while (xsk_ring_prod__reserve(fq, rcvd, &idx_fq) != rcvd) {
125 // Wait for space
126 }
127
128 for (int i = 0; i < rcvd; i++) {
129 uint64_t addr = xsk_ring_cons__rx_desc(rx, idx_rx)->addr;
130 uint32_t len = xsk_ring_cons__rx_desc(rx, idx_rx++)->len;
131
132 // Get packet data
133 uint8_t *pkt = (uint8_t *)xsk_umem__get_data(app->xsk_info.umem.buffer, addr);
134
135 // Process packet (simplified)
136 app->packets_received++;
137
138 if (app->packets_received % 100000 == 0) {
139 printf("Processed %lu packets\n", app->packets_received);
140 }
141
142 // Recycle frame to fill queue
143 *xsk_ring_prod__fill_addr(fq, idx_fq++) = addr;
144 }
145
146 xsk_ring_prod__submit(fq, rcvd);
147 xsk_ring_cons__release(rx, rcvd);
148}
149
/**
 * Entry point: bind an AF_XDP socket to <interface> RX queue 0 and
 * busy-poll it forever. Assumes the companion XDP program (xdp_filter.c)
 * is already attached to the interface -- TODO(review): confirm the
 * loader attaches it, since this file does not.
 */
int main(int argc, char *argv[])
{
    struct xdp_app app = {0};

    if (argc < 2) {
        fprintf(stderr, "Usage: %s <interface>\n", argv[0]);
        return 1;
    }

    const char *ifname = argv[1];
    int queue_id = 0;   // single-queue demo: NIC RX queue 0

    // Configure XDP socket
    if (configure_xsk(&app, ifname, queue_id) < 0)
        return 1;

    printf("Processing packets...\n");

    // Busy-poll: process_packets() returns immediately on an empty ring,
    // so this pins a full core.
    while (1) {
        process_packets(&app);
    }

    // Unreachable: no shutdown path; socket/UMEM are reclaimed on exit.
    return 0;
}
Production benchmark results (2.4M packets/sec workload):
Method      P50      P95      P99      P99.9    Max
------------------------------------------------------
Socket      18.4µs   42.1µs   68.5µs   124µs    890µs
io_uring    4.2µs    12.8µs   28.4µs   56µs     245µs
XDP         2.1µs    5.4µs    12.1µs   24µs     112µs
DPDK        850ns    2.1µs    4.8µs    9.2µs    48µs

Improvement vs Socket:
- io_uring: 4.4x faster
- XDP: 8.8x faster
- DPDK: 21.6x faster
Method      CPU Usage   Context Switches   System Calls
----------------------------------------------------------------
Socket      45%         2.1M/sec           4.2M/sec
io_uring    32%         180k/sec           12k/sec
XDP         28%         0                  0
DPDK        100%*       0                  0

*DPDK uses a dedicated core with busy polling
Packets/sec   Socket   io_uring   XDP   DPDK
-------------------------------------------------------
100k          ✓        ✓          ✓     ✓
500k          ✓        ✓          ✓     ✓
1M            ✓        ✓          ✓     ✓
2M            ⚠️        ✓          ✓     ✓
5M            ✗        ⚠️          ✓     ✓
10M           ✗        ✗          ⚠️     ✓

✓ = <1% packet loss
⚠️ = 1-5% packet loss
✗ = >5% packet loss
After 18 months in production:
For sub-microsecond latency, kernel bypass is non-negotiable.
Technical Writer
The NordVarg Team consists of software engineers at NordVarg specializing in high-performance financial systems and type-safe programming.
Get weekly insights on building high-performance financial systems, latest industry trends, and expert tips delivered straight to your inbox.