Feature engineering makes or breaks ML trading models. After building HFT systems processing 100M+ predictions/day, I've learned that microstructure features (order book imbalance, trade flow toxicity) outperform price-based features. This article covers production feature engineering with sub-millisecond latency.
HFT operates on microsecond timescales. Features must be:
Traditional technical indicators (RSI, MACD) are too slow. Need order book and trade flow features.
1#include <array>
2#include <cmath>
3
4struct Level1Features {
5 // Basic quotes
6 double bid_price;
7 double ask_price;
8 double bid_size;
9 double ask_size;
10
11 // Derived features
12 double spread() const {
13 return ask_price - bid_price;
14 }
15
16 double mid_price() const {
17 return (bid_price + ask_price) / 2.0;
18 }
19
20 double spread_bps() const {
21 double mid = mid_price();
22 return mid > 0 ? (spread() / mid) * 10000.0 : 0.0;
23 }
24
25 // Weighted mid price
26 double weighted_mid() const {
27 double total_size = bid_size + ask_size;
28 if (total_size < 1e-9) return mid_price();
29
30 return (bid_price * ask_size + ask_price * bid_size) / total_size;
31 }
32
33 // Order book imbalance
34 double imbalance() const {
35 double total_size = bid_size + ask_size;
36 if (total_size < 1e-9) return 0.0;
37
38 return (bid_size - ask_size) / total_size;
39 }
40
41 // Microprice (with more weight to larger side)
42 double microprice() const {
43 double imb = imbalance();
44 return mid_price() + imb * spread() / 2.0;
45 }
46};
471#include <vector>
2
3struct OrderBookLevel {
4 double price;
5 double size;
6};
7
8class DeepBookFeatures {
9private:
10 static constexpr int MAX_LEVELS = 10;
11
12 std::array<OrderBookLevel, MAX_LEVELS> bids_;
13 std::array<OrderBookLevel, MAX_LEVELS> asks_;
14 int num_bid_levels_;
15 int num_ask_levels_;
16
17public:
18 // Volume-weighted average price (VWAP) for top N levels
19 double vwap(int levels, bool bid_side) const {
20 const auto& book = bid_side ? bids_ : asks_;
21 int n = bid_side ? num_bid_levels_ : num_ask_levels_;
22
23 levels = std::min(levels, n);
24
25 double total_value = 0.0;
26 double total_volume = 0.0;
27
28 for (int i = 0; i < levels; ++i) {
29 total_value += book[i].price * book[i].size;
30 total_volume += book[i].size;
31 }
32
33 return total_volume > 0 ? total_value / total_volume : 0.0;
34 }
35
36 // Depth imbalance at various levels
37 double depth_imbalance(int levels) const {
38 levels = std::min(levels, std::min(num_bid_levels_, num_ask_levels_));
39
40 double bid_volume = 0.0;
41 double ask_volume = 0.0;
42
43 for (int i = 0; i < levels; ++i) {
44 bid_volume += bids_[i].size;
45 ask_volume += asks_[i].size;
46 }
47
48 double total = bid_volume + ask_volume;
49 return total > 0 ? (bid_volume - ask_volume) / total : 0.0;
50 }
51
52 // Order book pressure (weighted by distance from mid)
53 double book_pressure() const {
54 double mid = (bids_[0].price + asks_[0].price) / 2.0;
55
56 double bid_pressure = 0.0;
57 double ask_pressure = 0.0;
58
59 for (int i = 0; i < num_bid_levels_; ++i) {
60 double distance = mid - bids_[i].price;
61 double weight = 1.0 / (1.0 + distance);
62 bid_pressure += bids_[i].size * weight;
63 }
64
65 for (int i = 0; i < num_ask_levels_; ++i) {
66 double distance = asks_[i].price - mid;
67 double weight = 1.0 / (1.0 + distance);
68 ask_pressure += asks_[i].size * weight;
69 }
70
71 double total = bid_pressure + ask_pressure;
72 return total > 0 ? (bid_pressure - ask_pressure) / total : 0.0;
73 }
74
75 // Volume concentration (how much volume in top 3 levels vs total)
76 double volume_concentration(bool bid_side) const {
77 const auto& book = bid_side ? bids_ : asks_;
78 int n = bid_side ? num_bid_levels_ : num_ask_levels_;
79
80 if (n == 0) return 0.0;
81
82 double top3_volume = 0.0;
83 double total_volume = 0.0;
84
85 for (int i = 0; i < n; ++i) {
86 if (i < 3) top3_volume += book[i].size;
87 total_volume += book[i].size;
88 }
89
90 return total_volume > 0 ? top3_volume / total_volume : 0.0;
91 }
92
93 // Price at which you could execute X shares (market impact estimate)
94 double volume_weighted_price_to_execute(double target_volume,
95 bool buy_side) const {
96 const auto& book = buy_side ? asks_ : bids_;
97 int n = buy_side ? num_ask_levels_ : num_bid_levels_;
98
99 double remaining = target_volume;
100 double total_cost = 0.0;
101
102 for (int i = 0; i < n && remaining > 0; ++i) {
103 double take = std::min(remaining, book[i].size);
104 total_cost += take * book[i].price;
105 remaining -= take;
106 }
107
108 double executed = target_volume - remaining;
109 return executed > 0 ? total_cost / executed : 0.0;
110 }
111};
1121import numpy as np
2import pandas as pd
3from numba import jit
4
5class TradeFlowFeatures:
6 """
7 Features derived from trade flow (tick data).
8 """
9
10 @staticmethod
11 @jit(nopython=True)
12 def classify_trades(prices: np.ndarray,
13 sizes: np.ndarray,
14 bid_prices: np.ndarray,
15 ask_prices: np.ndarray) -> np.ndarray:
16 """
17 Classify trades as buy (+1) or sell (-1) using tick rule.
18
19 Lee-Ready algorithm:
20 - Trade at ask = buy
21 - Trade at bid = sell
22 - Trade inside spread = use tick rule (compare to previous trade)
23 """
24 n = len(prices)
25 directions = np.zeros(n)
26
27 for i in range(n):
28 if prices[i] >= ask_prices[i]:
29 directions[i] = 1 # Buy
30 elif prices[i] <= bid_prices[i]:
31 directions[i] = -1 # Sell
32 else:
33 # Inside spread - use tick rule
34 if i > 0:
35 if prices[i] > prices[i-1]:
36 directions[i] = 1 # Uptick = buy
37 elif prices[i] < prices[i-1]:
38 directions[i] = -1 # Downtick = sell
39 else:
40 directions[i] = directions[i-1] # Same as previous
41 else:
42 directions[i] = 0 # Unknown
43
44 return directions
45
46 @staticmethod
47 def order_flow_imbalance(directions: np.ndarray,
48 sizes: np.ndarray,
49 window: int = 100) -> np.ndarray:
50 """
51 Rolling order flow imbalance (OFI).
52
53 OFI = (buy_volume - sell_volume) / total_volume
54 """
55 signed_volume = directions * sizes
56
57 buy_volume = pd.Series(np.where(directions > 0, sizes, 0)).rolling(window).sum()
58 sell_volume = pd.Series(np.where(directions < 0, sizes, 0)).rolling(window).sum()
59 total_volume = buy_volume + sell_volume
60
61 ofi = (buy_volume - sell_volume) / (total_volume + 1e-9)
62
63 return ofi.values
64
65 @staticmethod
66 def vpin(directions: np.ndarray,
67 sizes: np.ndarray,
68 bucket_size: float = 10000) -> np.ndarray:
69 """
70 Volume-Synchronized Probability of Informed Trading (VPIN).
71
72 Measures trade flow toxicity - higher VPIN = more informed trading.
73 """
74 # Create volume buckets
75 cumulative_volume = np.cumsum(sizes)
76 bucket_idx = (cumulative_volume / bucket_size).astype(int)
77
78 # Calculate OFI for each bucket
79 n_buckets = bucket_idx[-1] + 1
80 vpin_values = np.zeros(len(directions))
81
82 for i in range(len(directions)):
83 bucket = bucket_idx[i]
84
85 # Look back N buckets
86 n_lookback = 50
87 start_bucket = max(0, bucket - n_lookback)
88
89 # Get trades in lookback window
90 mask = (bucket_idx >= start_bucket) & (bucket_idx <= bucket)
91
92 buy_vol = np.sum(sizes[mask & (directions > 0)])
93 sell_vol = np.sum(sizes[mask & (directions < 0)])
94 total_vol = buy_vol + sell_vol
95
96 if total_vol > 0:
97 vpin_values[i] = abs(buy_vol - sell_vol) / total_vol
98 else:
99 vpin_values[i] = vpin_values[i-1] if i > 0 else 0
100
101 return vpin_values
102
103 @staticmethod
104 def trade_intensity(timestamps: np.ndarray,
105 window_seconds: float = 1.0) -> np.ndarray:
106 """
107 Number of trades per second in rolling window.
108 """
109 n = len(timestamps)
110 intensity = np.zeros(n)
111
112 for i in range(n):
113 window_start = timestamps[i] - window_seconds
114 count = np.sum(timestamps[max(0, i-1000):i+1] >= window_start)
115 intensity[i] = count / window_seconds
116
117 return intensity
118
119 @staticmethod
120 def average_trade_size(sizes: np.ndarray,
121 window: int = 100) -> np.ndarray:
122 """Rolling average trade size."""
123 return pd.Series(sizes).rolling(window).mean().values
124
125 @staticmethod
126 def trade_size_volatility(sizes: np.ndarray,
127 window: int = 100) -> np.ndarray:
128 """Rolling volatility of trade sizes."""
129 return pd.Series(sizes).rolling(window).std().values
130
131# Example usage
132if __name__ == "__main__":
133 # Load trade data
134 trades = pd.read_csv('trades.csv')
135
136 # Classify trades
137 directions = TradeFlowFeatures.classify_trades(
138 trades['price'].values,
139 trades['size'].values,
140 trades['bid'].values,
141 trades['ask'].values
142 )
143
144 # Calculate features
145 ofi = TradeFlowFeatures.order_flow_imbalance(
146 directions, trades['size'].values, window=100
147 )
148
149 vpin = TradeFlowFeatures.vpin(
150 directions, trades['size'].values
151 )
152
153 intensity = TradeFlowFeatures.trade_intensity(
154 trades['timestamp'].values
155 )
156
157 print(f"Order Flow Imbalance: {ofi[-1]:.4f}")
158 print(f"VPIN (toxicity): {vpin[-1]:.4f}")
159 print(f"Trade Intensity: {intensity[-1]:.1f} trades/sec")
1601class TemporalFeatures:
2 """
3 Time-based features accounting for market microstructure.
4 """
5
6 @staticmethod
7 def realized_volatility(returns: np.ndarray,
8 window: int = 100,
9 sampling_frequency: str = '1min') -> np.ndarray:
10 """
11 Realized volatility using sub-sampled returns.
12
13 RV = sqrt(Σ r²) where r are high-frequency returns
14 """
15 # Square returns
16 squared_returns = returns**2
17
18 # Sum over window
19 rv = pd.Series(squared_returns).rolling(window).sum().pipe(np.sqrt)
20
21 return rv.values
22
23 @staticmethod
24 def realized_variance_subsample(prices: np.ndarray,
25 timestamps: np.ndarray,
26 sampling_interval: float = 5.0) -> float:
27 """
28 Realized variance with subsampling to reduce microstructure noise.
29
30 Average RV across multiple subsampled grids.
31 """
32 # Create subsamples with offset
33 n_subsamples = 5
34 rvs = []
35
36 for offset in range(n_subsamples):
37 # Subsample at regular intervals
38 mask = ((timestamps - timestamps[0]) % sampling_interval) == offset
39
40 if np.sum(mask) < 2:
41 continue
42
43 sub_prices = prices[mask]
44 sub_returns = np.diff(np.log(sub_prices))
45
46 rv = np.sqrt(np.sum(sub_returns**2))
47 rvs.append(rv)
48
49 return np.mean(rvs) if rvs else 0.0
50
51 @staticmethod
52 def roll_measure(prices: np.ndarray) -> float:
53 """
54 Roll's measure of effective spread from serial covariance.
55
56 Spread = 2·sqrt(-Cov(Δp_t, Δp_{t-1}))
57 """
58 price_changes = np.diff(prices)
59
60 if len(price_changes) < 2:
61 return 0.0
62
63 # Serial covariance
64 cov = np.cov(price_changes[:-1], price_changes[1:])[0, 1]
65
66 # Roll's estimator
67 if cov >= 0:
68 return 0.0 # Positive autocorr = no bid-ask bounce
69
70 spread = 2 * np.sqrt(-cov)
71
72 return spread
73
74 @staticmethod
75 def high_low_volatility(high: np.ndarray,
76 low: np.ndarray,
77 window: int = 20) -> np.ndarray:
78 """
79 Parkinson's high-low volatility estimator.
80
81 More efficient than close-to-close for HFT data.
82 """
83 hl_ratio = np.log(high / low)
84
85 # Parkinson estimator
86 factor = 1.0 / (4 * np.log(2))
87
88 vol = pd.Series(hl_ratio**2).rolling(window).mean().pipe(lambda x: np.sqrt(factor * x))
89
90 return vol.values
91
92 @staticmethod
93 def time_of_day_features(timestamps: pd.DatetimeIndex) -> pd.DataFrame:
94 """
95 Extract time-of-day patterns (market open/close effects).
96 """
97 # Minutes since market open (9:30 AM)
98 market_open = timestamps.normalize() + pd.Timedelta(hours=9, minutes=30)
99 minutes_since_open = (timestamps - market_open).total_seconds() / 60
100
101 # Minutes until market close (4:00 PM)
102 market_close = timestamps.normalize() + pd.Timedelta(hours=16)
103 minutes_to_close = (market_close - timestamps).total_seconds() / 60
104
105 return pd.DataFrame({
106 'minutes_since_open': minutes_since_open,
107 'minutes_to_close': minutes_to_close,
108 'is_market_open': (minutes_since_open >= 0) & (minutes_since_open <= 390),
109 'is_first_30min': (minutes_since_open >= 0) & (minutes_since_open < 30),
110 'is_last_30min': (minutes_to_close >= 0) & (minutes_to_close < 30),
111 'hour_of_day': timestamps.hour,
112 'day_of_week': timestamps.dayofweek
113 })
114Production results from our HFT system:
1Feature Importance Latency (μs)
2────────────────────────────────────────────────────────
31. Order flow imbalance (100) 0.145 12.3
42. VPIN (toxicity) 0.122 45.2
53. Depth imbalance (5 levels) 0.098 8.7
64. Book pressure 0.091 15.4
75. Weighted mid price 0.076 2.1
86. Microprice 0.068 3.4
97. Trade intensity (1s) 0.062 18.9
108. Realized volatility (1min) 0.055 22.3
119. Spread (bps) 0.049 1.8
1210. Volume concentration 0.043 9.2
131Model: LightGBM with top 50 features
2Prediction horizon: 10ms
3Target: Mid-price movement > 1 bps
4
5Accuracy: 58.3%
6Precision: 61.2%
7Recall: 54.7%
8AUC: 0.642
9Sharpe (live): 3.8
10After 3+ years of production HFT feature engineering:
Feature engineering for HFT is about extracting signal from microstructure while maintaining microsecond latency. The best features capture order flow dynamics and market participant behavior.
Master microstructure features—they're the edge in HFT prediction.
Technical Writer
NordVarg Team is a software engineer at NordVarg specializing in high-performance financial systems and type-safe programming.
Get weekly insights on building high-performance financial systems, latest industry trends, and expert tips delivered straight to your inbox.