Reinforcement Learning for Market Making: A Practical Guide
Building profitable market making strategies using deep reinforcement learning in real-time trading environments
Market making—continuously quoting bid and ask prices to provide liquidity—is a natural fit for reinforcement learning. The agent must balance inventory risk, adverse selection, and profit opportunity in real time. Unlike supervised learning, there's no labeled dataset of "correct" quotes; the agent must learn through interaction.
We've built RL-based market making systems that handle billions in notional volume daily. This post shares the architecture, training methodology, and lessons learned.
Traditional market making uses hand-crafted rules or simple models:
| Approach | Strengths | Weaknesses |
|---|---|---|
| Fixed spread | Simple, predictable | Ignores market conditions |
| Avellaneda-Stoikov | Theoretically grounded | Assumes constant parameters |
| Inventory management rules | Easy to understand | Can't adapt to complex patterns |
| RL-based | Learns optimal policy | Requires careful engineering |
RL can discover strategies that humans wouldn't think of, adapting to market microstructure dynamically.
We formulate market making as a Markov Decision Process (MDP):
1from dataclasses import dataclass
2from typing import Tuple, Optional
3import numpy as np
4import torch
5
@dataclass
class MarketState:
    """State representation for market making.

    One instance is a snapshot of position, quoted prices, book depth,
    market dynamics, time and risk features. ``to_tensor`` flattens it into
    a fixed-length float32 vector for the policy network.
    """

    # Number of book levels per side exported by ``to_tensor``. Deliberately
    # left unannotated so ``@dataclass`` does not turn it into an init field.
    DEPTH_LEVELS = 5

    # Inventory position
    inventory: float  # Current position in units
    cash: float  # Cash balance

    # Market microstructure
    bid_price: float
    ask_price: float
    mid_price: float
    spread: float

    # Order book depth
    bid_depth: np.ndarray  # Volume at each price level
    ask_depth: np.ndarray

    # Market dynamics
    volatility: float  # Recent realized volatility
    trend: float  # Short-term momentum
    trade_imbalance: float  # Buy vs sell pressure

    # Time features
    time_to_close: float  # Remaining trading time
    time_since_last_fill: float

    # Risk metrics
    inventory_risk: float  # VaR of current position
    adverse_selection_score: float  # Likelihood of informed trading

    @staticmethod
    def _top_levels(depth, n_levels: int) -> list:
        """Return the first ``n_levels`` entries of ``depth``, zero-padded.

        A thin book (fewer than ``n_levels`` levels) would otherwise shrink
        the feature vector and break the network's fixed input size.
        """
        top = np.asarray(depth, dtype=np.float64).reshape(-1)[:n_levels]
        padded = np.zeros(n_levels, dtype=np.float64)
        padded[:top.size] = top
        return padded.tolist()

    def to_tensor(self) -> torch.Tensor:
        """Convert state to neural network input.

        Layout: 5 position/price features, ``DEPTH_LEVELS`` bid volumes,
        ``DEPTH_LEVELS`` ask volumes, then 7 dynamics/time/risk features
        (22 values with the default ``DEPTH_LEVELS``). Price features are
        expressed relative to ``mid_price``, which must be non-zero.

        Returns:
            1-D ``torch.float32`` tensor of constant length.
        """
        mid = self.mid_price
        n_levels = self.DEPTH_LEVELS
        features = [
            self.inventory / 1000.0,  # Normalize
            self.cash / 1e6,
            (self.bid_price - mid) / mid,
            (self.ask_price - mid) / mid,
            self.spread / mid,
            *self._top_levels(self.bid_depth, n_levels),  # Top levels, padded
            *self._top_levels(self.ask_depth, n_levels),
            self.volatility,
            self.trend,
            self.trade_imbalance,
            self.time_to_close,
            self.time_since_last_fill / 60.0,  # Normalize to minutes
            self.inventory_risk,
            self.adverse_selection_score,
        ]
        return torch.tensor(features, dtype=torch.float32)

@dataclass
class MarketAction:
    """Action space for market making: one pair of quotes plus cancel flags."""

    # Scaling applied to the normalized policy output. Deliberately left
    # unannotated so ``@dataclass`` does not turn them into init fields.
    MAX_OFFSET_BPS = 50.0
    MAX_SIZE = 1000.0
    CANCEL_THRESHOLD = 0.5

    # Quote parameters
    bid_offset: float  # Offset from mid price (basis points)
    ask_offset: float
    bid_size: float  # Quote size
    ask_size: float

    # Cancel existing orders?
    cancel_bids: bool
    cancel_asks: bool

    @staticmethod
    def from_continuous(action_vector: np.ndarray) -> 'MarketAction':
        """Convert a normalized continuous action vector to a market action.

        The first four components are read, each nominally in [-1, 1]:

        - [0] bid offset -> [-50, 50] basis points
        - [1] ask offset -> [-50, 50] basis points
        - [2] bid size   -> [0, 1000] units (negative values mean "no size")
        - [3] ask size   -> [0, 1000] units

        A Gaussian policy can sample outside [-1, 1] even when its mean is
        tanh-bounded, so every component is clipped first; otherwise quotes
        could exceed the documented offset and size limits.

        Bids are cancelled when component 0 is below -0.5 and asks when
        component 1 is above 0.5.

        Raises:
            IndexError: if ``action_vector`` has fewer than four components.
        """
        bounded = np.clip(np.asarray(action_vector, dtype=np.float64), -1.0, 1.0)
        bid_raw = float(bounded[0])
        ask_raw = float(bounded[1])
        bid_size_raw = float(bounded[2])
        ask_size_raw = float(bounded[3])

        return MarketAction(
            bid_offset=bid_raw * MarketAction.MAX_OFFSET_BPS,
            ask_offset=ask_raw * MarketAction.MAX_OFFSET_BPS,
            bid_size=max(0.0, bid_size_raw) * MarketAction.MAX_SIZE,
            ask_size=max(0.0, ask_size_raw) * MarketAction.MAX_SIZE,
            # Plain bools rather than numpy bools, matching the field types.
            cancel_bids=bool(bid_raw < -MarketAction.CANCEL_THRESHOLD),
            cancel_asks=bool(ask_raw > MarketAction.CANCEL_THRESHOLD),
        )

class MarketMakingReward:
    """
    Reward function balancing P&L, inventory risk, and other objectives.
    """

    def __init__(
        self,
        inventory_penalty: float = 0.01,
        adverse_selection_penalty: float = 0.005,
        spread_capture_bonus: float = 1.0,
        adverse_move_threshold: float = 0.0001,
        terminal_penalty: float = 0.1,
        terminal_window: float = 1.0
    ):
        """
        Args:
            inventory_penalty: Coefficient of the quadratic inventory penalty.
            adverse_selection_penalty: Penalty per unit of adversely
                selected fill size.
            spread_capture_bonus: Multiplier on the captured spread.
            adverse_move_threshold: Relative move against a fill (fraction
                of the fill price, default 1bp) that counts as adverse.
            terminal_penalty: Coefficient of the end-of-session inventory
                penalty.
            terminal_window: The terminal penalty applies while
                ``time_to_close`` is below this value.
        """
        self.inventory_penalty = inventory_penalty
        self.adverse_selection_penalty = adverse_selection_penalty
        self.spread_capture_bonus = spread_capture_bonus
        self.adverse_move_threshold = adverse_move_threshold
        self.terminal_penalty = terminal_penalty
        self.terminal_window = terminal_window

    def calculate(
        self,
        prev_state: "MarketState",
        action: "MarketAction",
        new_state: "MarketState",
        filled_orders: list
    ) -> float:
        """
        Calculate reward for the transition.

        Reward components:
        1. P&L from filled orders (cash change plus inventory change
           marked at the new mid price)
        2. Inventory penalty (quadratic in the new position)
        3. Adverse selection penalty
        4. Spread capture bonus
        5. Terminal penalty for inventory held near the close

        Args:
            prev_state: State before the step.
            action: Action taken (not used by the current components).
            new_state: State after the step.
            filled_orders: Dicts with ``'side'`` (``'buy'``/``'sell'``),
                ``'price'`` and ``'size'``.

        Returns:
            Scalar reward.
        """
        # 1. P&L from executed trades
        pnl = new_state.cash - prev_state.cash
        pnl += (new_state.inventory - prev_state.inventory) * new_state.mid_price
        reward = 0.0 + pnl

        # 2. Inventory risk penalty (quadratic in position)
        reward -= self.inventory_penalty * (new_state.inventory ** 2)

        # 3. Adverse selection penalty
        reward -= self._adverse_selection_cost(new_state.mid_price, filled_orders)

        # 4. Spread capture bonus
        reward += self.spread_capture_bonus * self._spread_captured(filled_orders)

        # 5. Terminal penalty for remaining inventory
        if new_state.time_to_close < self.terminal_window:
            reward -= (
                self.terminal_penalty
                * abs(new_state.inventory)
                * new_state.volatility
            )

        return float(reward)

    def _adverse_selection_cost(self, mid_price: float, filled_orders: list) -> float:
        """Total penalty for fills the mid price has since moved against."""
        cost = 0.0
        for fill in filled_orders:
            # Direction matters: a buy is hurt when the mid drops below the
            # fill price, a sell when the mid rises above it. An absolute
            # distance would also punish favourable moves and almost every
            # fill that was quoted away from the mid.
            if fill['side'] == 'buy':
                move_against_us = fill['price'] - mid_price
            elif fill['side'] == 'sell':
                move_against_us = mid_price - fill['price']
            else:
                continue  # Unknown side: direction cannot be judged
            if move_against_us > fill['price'] * self.adverse_move_threshold:
                cost += self.adverse_selection_penalty * fill['size']
        return cost

    @staticmethod
    def _spread_captured(filled_orders: list) -> float:
        """Mean sell price minus mean buy price when both sides traded."""
        # Zero-size fills move no inventory, so they must not earn a bonus.
        buy_prices = [
            f['price'] for f in filled_orders
            if f['side'] == 'buy' and f['size'] > 0
        ]
        sell_prices = [
            f['price'] for f in filled_orders
            if f['side'] == 'sell' and f['size'] > 0
        ]
        if not (buy_prices and sell_prices):
            return 0.0
        return np.mean(sell_prices) - np.mean(buy_prices)
We use a combination of deep Q-learning and policy gradient methods:
1import torch.nn as nn
2import torch.nn.functional as F
3
class MarketMakingNetwork(nn.Module):
    """
    Actor-Critic network for market making.

    Outputs:
    - Policy (actor): Continuous actions for quote placement
    - Value (critic): State value estimate
    - Auxiliary predictions: Volatility and bid/ask fill probability
    """

    def __init__(
        self,
        state_dim: int,
        action_dim: int,
        hidden_dim: int = 256,
        n_layers: int = 3,
        dropout: float = 0.1
    ):
        """
        Args:
            state_dim: Size of the state feature vector.
            action_dim: Number of continuous action components.
            hidden_dim: Width of each shared trunk layer.
            n_layers: Number of shared trunk layers; 0 feeds the raw state
                straight into the heads.
            dropout: Dropout probability after each trunk layer.
        """
        super().__init__()

        # Shared feature extraction
        trunk = []
        width = state_dim
        for _ in range(n_layers):
            trunk += [
                nn.Linear(width, hidden_dim),
                nn.LayerNorm(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            width = hidden_dim
        self.shared_network = nn.Sequential(*trunk)

        # Heads take whatever the trunk actually emits. With n_layers == 0
        # that is state_dim, not hidden_dim, so sizing the heads from
        # hidden_dim would fail with a shape mismatch on the first forward.
        feature_dim = width
        head_dim = hidden_dim // 2

        # Actor head (policy)
        self.actor_mean = nn.Sequential(
            nn.Linear(feature_dim, head_dim),
            nn.ReLU(),
            nn.Linear(head_dim, action_dim),
            nn.Tanh()  # Bound the action mean to [-1, 1]
        )

        # State-independent log standard deviation, one per action component
        self.actor_log_std = nn.Parameter(torch.zeros(action_dim))

        # Critic head (value function)
        self.critic = nn.Sequential(
            nn.Linear(feature_dim, head_dim),
            nn.ReLU(),
            nn.Linear(head_dim, 1)
        )

        # Auxiliary prediction heads
        self.volatility_pred = nn.Sequential(
            nn.Linear(feature_dim, 1),
            nn.Softplus()  # Keep the volatility prediction positive
        )

        self.fill_probability_pred = nn.Sequential(
            nn.Linear(feature_dim, 2),  # Bid and ask fill prob
            nn.Sigmoid()
        )

    def forward(self, state: torch.Tensor):
        """
        Forward pass through network.

        Args:
            state: State features whose last dimension is ``state_dim``.

        Returns:
            Dict with keys ``'action_mean'``, ``'action_std'``, ``'value'``,
            ``'volatility'`` and ``'fill_probability'``.
        """
        features = self.shared_network(state)

        return {
            'action_mean': self.actor_mean(features),
            'action_std': torch.exp(self.actor_log_std),
            'value': self.critic(features),
            'volatility': self.volatility_pred(features),
            'fill_probability': self.fill_probability_pred(features)
        }

    def sample_action(self, state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Sample action from policy.

        The sample comes from an unbounded Gaussian around the tanh-bounded
        mean, so individual components can fall outside [-1, 1].

        Returns:
            action: Sampled action
            log_prob: Log probability of action, summed over components
        """
        output = self.forward(state)

        policy = torch.distributions.Normal(
            output['action_mean'],
            output['action_std']
        )

        action = policy.sample()
        log_prob = policy.log_prob(action).sum(dim=-1)

        return action, log_prob


class PPOTrainer:
    """
    Proximal Policy Optimization for market making.

    Optimizes the clipped PPO surrogate together with a value loss, an
    entropy bonus and the network's auxiliary prediction losses.
    """

    def __init__(
        self,
        network: "MarketMakingNetwork",
        learning_rate: float = 3e-4,
        clip_epsilon: float = 0.2,
        value_loss_coef: float = 0.5,
        entropy_coef: float = 0.01,
        aux_loss_coef: float = 0.1,
        n_epochs: int = 10,
        max_grad_norm: float = 0.5
    ):
        """
        Args:
            network: Module whose forward pass returns a dict with
                ``'action_mean'``, ``'action_std'``, ``'value'``,
                ``'volatility'`` and ``'fill_probability'``.
            learning_rate: Adam learning rate.
            clip_epsilon: PPO probability-ratio clipping range.
            value_loss_coef: Weight of the value loss.
            entropy_coef: Weight of the entropy bonus.
            aux_loss_coef: Weight of the auxiliary losses.
            n_epochs: Optimization passes over each batch (at least 1).
            max_grad_norm: Gradient-norm clipping threshold.

        Raises:
            ValueError: if ``n_epochs`` is less than 1.
        """
        if n_epochs < 1:
            raise ValueError("n_epochs must be at least 1")

        self.network = network
        self.optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)

        self.clip_epsilon = clip_epsilon
        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef
        self.aux_loss_coef = aux_loss_coef
        self.n_epochs = n_epochs
        self.max_grad_norm = max_grad_norm

    def update(self, rollout_buffer):
        """
        Update policy using collected rollouts.

        Args:
            rollout_buffer: Mapping with tensor entries ``'states'``,
                ``'actions'``, ``'log_probs'``, ``'returns'``,
                ``'advantages'``, ``'volatility'`` and ``'fills'``.

        Returns:
            Dict with ``'policy_loss'``, ``'value_loss'``, ``'entropy'`` and
            ``'aux_loss'`` from the final epoch.
        """
        states = rollout_buffer['states']
        actions = rollout_buffer['actions'].detach()

        # Rollout statistics are constants of the old policy. If they still
        # carry an autograd graph, the first backward() frees it and every
        # later epoch fails with "backward through the graph a second time".
        old_log_probs = rollout_buffer['log_probs'].detach()
        returns = rollout_buffer['returns'].detach()
        advantages = rollout_buffer['advantages'].detach()

        # Auxiliary targets
        true_volatility = rollout_buffer['volatility']
        true_fills = rollout_buffer['fills']

        # Multiple epochs over the data
        for _ in range(self.n_epochs):
            output = self.network(states)

            # Log probs of the stored actions under the current policy
            policy = torch.distributions.Normal(
                output['action_mean'],
                output['action_std']
            )
            new_log_probs = policy.log_prob(actions).sum(dim=-1)
            entropy = policy.entropy().sum(dim=-1)

            # PPO clipped objective
            ratio = torch.exp(new_log_probs - old_log_probs)
            clipped_ratio = torch.clamp(
                ratio,
                1 - self.clip_epsilon,
                1 + self.clip_epsilon
            )
            surrogate = torch.min(ratio * advantages, clipped_ratio * advantages)
            policy_loss = -surrogate.mean()

            # Value loss. squeeze(-1) drops only the trailing unit dim; a
            # bare squeeze() would also collapse a batch of one to a scalar.
            value_pred = output['value'].squeeze(-1)
            value_loss = F.mse_loss(value_pred, returns)

            # Entropy bonus (exploration)
            entropy_loss = -entropy.mean()

            # Auxiliary losses (help with learning)
            vol_loss = F.mse_loss(output['volatility'].squeeze(-1), true_volatility)
            fill_loss = F.binary_cross_entropy(output['fill_probability'], true_fills)
            aux_loss = vol_loss + fill_loss

            total_loss = (
                policy_loss +
                self.value_loss_coef * value_loss +
                self.entropy_coef * entropy_loss +
                self.aux_loss_coef * aux_loss
            )

            # Optimization step
            self.optimizer.zero_grad()
            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(
                self.network.parameters(), self.max_grad_norm
            )
            self.optimizer.step()

        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'entropy': entropy.mean().item(),
            'aux_loss': aux_loss.item()
        }
We need a realistic simulation environment that captures market dynamics:
1class MarketSimulator:
2 """
3 Realistic market simulator for training market making agents.
4
5 Models:
6 - Price dynamics (random walk with drift and jumps)
7 - Order book dynamics
8 - Fill probabilities based on queue position
9 - Adverse selection from informed traders
10 """
11
12 def __init__(self, config: dict):
13 self.config = config
14 self.reset()
15
16 def reset(self) -> MarketState:
17 """Reset to initial state"""
18 self.time = 0
19 self.mid_price = 100.0
20 self.inventory = 0.0
21 self.cash = 0.0
22
23 # Initialize order book
24 self.bids = self._initialize_orderbook(side='bid')
25 self.asks = self._initialize_orderbook(side='ask')
26
27 # Market dynamics state
28 self.volatility = 0.0002 # 2 bps per tick
29 self.trend = 0.0
30 self.informed_trader_active = False
31
32 return self._get_state()
33
34 def step(self, action: MarketAction) -> Tuple[MarketState, float, bool]:
35 """
36 Execute one simulation step.
37
38 Returns:
39 next_state: New market state
40 reward: Reward for this transition
41 done: Episode termination flag
42 """
43 prev_state = self._get_state()
44
45 # 1. Submit quotes based on action
46 self._submit_quotes(action)
47
48 # 2. Simulate market evolution
49 self._evolve_market()
50
51 # 3. Process order fills
52 filled_orders = self._process_fills()
53
54 # 4. Update inventory and cash
55 self._update_portfolio(filled_orders)
56
57 # 5. Get new state
58 new_state = self._get_state()
59
60 # 6. Calculate reward
61 reward = self.reward_fn.calculate(
62 prev_state, action, new_state, filled_orders
63 )
64
65 # 7. Check termination
66 self.time += 1
67 done = self.time >= self.config['max_steps']
68
69 return new_state, reward, done
70
71 def _evolve_market(self):
72 """
73 Simulate market price evolution.
74 Combines:
75 - Random walk
76 - Mean reversion
77 - Jump process
78 - Informed trading
79 """
80 dt = 1.0 # Time step
81
82 # Random walk component
83 noise = np.random.randn() * self.volatility * np.sqrt(dt)
84
85 # Mean reversion (stable around 100)
86 mean_reversion = -0.0001 * (self.mid_price - 100.0)
87
88 # Trend component
89 trend = self.trend * dt
90
91 # Jump process (rare events)
92 if np.random.rand() < 0.01: # 1% chance
93 jump = np.random.randn() * self.volatility * 10
94 noise += jump
95
96 # Informed trader impact
97 if self.informed_trader_active:
98 # Informed trader pushes price in their direction
99 informed_impact = np.sign(self.trend) * self.volatility * 5
100 noise += informed_impact
101
102 # Update mid price
103 self.mid_price += noise + mean_reversion + trend
104
105 # Update volatility (GARCH-like)
106 self.volatility = 0.9 * self.volatility + 0.1 * abs(noise)
107
108 # Update trend (AR process)
109 self.trend = 0.95 * self.trend + 0.05 * np.random.randn() * self.volatility
110
111 # Informed trader activation
112 if np.random.rand() < 0.05: # 5% chance
113 self.informed_trader_active = True
114 self.trend = np.random.choice([-1, 1]) * self.volatility * 3
115 else:
116 self.informed_trader_active = False
117
118 def _process_fills(self) -> list:
119 """
120 Determine which orders get filled based on:
121 - Queue position
122 - Order book depth
123 - Recent market activity
124 """
125 filled_orders = []
126
127 # Check bid fills (when market sells)
128 if self.our_bid_quote:
129 fill_prob = self._calculate_fill_probability(
130 self.our_bid_quote,
131 side='bid'
132 )
133
134 if np.random.rand() < fill_prob:
135 fill_size = min(
136 self.our_bid_quote['size'],
137 self._get_market_sell_size()
138 )
139
140 filled_orders.append({
141 'side': 'buy',
142 'price': self.our_bid_quote['price'],
143 'size': fill_size
144 })
145
146 # Check ask fills (when market buys)
147 if self.our_ask_quote:
148 fill_prob = self._calculate_fill_probability(
149 self.our_ask_quote,
150 side='ask'
151 )
152
153 if np.random.rand() < fill_prob:
154 fill_size = min(
155 self.our_ask_quote['size'],
156 self._get_market_buy_size()
157 )
158
159 filled_orders.append({
160 'side': 'sell',
161 'price': self.our_ask_quote['price'],
162 'size': fill_size
163 })
164
165 return filled_orders
166
167 def _calculate_fill_probability(self, quote: dict, side: str) -> float:
168 """
169 Calculate probability of fill based on:
170 - Distance from mid price
171 - Queue position
172 - Market activity
173 """
174 price_distance = abs(quote['price'] - self.mid_price)
175 relative_distance = price_distance / self.mid_price
176
177 # Base probability from distance
178 # Closer quotes have higher fill probability
179 base_prob = np.exp(-100 * relative_distance)
180
181 # Adjust for queue position
182 queue_position = self._get_queue_position(quote, side)
183 queue_factor = 1.0 / (1.0 + queue_position / 1000.0)
184
185 # Adjust for market activity
186 activity_factor = 1.0 + self.volatility * 100
187
188 # Adverse selection: informed traders more likely to hit us
189 if self.informed_trader_active:
190 # If informed trader is buying and we're offering, higher fill prob
191 if (side == 'ask' and self.trend > 0) or \
192 (side == 'bid' and self.trend < 0):
193 adverse_selection_factor = 2.0
194 else:
195 adverse_selection_factor = 0.5
196 else:
197 adverse_selection_factor = 1.0
198
199 fill_prob = base_prob * queue_factor * activity_factor * adverse_selection_factor
200
201 return min(fill_prob, 1.0)
class MarketMakingTrainer:
    """
    Main training loop for market making agent.
    """

    def __init__(self, config: dict):
        """
        Args:
            config: Reads ``'env'``, ``'state_dim'``, ``'action_dim'``,
                ``'ppo'``, ``'buffer_size'`` and (in ``train``)
                ``'batch_size'``.
        """
        self.config = config

        # Initialize components
        self.env = MarketSimulator(config['env'])
        self.network = MarketMakingNetwork(
            state_dim=config['state_dim'],
            action_dim=config['action_dim']
        )
        self.trainer = PPOTrainer(self.network, **config['ppo'])

        # Experience buffer
        self.buffer = RolloutBuffer(config['buffer_size'])

    def train(self, n_episodes: int):
        """
        Train agent for specified number of episodes.
        """
        # No update has run until the buffer holds a full batch; without
        # this initial value the logging branch below would read an
        # unassigned local on episode 0.
        metrics = None

        for episode in range(n_episodes):
            # Collect rollout
            rollout = self._collect_rollout()

            # Add to buffer
            self.buffer.add(rollout)

            # Update policy
            if self.buffer.size() >= self.config['batch_size']:
                batch = self.buffer.sample(self.config['batch_size'])
                metrics = self.trainer.update(batch)

            # Log metrics once at least one update has produced them
            if episode % 10 == 0 and metrics is not None:
                self._log_metrics(episode, metrics, rollout)

    def _collect_rollout(self) -> dict:
        """
        Collect one episode of experience.

        Returns:
            Dict with ``'states'``, ``'actions'``, ``'log_probs'``,
            ``'returns'``, ``'advantages'``, the auxiliary targets
            ``'volatility'`` and ``'fills'`` that ``PPOTrainer.update``
            reads, and ``'total_reward'``.
        """
        state = self.env.reset()

        states, actions, rewards, values = [], [], [], []
        log_probs = []
        volatility_targets, fill_targets = [], []

        done = False
        while not done:
            state_tensor = state.to_tensor()

            # Rollout statistics are constants for the later update, so no
            # autograd graph is kept for the log-prob or the value estimate.
            with torch.no_grad():
                action_tensor, log_prob = self.network.sample_action(state_tensor)
                value = self.network(state_tensor)['value'].item()

            # Convert to market action
            action = MarketAction.from_continuous(action_tensor.numpy())

            # Execute in environment
            next_state, reward, done = self.env.step(action)

            # Store experience
            states.append(state_tensor)
            actions.append(action_tensor)
            rewards.append(reward)
            values.append(value)
            log_probs.append(log_prob)

            # Auxiliary targets. Volatility observed after the step is the
            # target for the prediction made from the pre-step state.
            volatility_targets.append(float(next_state.volatility))

            # NOTE(review): env.step() does not report fills, so they are
            # inferred from the sign of the inventory change. Equal-size
            # fills on both sides in one step net to zero and are missed;
            # confirm whether the environment can expose fills directly.
            inventory_change = next_state.inventory - state.inventory
            fill_targets.append([
                float(inventory_change > 0),  # bid filled
                float(inventory_change < 0),  # ask filled
            ])

            state = next_state

        # Calculate returns and advantages
        returns = self._calculate_returns(rewards, values)
        advantages = returns - torch.tensor(values)

        return {
            'states': torch.stack(states),
            'actions': torch.stack(actions),
            'log_probs': torch.stack(log_probs),
            'returns': returns,
            'advantages': advantages,
            'volatility': torch.tensor(volatility_targets, dtype=torch.float32),
            'fills': torch.tensor(fill_targets, dtype=torch.float32),
            'total_reward': sum(rewards)
        }
class ProductionMarketMaker:
    """
    Production market making agent with safety checks.
    """

    def __init__(self, model_path: str, config: dict):
        """
        Args:
            model_path: Passed to ``_load_model`` to obtain the policy network.
            config: Stored on the instance and passed to ``RiskManager``.
        """
        self.network = self._load_model(model_path)
        # Inference only: switch the module to eval mode so training-time
        # layers such as Dropout do not inject noise into live quotes.
        self.network.eval()

        self.config = config
        self.risk_manager = RiskManager(config)

        # Performance tracking
        self.performance = PerformanceTracker()

    async def run(self, market_data_stream):
        """
        Run market making strategy in production.

        For every item of ``market_data_stream``: build the state, sample an
        action from the policy, submit quotes if the risk manager approves,
        update performance tracking, and request a pause when the one-hour
        Sharpe ratio is negative.
        """
        async for market_data in market_data_stream:
            # Construct state
            state = self._build_state(market_data)

            # Get action from policy.
            # NOTE(review): this samples from the stochastic policy, so
            # exploration noise reaches live quotes; confirm that is intended.
            with torch.no_grad():
                action_tensor, _ = self.network.sample_action(state.to_tensor())

            action = MarketAction.from_continuous(action_tensor.numpy())

            # Risk checks
            if await self.risk_manager.should_execute(action, state):
                # Submit quotes
                await self._submit_quotes(action, state)

            # Monitor performance
            await self.performance.update(state, action)

            # Safety: pause if performance degrades.
            # NOTE(review): the loop keeps consuming the stream after this
            # call; whether quoting actually stops depends on _pause_trading
            # and the risk manager, which are not visible here.
            if self.performance.sharpe_ratio_1h < 0:
                await self._pause_trading("Negative Sharpe ratio")
RL for market making works, but requires careful engineering. The key is combining powerful learning algorithms with robust risk management and realistic simulation.
Success comes from treating this as a production systems problem, not just an RL research project.
Interested in RL-based trading systems? Contact us to discuss building adaptive market making strategies.
Technical Writer
NordVarg Team is a software engineer at NordVarg specializing in high-performance financial systems and type-safe programming.
Get weekly insights on building high-performance financial systems, latest industry trends, and expert tips delivered straight to your inbox.