Time Series Forecasting for Trading: From ARIMA to Transformers
Modern approaches to financial time series forecasting, combining classical methods with deep learning for robust predictions
Financial time series forecasting is one of the most challenging ML problems: non-stationary data, low signal-to-noise ratios, regime changes, and adversarial dynamics (other traders learning and adapting). Yet accurate forecasting, even with a slight edge, can be enormously profitable.
We've built forecasting systems handling everything from high-frequency tick data to multi-year macro predictions. This post covers what works, what doesn't, and how to combine classical and modern approaches effectively.
Different time horizons require different approaches:
| Horizon | Primary Drivers | Best Methods | Key Challenges |
|---|---|---|---|
| Microseconds | Market microstructure | Point processes, ML on L2 data | Latency, data quality |
| Seconds-Minutes | Order flow, momentum | LSTM, transformers | Non-stationarity |
| Hours-Days | Technical patterns, news | Ensemble methods | Regime changes |
| Weeks-Months | Fundamentals, sentiment | Hybrid models | Sparse signal |
| Quarters-Years | Macro factors | Economic models + ML | Structural breaks |
Before deep learning, understand the classics:
1import numpy as np
2import pandas as pd
3from statsmodels.tsa.arima.model import ARIMA
4from statsmodels.tsa.statespace.sarimax import SARIMAX
5from arch import arch_model
6import warnings
7warnings.filterwarnings('ignore')
8
class ClassicalForecaster:
    """
    Classical time series methods as baseline and ensemble components.

    Fitted models are kept in ``self.models`` under the keys ``'arima'``
    (mean model) and ``'garch'`` (volatility model). Fitting again replaces
    the previously stored model for that key.
    """

    def __init__(self):
        self.models = {}

    def fit_arima(self, returns: pd.Series, order=(1, 0, 1)):
        """
        ARIMA for mean forecasting.

        Despite simplicity, ARIMA components often outperform
        complex models for short-term forecasting.

        Args:
            returns: Series the ARIMA model is fitted on.
            order: ``(p, d, q)`` order passed straight to ``ARIMA``.

        Returns:
            Dict with the fitted model's ``'aic'``, ``'bic'`` and ``'params'``.
        """
        fitted = ARIMA(returns, order=order).fit()
        self.models['arima'] = fitted

        return {
            'aic': fitted.aic,
            'bic': fitted.bic,
            'params': fitted.params
        }

    def fit_garch(self, returns: pd.Series, p=1, q=1):
        """
        GARCH for volatility forecasting.

        Still the gold standard for volatility prediction.
        Deep learning hasn't convincingly beaten it yet.

        Args:
            returns: Series the GARCH model is fitted on (multiplied by 100
                before fitting; ``forecast`` divides by 100 again).
            p: GARCH ``p`` order passed to ``arch_model``.
            q: GARCH ``q`` order passed to ``arch_model``.

        Returns:
            Dict with the fitted model's ``'aic'``, ``'bic'`` and ``'params'``.
        """
        # Scale returns to percentage
        returns_pct = returns * 100

        model = arch_model(
            returns_pct,
            vol='Garch',
            p=p,
            q=q,
            dist='t'  # Student-t for fat tails
        )

        fitted = model.fit(disp='off')
        self.models['garch'] = fitted

        return {
            'aic': fitted.aic,
            'bic': fitted.bic,
            'params': fitted.params
        }

    def forecast(self, horizon: int = 1):
        """
        Generate forecasts from fitted models.

        Args:
            horizon: Number of steps ahead to forecast.

        Returns:
            Dict that contains ``'mean'`` when an ARIMA model has been fitted
            and ``'volatility'`` when a GARCH model has been fitted. It is
            empty when neither model is available.
        """
        forecasts = {}

        # ARIMA forecast (mean)
        if 'arima' in self.models:
            forecasts['mean'] = self.models['arima'].forecast(steps=horizon)

        # GARCH forecast (volatility)
        if 'garch' in self.models:
            garch_forecast = self.models['garch'].forecast(horizon=horizon)
            # Last row of the variance forecast -> standard deviation, then
            # undo the x100 scaling applied in fit_garch.
            forecasts['volatility'] = (
                garch_forecast.variance.values[-1, :] ** 0.5 / 100
            )

        return forecasts

    def rolling_forecast(
        self,
        data: pd.Series,
        window: int = 252,
        horizon: int = 1
    ) -> pd.DataFrame:
        """
        Walk-forward validation with rolling window.
        Critical for realistic performance assessment.

        For every forecast origin ``i`` both models are refitted on
        ``data[i - window:i]`` and a ``horizon``-step forecast is produced.
        Only the last step of each forecast is recorded, together with the
        observation it targets.

        Args:
            data: Series to evaluate on; its index supplies the timestamps.
            window: Number of observations in each training window.
            horizon: Forecast horizon in steps.

        Returns:
            DataFrame with one row per origin and the columns ``timestamp``,
            ``predicted_mean``, ``predicted_vol`` and ``actual``. It is empty
            when ``data`` is too short for a single fold.
        """
        predictions = []

        # The last usable origin is len(data) - horizon: its forecast targets
        # the final observation. The "+ 1" keeps that fold, which an
        # exclusive bound of len(data) - horizon would silently drop.
        for i in range(window, len(data) - horizon + 1):
            # Training window
            train = data.iloc[i - window:i]

            # Fit models
            self.fit_arima(train)
            self.fit_garch(train)

            # Forecast
            forecast = self.forecast(horizon)

            # Realised values over the forecast horizon
            actual = data.iloc[i:i + horizon].values

            predictions.append({
                'timestamp': data.index[i + horizon - 1],
                'predicted_mean': forecast['mean'].values[-1],
                'predicted_vol': forecast['volatility'][-1],
                'actual': actual[-1]
            })

        return pd.DataFrame(predictions)
Modern architectures capture complex temporal dependencies:
1import torch
2import torch.nn as nn
3
class FinancialLSTM(nn.Module):
    """
    LSTM tailored for financial time series.

    Architecture:
    - Multi-layer LSTM encoder
    - Multi-head self-attention over the LSTM outputs (added residually)
    - Mean prediction head
    - Volatility prediction head (positive via Softplus)
    - Uncertainty head (positive via Softplus)

    All heads read the combined representation of the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: LSTM hidden size and attention embedding size; must be
            divisible by ``num_heads``.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout used between LSTM layers (only when
            ``num_layers > 1``), inside the attention layer, and in the mean
            and volatility heads.
        prediction_horizon: Number of outputs produced by each head.
        num_heads: Number of attention heads (previously fixed at 8).
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int = 256,
        num_layers: int = 3,
        dropout: float = 0.2,
        prediction_horizon: int = 1,
        num_heads: int = 8
    ):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.prediction_horizon = prediction_horizon

        # Multi-layer LSTM; inter-layer dropout is only applied when there is
        # more than one layer.
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )

        # Attention mechanism
        self.attention = nn.MultiheadAttention(
            hidden_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )

        # Mean prediction head
        self.mean_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, prediction_horizon)
        )

        # Volatility prediction head
        self.vol_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, prediction_horizon),
            nn.Softplus()  # Ensure positive
        )

        # Uncertainty head (no dropout, Softplus keeps the output positive)
        self.uncertainty_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, prediction_horizon),
            nn.Softplus()
        )

    def forward(self, x, hidden=None):
        """
        Args:
            x: (batch, seq_len, input_dim)
            hidden: Optional LSTM hidden state

        Returns:
            Dictionary with the keys ``'mean'``, ``'volatility'`` and
            ``'uncertainty'`` (each ``(batch, prediction_horizon)``),
            ``'attention_weights'`` as returned by the attention layer, and
            ``'hidden'`` (the LSTM state after the sequence).
        """
        # LSTM encoding
        lstm_out, hidden = self.lstm(x, hidden)

        # Self-attention over sequence
        attn_out, attn_weights = self.attention(lstm_out, lstm_out, lstm_out)

        # Residual combination of LSTM and attention, reduced to the last step
        last_hidden = (lstm_out + attn_out)[:, -1, :]

        return {
            'mean': self.mean_head(last_hidden),
            'volatility': self.vol_head(last_hidden),
            'uncertainty': self.uncertainty_head(last_hidden),
            'attention_weights': attn_weights,
            'hidden': hidden
        }
105
class TemporalFusionTransformer(nn.Module):
    """
    Transformer for multi-horizon forecasting.

    Based on "Temporal Fusion Transformers for Interpretable
    Multi-horizon Time Series Forecasting" (Lim et al., 2020).

    Handles:
    - Multiple input types (static, time-varying known, time-varying unknown)
    - Variable-length prediction horizons
    - Interpretable attention patterns

    Args:
        static_dim: Feature count of the static inputs.
        time_varying_known_dim: Feature count of the known time-varying inputs.
        time_varying_unknown_dim: Feature count of the unknown time-varying
            inputs.
        hidden_dim: Width of every internal representation.
        num_heads: Number of attention heads.
        num_encoder_layers: Accepted for interface compatibility; not used by
            this implementation (the encoder is a single-layer LSTM).
        num_decoder_layers: Accepted for interface compatibility; not used by
            this implementation (the decoder is a single-layer LSTM).
        dropout: Dropout used in the attention layer and in the gated
            residual networks.
        num_quantiles: Number of quantile output heads (previously fixed at 7).
    """

    def __init__(
        self,
        static_dim: int,
        time_varying_known_dim: int,
        time_varying_unknown_dim: int,
        hidden_dim: int = 256,
        num_heads: int = 8,
        num_encoder_layers: int = 4,
        num_decoder_layers: int = 4,
        dropout: float = 0.1,
        num_quantiles: int = 7
    ):
        super().__init__()

        # Variable selection networks
        self.static_selection = VariableSelectionNetwork(
            static_dim, hidden_dim
        )
        self.known_selection = VariableSelectionNetwork(
            time_varying_known_dim, hidden_dim
        )
        self.unknown_selection = VariableSelectionNetwork(
            time_varying_unknown_dim, hidden_dim
        )

        # GRN for static context. The configured dropout is forwarded so the
        # GRNs no longer silently fall back to their own default.
        self.static_encoder = GatedResidualNetwork(hidden_dim, dropout=dropout)

        # LSTM encoders for temporal processing
        self.encoder_lstm = nn.LSTM(
            hidden_dim, hidden_dim, batch_first=True
        )
        self.decoder_lstm = nn.LSTM(
            hidden_dim, hidden_dim, batch_first=True
        )

        # Multi-head attention
        self.self_attention = nn.MultiheadAttention(
            hidden_dim, num_heads, dropout=dropout, batch_first=True
        )

        # Gated residual network applied after attention
        self.grn_post_attention = GatedResidualNetwork(
            hidden_dim, dropout=dropout
        )

        # Output layer
        self.output_layer = nn.Linear(hidden_dim, 1)

        # Quantile prediction layers (for uncertainty), one head per quantile
        self.quantile_layers = nn.ModuleList([
            nn.Linear(hidden_dim, 1) for _ in range(num_quantiles)
        ])

    def forward(
        self,
        static_inputs,
        known_inputs,
        unknown_inputs,
        encoder_length: int,
        decoder_length: int
    ):
        """
        Args:
            static_inputs: (batch, static_dim)
            known_inputs: (batch, encoder_length + decoder_length, known_dim)
            unknown_inputs: (batch, encoder_length, unknown_dim)
            encoder_length: Number of leading steps of ``known_inputs`` that
                belong to the encoder.
            decoder_length: Not read by this method; the decoder consumes
                every step of ``known_inputs`` after ``encoder_length``.

        Returns:
            Dictionary with ``'mean'`` (one value per decoder step),
            ``'quantiles'`` (the quantile heads concatenated on the last
            axis) and ``'attention_weights'``.
        """
        # Variable selection
        static_vars = self.static_selection(static_inputs)
        known_vars = self.known_selection(known_inputs)
        unknown_vars = self.unknown_selection(unknown_inputs)

        # Static context enrichment
        static_context = self.static_encoder(static_vars)

        # Encoder: process historical data
        encoder_inputs = known_vars[:, :encoder_length] + unknown_vars
        encoder_out, encoder_state = self.encoder_lstm(encoder_inputs)

        # Decoder: process future known inputs, seeded with the encoder state
        decoder_inputs = known_vars[:, encoder_length:]
        decoder_out, _ = self.decoder_lstm(decoder_inputs, encoder_state)

        # Attention: decoder steps query the encoder outputs
        attn_out, attn_weights = self.self_attention(
            decoder_out, encoder_out, encoder_out
        )

        # Gated residual network conditioned on the static context
        grn_out = self.grn_post_attention(attn_out, static_context)

        # Output predictions
        mean_pred = self.output_layer(grn_out)

        # Quantile predictions for uncertainty
        quantile_preds = [
            layer(grn_out) for layer in self.quantile_layers
        ]

        return {
            'mean': mean_pred,
            'quantiles': torch.cat(quantile_preds, dim=-1),
            'attention_weights': attn_weights
        }
226
class VariableSelectionNetwork(nn.Module):
    """Project inputs to ``hidden_dim`` and rescale them with learned soft weights."""

    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()

        # Linear projection of the raw features.
        self.feature_transform = nn.Linear(input_dim, hidden_dim)

        # Softmax-normalised weights computed from the same raw features.
        self.variable_weights = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return the projected features multiplied element-wise by their weights."""
        projected = self.feature_transform(x)
        importance = self.variable_weights(x)
        return projected * importance
253
class GatedResidualNetwork(nn.Module):
    """Two-layer feed-forward block blended with its input through a learned gate."""

    def __init__(self, hidden_dim: int, dropout: float = 0.1):
        super().__init__()

        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.gate = nn.Linear(hidden_dim, hidden_dim)

        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context=None):
        """
        Args:
            x: Input features.
            context: Optional tensor; when given it is unsqueezed at dim 1
                and added to ``x`` before any other processing.

        Returns:
            Layer-normalised gated mix of the transformed and the
            (context-shifted) input features.
        """
        # The context-shifted input doubles as the residual branch.
        skip = x if context is None else x + context.unsqueeze(1)

        # Feed-forward branch: fc1 -> ReLU -> dropout -> fc2
        candidate = self.fc2(self.dropout(torch.relu(self.fc1(skip))))

        # Gate in (0, 1) decides how much of each branch is kept.
        mix = torch.sigmoid(self.gate(skip))
        blended = mix * candidate + (1 - mix) * skip

        return self.layer_norm(blended)
Financial time series require specialized training:
class TimeSeriesTrainer:
    """
    Training pipeline for financial forecasting models.

    Key features:
    - Walk-forward validation
    - Custom multi-component loss (magnitude, direction, volatility, NLL)

    The methods ``_create_dataloader(data)`` and ``_validate(data)`` are
    called by this class but are not defined in it as shown; they have to be
    provided (for example by a subclass) before training can run.

    Args:
        model: Module to train. It is called with a single input batch and
            must return a dict with the keys ``'mean'``, ``'volatility'`` and
            ``'uncertainty'``.
        config: Dict with the keys ``'learning_rate'``, ``'weight_decay'``
            and ``'total_steps'``.
    """

    # Lower bound on the predictive variance inside the NLL term; keeps the
    # log and the division finite when both predicted scales collapse to 0.
    _VARIANCE_FLOOR = 1e-8

    def __init__(self, model, config: dict):
        self.model = model
        self.config = config

        # Optimizer
        self.optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=config['learning_rate'],
            weight_decay=config['weight_decay']
        )

        # Scheduler (stepped once per batch in _train_epoch)
        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
            self.optimizer,
            max_lr=config['learning_rate'],
            total_steps=config['total_steps']
        )

    def train_walk_forward(
        self,
        data: pd.DataFrame,
        train_window: int = 252,
        val_window: int = 63,
        step: int = 21
    ):
        """
        Walk-forward training and validation.
        Mimics real-world deployment where model is retrained periodically.

        Args:
            data: Frame to split; rows are taken positionally.
            train_window: Rows in each training slice.
            val_window: Rows in each validation slice (directly after the
                training slice).
            step: Rows by which consecutive folds are shifted.

        Returns:
            List with one dict per fold containing ``'train_period'``,
            ``'val_period'`` (first/last index label of each slice),
            ``'train_metrics'`` and ``'val_metrics'``.
        """
        results = []

        # The last valid start leaves exactly train_window + val_window rows,
        # hence the "+ 1": without it a fold that fits exactly is skipped.
        last_start = len(data) - train_window - val_window
        for start_idx in range(0, last_start + 1, step):
            # Split data
            train_end = start_idx + train_window
            val_end = train_end + val_window

            train_data = data.iloc[start_idx:train_end]
            val_data = data.iloc[train_end:val_end]

            # Train on this window
            print(f"Training on {train_data.index[0]} to {train_data.index[-1]}")
            train_metrics = self._train_epoch(train_data)

            # Validate
            val_metrics = self._validate(val_data)

            results.append({
                'train_period': (train_data.index[0], train_data.index[-1]),
                'val_period': (val_data.index[0], val_data.index[-1]),
                'train_metrics': train_metrics,
                'val_metrics': val_metrics
            })

        return results

    def _train_epoch(self, data: pd.DataFrame):
        """
        Train for one epoch.

        Returns:
            ``{'loss': mean batch loss}``; the loss is NaN when the
            dataloader yields no batches (instead of dividing by zero).
        """
        self.model.train()
        running_loss = 0.0
        num_batches = 0

        for x, y_mean, y_vol in self._create_dataloader(data):
            # Forward pass
            output = self.model(x)

            # Multi-component loss
            loss = self._compute_loss(output, y_mean, y_vol)

            # Backward pass
            self.optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            self.optimizer.step()
            self.scheduler.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            return {'loss': float('nan')}

        return {'loss': running_loss / num_batches}

    def _compute_loss(self, output, y_mean, y_vol):
        """
        Multi-objective loss for forecasting.

        Combines (with weights 1.0 / 0.5 / 0.3 / 0.2):
        1. Mean prediction accuracy (MSE)
        2. Directional accuracy (hinge on ``-pred * target``)
        3. Volatility prediction (MSE)
        4. Uncertainty calibration (Gaussian negative log-likelihood)

        Args:
            output: Model output dict with ``'mean'``, ``'volatility'`` and
                ``'uncertainty'`` tensors.
            y_mean: Target for the mean prediction.
            y_vol: Target for the volatility prediction.

        Returns:
            Scalar tensor with the weighted total loss.
        """
        pred_mean = output['mean']
        pred_vol = output['volatility']
        pred_uncertainty = output['uncertainty']

        # 1. Mean squared error on returns
        mse_loss = torch.mean((pred_mean - y_mean) ** 2)

        # 2. Directional accuracy: positive only when signs disagree
        direction_loss = torch.mean(torch.relu(-pred_mean * y_mean))

        # 3. Volatility loss
        vol_loss = torch.mean((pred_vol - y_vol) ** 2)

        # 4. Negative log-likelihood (uncertainty calibration)
        # Assume Gaussian with predicted mean and variance; the variance is
        # floored so log() and the division stay finite.
        variance = torch.clamp(
            pred_vol ** 2 + pred_uncertainty ** 2,
            min=self._VARIANCE_FLOOR
        )
        nll_loss = torch.mean(
            0.5 * torch.log(2 * np.pi * variance) +
            0.5 * (y_mean - pred_mean) ** 2 / variance
        )

        # Combine losses
        total_loss = (
            1.0 * mse_loss +
            0.5 * direction_loss +
            0.3 * vol_loss +
            0.2 * nll_loss
        )

        return total_loss
Combine multiple models for robust predictions:
class ForecastingEnsemble:
    """
    Ensemble of classical and deep learning models.

    Members, in order:
    - 'classical': ARIMA/GARCH baseline
    - 'lstm': LSTM (deep learning)
    - 'transformer': attention-based model
    - 'xgboost': gradient boosting slot, left as ``None`` until initialised

    ``predict`` relies on ``self._get_model_prediction(name, model, data)``,
    which is not defined in this class as shown.
    """

    def __init__(self):
        # Members are created in this order so parameter initialisation
        # follows the same sequence as the dict keys.
        members = {}
        members['classical'] = ClassicalForecaster()
        members['lstm'] = FinancialLSTM(input_dim=10)
        members['transformer'] = TemporalFusionTransformer(
            static_dim=5,
            time_varying_known_dim=10,
            time_varying_unknown_dim=20
        )
        members['xgboost'] = None  # Lazy initialization
        self.models = members

        # Learned ensemble weights, starting out uniform
        member_count = len(self.models)
        self.ensemble_weights = nn.Parameter(
            torch.ones(member_count) / member_count
        )

    def predict(self, data):
        """
        Generate ensemble forecast.

        Returns:
            Dict with ``'mean'`` (softmax-weighted sum of the member means),
            ``'uncertainty'`` (standard deviation of the member means across
            members), ``'individual_predictions'`` and ``'weights'``.
        """
        # One prediction per member, keyed by member name
        predictions = {
            name: self._get_model_prediction(name, model, data)
            for name, model in self.models.items()
        }

        # Normalise the learned weights
        weights = torch.softmax(self.ensemble_weights, dim=0)

        # Weighted sum of the member means
        ensemble_mean = 0
        for weight, name in zip(weights, predictions):
            ensemble_mean = ensemble_mean + weight * predictions[name]['mean']

        # Uncertainty from ensemble disagreement
        member_means = torch.stack(
            [member['mean'] for member in predictions.values()]
        )
        ensemble_std = torch.std(member_means, dim=0)

        return {
            'mean': ensemble_mean,
            'uncertainty': ensemble_std,
            'individual_predictions': predictions,
            'weights': weights
        }
class ProductionForecaster:
    """
    Production-ready forecasting system.

    Args:
        model_registry: Dict of models; ``generate_forecast`` reads the keys
            ``'ensemble'`` and ``'classical'``.
        config: Configuration dict, stored on the instance.
    """

    # Sanity limits applied by _is_forecast_valid.
    MAX_ABS_RETURN = 0.1    # 10% return
    MAX_UNCERTAINTY = 0.05  # 5% uncertainty

    def __init__(self, model_registry: dict, config: dict):
        self.models = model_registry
        self.config = config
        self.monitoring = ForecastMonitoring()

    async def generate_forecast(
        self,
        symbol: str,
        horizon: int,
        market_data: dict
    ):
        """
        Generate production forecast with monitoring.

        The ensemble forecast is used when it passes the sanity checks;
        otherwise the classical model's ``forecast(horizon)`` is returned
        instead. Either way the result is logged through the monitor.
        """
        # Prepare features
        features = await self._prepare_features(symbol, market_data)

        # Generate prediction
        with torch.no_grad():
            forecast = self.models['ensemble'].predict(features)

        # Quality checks
        if not self._is_forecast_valid(forecast):
            # Fall back to simple model
            forecast = self.models['classical'].forecast(horizon)

        # Monitor prediction quality
        await self.monitoring.log_forecast(symbol, forecast)

        return forecast

    def _is_forecast_valid(self, forecast) -> bool:
        """
        Sanity checks on forecast.

        Args:
            forecast: Dict with tensor entries ``'mean'`` and
                ``'uncertainty'``.

        Returns:
            False when either tensor contains NaN or infinity, when any
            absolute mean exceeds ``MAX_ABS_RETURN``, or when any uncertainty
            exceeds ``MAX_UNCERTAINTY``; True otherwise.
        """
        mean = forecast['mean']
        uncertainty = forecast['uncertainty']

        # Non-finite values must be rejected explicitly: a comparison such as
        # NaN > limit is False, so a NaN would pass the range checks below.
        if not torch.isfinite(mean).all():
            return False
        if not torch.isfinite(uncertainty).all():
            return False

        # Check magnitude (no crazy predictions)
        if torch.abs(mean).max() > self.MAX_ABS_RETURN:
            return False

        # Check uncertainty is reasonable
        if uncertainty.max() > self.MAX_UNCERTAINTY:
            return False

        return True
Financial forecasting is hard, but achievable with the right combination of classical statistics, modern deep learning, and rigorous validation. The key is treating this as an engineering problem with strict performance requirements.
Need forecasting systems for trading? Get in touch to discuss your needs.
Technical Writer
NordVarg Team is a software engineer at NordVarg specializing in high-performance financial systems and type-safe programming.
Get weekly insights on building high-performance financial systems, latest industry trends, and expert tips delivered straight to your inbox.