Performance bugs are insidious: they don't crash your system, they slowly degrade it. After catching dozens of performance regressions in CI before they reached production, I've learned that automated performance testing is as essential as functional testing. This article walks through the performance regression detection we run in CI.
We have had multiple production incidents caused by performance regressions, and all of them were preventable with automated performance testing.
import time
import statistics
import psutil
import tracemalloc
from dataclasses import dataclass
from typing import Callable, List, Dict, Any
import json

@dataclass
class BenchmarkResult:
    """Results from a benchmark run."""
    name: str
    iterations: int

    # Latency (seconds)
    min_latency: float
    max_latency: float
    mean_latency: float
    median_latency: float
    p95_latency: float
    p99_latency: float

    # Memory (bytes)
    peak_memory: int
    memory_leaked: int

    # CPU
    cpu_percent: float

    # Throughput
    ops_per_second: float

    @classmethod
    def from_measurements(cls, name: str, latencies: List[float],
                          peak_memory: int, memory_leaked: int,
                          cpu_percent: float) -> 'BenchmarkResult':
        """Create result from measurements."""
        sorted_latencies = sorted(latencies)
        n = len(sorted_latencies)

        return cls(
            name=name,
            iterations=n,
            min_latency=sorted_latencies[0],
            max_latency=sorted_latencies[-1],
            mean_latency=statistics.mean(sorted_latencies),
            median_latency=statistics.median(sorted_latencies),
            p95_latency=sorted_latencies[int(n * 0.95)],
            p99_latency=sorted_latencies[int(n * 0.99)],
            peak_memory=peak_memory,
            memory_leaked=memory_leaked,
            cpu_percent=cpu_percent,
            ops_per_second=n / sum(sorted_latencies) if sum(sorted_latencies) > 0 else 0
        )

class PerformanceBenchmark:
    """Run and track performance benchmarks."""

    def __init__(self, name: str, warmup_iterations: int = 100,
                 benchmark_iterations: int = 1000):
        self.name = name
        self.warmup_iterations = warmup_iterations
        self.benchmark_iterations = benchmark_iterations

    def run(self, func: Callable[[], Any]) -> BenchmarkResult:
        """
        Run benchmark with warmup, memory tracking, and CPU monitoring.

        Args:
            func: Function to benchmark (should be fast, called many times)

        Returns:
            BenchmarkResult with all metrics
        """
        # Warmup
        for _ in range(self.warmup_iterations):
            func()

        # Start tracking
        tracemalloc.start()
        process = psutil.Process()
        process.cpu_percent()  # Prime the counter; the first call always returns 0.0

        initial_memory = tracemalloc.get_traced_memory()[0]

        # Benchmark
        latencies = []
        for _ in range(self.benchmark_iterations):
            start = time.perf_counter()
            func()
            end = time.perf_counter()
            latencies.append(end - start)

        # Collect metrics
        final_memory, peak_memory = tracemalloc.get_traced_memory()
        tracemalloc.stop()

        # CPU utilization over the benchmark interval, measured since the priming call
        cpu_percent = process.cpu_percent()

        memory_leaked = final_memory - initial_memory

        return BenchmarkResult.from_measurements(
            name=self.name,
            latencies=latencies,
            peak_memory=peak_memory,
            memory_leaked=memory_leaked,
            cpu_percent=cpu_percent
        )

class BenchmarkComparator:
    """Compare benchmark results and detect regressions."""

    def __init__(self, baseline_path: str):
        self.baseline_path = baseline_path
        self.baseline = self._load_baseline()

    def _load_baseline(self) -> Dict[str, BenchmarkResult]:
        """Load baseline results from file."""
        try:
            with open(self.baseline_path, 'r') as f:
                data = json.load(f)

            return {
                name: BenchmarkResult(**result)
                for name, result in data.items()
            }
        except FileNotFoundError:
            return {}

    def save_baseline(self, results: Dict[str, BenchmarkResult]):
        """Save results as new baseline."""
        data = {
            name: {
                k: v for k, v in result.__dict__.items()
            }
            for name, result in results.items()
        }

        with open(self.baseline_path, 'w') as f:
            json.dump(data, f, indent=2)

    def compare(self, name: str, current: BenchmarkResult,
                latency_threshold_pct: float = 5.0,
                memory_threshold_pct: float = 10.0,
                throughput_threshold_pct: float = 5.0) -> Dict[str, Any]:
        """
        Compare current result to baseline.

        Args:
            name: Benchmark name
            current: Current benchmark result
            latency_threshold_pct: % increase allowed before regression
            memory_threshold_pct: % increase allowed before regression
            throughput_threshold_pct: % decrease allowed before regression

        Returns:
            Comparison result with regression flags
        """
        if name not in self.baseline:
            return {
                'is_regression': False,
                'message': 'No baseline for comparison',
                'baseline_exists': False
            }

        baseline = self.baseline[name]

        # Calculate changes
        p99_change_pct = (
            (current.p99_latency - baseline.p99_latency) / baseline.p99_latency * 100
        )
        mean_change_pct = (
            (current.mean_latency - baseline.mean_latency) / baseline.mean_latency * 100
        )
        memory_change_pct = (
            (current.peak_memory - baseline.peak_memory) / baseline.peak_memory * 100
        ) if baseline.peak_memory > 0 else 0
        throughput_change_pct = (
            (current.ops_per_second - baseline.ops_per_second) / baseline.ops_per_second * 100
        ) if baseline.ops_per_second > 0 else 0

        # Detect regressions
        regressions = []

        if p99_change_pct > latency_threshold_pct:
            regressions.append(
                f"P99 latency increased {p99_change_pct:.1f}% "
                f"({baseline.p99_latency*1e6:.0f}μs → {current.p99_latency*1e6:.0f}μs)"
            )

        if mean_change_pct > latency_threshold_pct:
            regressions.append(
                f"Mean latency increased {mean_change_pct:.1f}% "
                f"({baseline.mean_latency*1e6:.0f}μs → {current.mean_latency*1e6:.0f}μs)"
            )

        if memory_change_pct > memory_threshold_pct:
            regressions.append(
                f"Peak memory increased {memory_change_pct:.1f}% "
                f"({baseline.peak_memory/1024/1024:.1f}MB → {current.peak_memory/1024/1024:.1f}MB)"
            )

        if current.memory_leaked > 1024 * 1024:  # More than 1MB retained after the run
            regressions.append(
                f"Memory leak detected: {current.memory_leaked/1024/1024:.1f}MB"
            )

        if throughput_change_pct < -throughput_threshold_pct:
            regressions.append(
                f"Throughput decreased {abs(throughput_change_pct):.1f}% "
                f"({baseline.ops_per_second:.0f} → {current.ops_per_second:.0f} ops/sec)"
            )

        return {
            'is_regression': len(regressions) > 0,
            'regressions': regressions,
            'baseline_exists': True,
            'metrics': {
                'p99_latency_us': current.p99_latency * 1e6,
                'p99_change_pct': p99_change_pct,
                'mean_latency_us': current.mean_latency * 1e6,
                'mean_change_pct': mean_change_pct,
                'peak_memory_mb': current.peak_memory / 1024 / 1024,
                'memory_change_pct': memory_change_pct,
                'throughput': current.ops_per_second,
                'throughput_change_pct': throughput_change_pct
            }
        }
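
Before the domain-specific benchmarks below, a minimal usage sketch may help tie the pieces together. The workload function, the baseline.json path, and the thresholds here are illustrative placeholders, not part of the harness above:

# Minimal usage sketch for the harness above (names and values are illustrative).
def sample_workload():
    # Hypothetical stand-in for real application code under test
    return sum(i * i for i in range(1000))

benchmark = PerformanceBenchmark("sample_workload", warmup_iterations=100,
                                 benchmark_iterations=1000)
result = benchmark.run(sample_workload)

comparator = BenchmarkComparator("baseline.json")
comparison = comparator.compare("sample_workload", result,
                                latency_threshold_pct=5.0,
                                memory_threshold_pct=10.0)

if comparison['is_regression']:
    for message in comparison['regressions']:
        print(f"REGRESSION: {message}")
else:
    # No regression (or no baseline yet): record this run as the new baseline
    comparator.save_baseline({"sample_workload": result})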
import numpy as np
from typing import Optional

class OrderBook:
    """Simple order book for benchmarking."""

    def __init__(self):
        self.bids = []  # (price, size)
        self.asks = []

    def add_order(self, side: str, price: float, size: int):
        if side == 'buy':
            self.bids.append((price, size))
            self.bids.sort(reverse=True, key=lambda x: x[0])
        else:
            self.asks.append((price, size))
            self.asks.sort(key=lambda x: x[0])

    def get_mid_price(self) -> Optional[float]:
        if not self.bids or not self.asks:
            return None
        return (self.bids[0][0] + self.asks[0][0]) / 2

# Benchmark: Order book updates
def benchmark_orderbook_updates():
    """Benchmark order book update performance."""
    book = OrderBook()

    def update():
        book.add_order('buy', 100.0 + np.random.randn(), 100)
        book.add_order('sell', 100.0 + np.random.randn(), 100)
        _ = book.get_mid_price()

    benchmark = PerformanceBenchmark("orderbook_updates", warmup_iterations=1000,
                                     benchmark_iterations=10000)
    return benchmark.run(update)

# Benchmark: VaR calculation
def benchmark_var_calculation():
    """Benchmark VaR calculation performance."""
    returns = np.random.randn(1000) * 0.01

    def calc_var():
        return -np.percentile(returns, 1)

    benchmark = PerformanceBenchmark("var_calculation", warmup_iterations=100,
                                     benchmark_iterations=1000)
    return benchmark.run(calc_var)

# Benchmark: Trade matching
def benchmark_trade_matching():
    """Benchmark trade matching logic."""
    orders = [
        {'side': 'buy', 'price': 100 + i * 0.01, 'size': 100}
        for i in range(100)
    ]

    def match():
        matches = []
        for i in range(len(orders) - 1):
            if orders[i]['price'] >= orders[i+1]['price']:
                matches.append((orders[i], orders[i+1]))
        return matches

    benchmark = PerformanceBenchmark("trade_matching", warmup_iterations=100,
                                     benchmark_iterations=1000)
    return benchmark.run(match)
#!/usr/bin/env python3
"""
Performance regression test runner for CI.

Usage:
    python performance_ci.py --baseline baseline.json --output results.json

Exit codes:
    0: All benchmarks pass
    1: Performance regression detected
    2: Error running benchmarks
"""

import argparse
import sys
import json
from typing import Dict

def run_all_benchmarks() -> Dict[str, BenchmarkResult]:
    """Run all performance benchmarks."""
    results = {}

    # Order book benchmarks
    results['orderbook_updates'] = benchmark_orderbook_updates()

    # Risk calculation benchmarks
    results['var_calculation'] = benchmark_var_calculation()

    # Trade matching benchmarks
    results['trade_matching'] = benchmark_trade_matching()

    return results

def main():
    parser = argparse.ArgumentParser(
        description='Run performance regression tests'
    )
    parser.add_argument('--baseline', required=True,
                        help='Path to baseline results JSON')
    parser.add_argument('--output', required=True,
                        help='Path to save current results')
    parser.add_argument('--update-baseline', action='store_true',
                        help='Update baseline with current results')
    parser.add_argument('--latency-threshold', type=float, default=5.0,
                        help='Latency regression threshold (%%)')
    parser.add_argument('--memory-threshold', type=float, default=10.0,
                        help='Memory regression threshold (%%)')

    args = parser.parse_args()

    try:
        # Run benchmarks
        print("Running performance benchmarks...")
        results = run_all_benchmarks()

        # Save results
        with open(args.output, 'w') as f:
            data = {
                name: result.__dict__
                for name, result in results.items()
            }
            json.dump(data, f, indent=2)

        print(f"Results saved to {args.output}")

        # Compare to baseline
        comparator = BenchmarkComparator(args.baseline)

        all_regressions = []

        for name, result in results.items():
            print(f"\n{name}:")
            print(f"  P99 latency: {result.p99_latency*1e6:.0f}μs")
            print(f"  Mean latency: {result.mean_latency*1e6:.0f}μs")
            print(f"  Throughput: {result.ops_per_second:.0f} ops/sec")
            print(f"  Peak memory: {result.peak_memory/1024/1024:.1f}MB")

            comparison = comparator.compare(
                name, result,
                latency_threshold_pct=args.latency_threshold,
                memory_threshold_pct=args.memory_threshold
            )

            if comparison['is_regression']:
                print("  ⚠️ REGRESSION DETECTED:")
                for regression in comparison['regressions']:
                    print(f"    - {regression}")
                all_regressions.extend(comparison['regressions'])
            elif comparison['baseline_exists']:
                print("  ✓ No regression")
            else:
                print("  ℹ️ No baseline")

        # Update baseline if requested
        if args.update_baseline:
            comparator.save_baseline(results)
            print(f"\nBaseline updated: {args.baseline}")

        # Exit with error if regressions detected
        if all_regressions:
            print(f"\n❌ {len(all_regressions)} performance regression(s) detected!")
            return 1
        else:
            print("\n✅ All performance tests passed")
            return 0

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 2

if __name__ == '__main__':
    sys.exit(main())
# .github/workflows/performance.yml
name: Performance Tests

on:
  pull_request:
    branches: [ main ]
  schedule:
    - cron: '0 2 * * *'  # Nightly

jobs:
  performance:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install numpy psutil pytest

      - name: Download baseline
        run: |
          # Download from previous successful run
          gh run download --name performance-baseline || echo "No baseline yet"
        env:
          GH_TOKEN: ${{ github.token }}

      - name: Run performance tests
        id: perf
        run: |
          python performance_ci.py \
            --baseline baseline.json \
            --output results.json \
            --latency-threshold 5.0 \
            --memory-threshold 10.0
        continue-on-error: true

      - name: Upload results
        uses: actions/upload-artifact@v3
        with:
          name: performance-results
          path: results.json

      - name: Comment PR
        if: github.event_name == 'pull_request' && steps.perf.outcome == 'failure'
        uses: actions/github-script@v6
        with:
          script: |
            const fs = require('fs');
            const results = JSON.parse(fs.readFileSync('results.json'));

            let comment = '## ⚠️ Performance Regression Detected\n\n';
            comment += '| Benchmark | P99 Latency | Change | Status |\n';
            comment += '|-----------|-------------|--------|--------|\n';

            for (const [name, result] of Object.entries(results)) {
              comment += `| ${name} | ${(result.p99_latency*1e6).toFixed(0)}μs | - | ⚠️ |\n`;
            }

            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: comment
            });

      - name: Fail if regression
        if: steps.perf.outcome == 'failure'
        run: exit 1

      - name: Update baseline on main
        if: github.ref == 'refs/heads/main' && steps.perf.outcome == 'success'
        run: |
          cp results.json baseline.json

      - name: Upload new baseline
        if: github.ref == 'refs/heads/main'
        uses: actions/upload-artifact@v3
        with:
          name: performance-baseline
          path: baseline.json
Performance regressions caught in CI (2020-2024):

Year   Regressions Caught   Escaped to Prod   Avg Impact
─────────────────────────────────────────────────────────
2020   3                    5                 +85μs latency
2021   12                   2                 +120μs latency
2022   18                   1                 +45μs latency
2023   24                   0                 N/A
2024   31                   0                 N/A
PR #1234: Optimize order validation
❌ Performance regression detected:

orderbook_updates:
  - P99 latency increased 12.3% (45μs → 51μs)
  - Mean latency increased 8.7% (32μs → 35μs)

Root cause: Added database lookup for duplicate check
Fix: Cache recent order IDs in memory (LRU, 10k entries)
Result: 28μs P99 (38% faster than original)
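
The fix line above is terse, so here is a minimal sketch of what an in-memory LRU duplicate check of that shape could look like. The class name, the 10k capacity default, and the OrderedDict approach are illustrative assumptions, not the code from that PR:

from collections import OrderedDict

class RecentOrderIds:
    """Illustrative LRU set of recently seen order IDs (not the actual PR code)."""

    def __init__(self, capacity: int = 10_000):
        self.capacity = capacity
        self._ids = OrderedDict()

    def seen_before(self, order_id: str) -> bool:
        """Return True if order_id was seen recently; otherwise record it."""
        if order_id in self._ids:
            self._ids.move_to_end(order_id)  # Keep hot IDs from being evicted
            return True
        self._ids[order_id] = None
        if len(self._ids) > self.capacity:
            self._ids.popitem(last=False)  # Evict the least recently used ID
        return False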
Performance regression testing transformed our reliability: we went from regular production performance incidents to zero escapes in two years.