News moves markets. This article shows how to extract alpha from news using modern NLP and LLMs:
Key Insight: The edge isn't just reading news faster—it's understanding it better.
Markets are not perfectly efficient. News creates temporary information asymmetry:
Example: Company announces earnings beat. Three types of traders:
The edge: Smart + Fast > Just Fast
News Sources → Ingestion → NLP Processing → Signal Generation → Execution
↓ ↓ ↓ ↓ ↓
RSS feeds Deduplication Sentiment Alpha signals Order routing
APIs Filtering Entities Position sizing Risk checks
Scraping Parsing Events Portfolio mgmt Execution
Latency Budget (for HFT):
Tier 1: Premium feeds (low latency, high cost)
Tier 2: Public APIs (medium latency, low cost)
Tier 3: Web scraping (high latency, free)
import hashlib
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Dict, List, Optional

import feedparser
import requests
7
@dataclass
class NewsArticle:
    """Container for a single news article.

    The first six fields are required. ``entities`` and ``sentiment`` are
    optional enrichment slots that default to ``None`` until a later
    processing stage fills them in.
    """

    article_id: str
    title: str
    content: str
    source: str
    published_at: datetime
    url: str
    # Explicit Optional: a bare ``List[str] = None`` is rejected by type checkers.
    entities: Optional[List[str]] = None
    sentiment: Optional[float] = None
19
class NewsIngestionEngine:
    """
    Multi-source news ingestion with deduplication

    Supports:
    - RSS feeds (via feedparser)
    - The NewsAPI.org REST API (via requests)

    Web scraping is not implemented by this class.

    Every article returned by any fetch method is remembered in
    ``seen_articles`` (keyed by a hash of title + URL), so the same article is
    never returned twice by the same engine instance. All ``published_at``
    values produced here are timezone-aware UTC datetimes so that articles
    from different sources can be compared and sorted together.
    """

    NEWSAPI_URL = 'https://newsapi.org/v2/everything'

    def __init__(self):
        self.seen_articles = set()  # IDs of articles already returned

    def fetch_from_rss(self, feed_url: str) -> List["NewsArticle"]:
        """
        Fetch news from RSS feed

        Args:
            feed_url: RSS feed URL

        Returns:
            List of NewsArticle objects not seen before by this engine
        """
        feed = feedparser.parse(feed_url)
        source = feed.feed.get('title', 'Unknown')
        articles = []

        for entry in feed.entries:
            title = entry.get('title', '')
            link = entry.get('link', '')

            # An entry with neither title nor link cannot be identified.
            if not (title or link):
                continue

            article_id = self._generate_id(title, link)
            if not self._mark_seen(article_id):
                continue  # duplicate

            articles.append(NewsArticle(
                article_id=article_id,
                title=title,
                content=entry.get('summary', '') or entry.get('description', ''),
                source=source,
                # The key may be absent or None when the date was unparseable.
                published_at=self._parse_rss_date(entry.get('published_parsed')),
                url=link,
            ))

        return articles

    def fetch_from_newsapi(self,
                           api_key: str,
                           query: str = 'stock market',
                           language: str = 'en',
                           timeout: float = 10.0) -> List["NewsArticle"]:
        """
        Fetch news from NewsAPI.org

        Args:
            api_key: NewsAPI.org API key
            query: Search query
            language: Language code
            timeout: Seconds to wait for the HTTP response before
                ``requests`` raises a timeout error

        Returns:
            List of NewsArticle objects not seen before by this engine.
            Empty when the API response status is not ``'ok'``.
        """
        params = {
            'q': query,
            'language': language,
            'apiKey': api_key,
            'sortBy': 'publishedAt',
            'pageSize': 100,
        }

        # Without a timeout a stalled connection would block forever.
        response = requests.get(self.NEWSAPI_URL, params=params, timeout=timeout)
        data = response.json()

        if data.get('status') != 'ok':
            return []

        articles = []
        for item in data.get('articles', []):
            # Fields may be present but null in the payload.
            title = item.get('title') or ''
            url = item.get('url') or ''

            article_id = self._generate_id(title, url)
            if not self._mark_seen(article_id):
                continue  # duplicate

            articles.append(NewsArticle(
                article_id=article_id,
                title=title,
                content=item.get('content') or item.get('description') or '',
                source=(item.get('source') or {}).get('name') or 'Unknown',
                published_at=self._parse_iso_date(item.get('publishedAt')),
                url=url,
            ))

        return articles

    def _mark_seen(self, article_id: str) -> bool:
        """Record article_id; return True if it was new, False if a duplicate."""
        if article_id in self.seen_articles:
            return False
        self.seen_articles.add(article_id)
        return True

    @staticmethod
    def _parse_rss_date(parsed) -> datetime:
        """Convert a feedparser time tuple to an aware UTC datetime.

        Falls back to the current UTC time when ``parsed`` is missing/None.
        """
        if parsed:
            return datetime(*parsed[:6], tzinfo=timezone.utc)
        return datetime.now(timezone.utc)

    @staticmethod
    def _parse_iso_date(value) -> datetime:
        """Convert an ISO-8601 string (optionally ending in 'Z') to aware UTC.

        Falls back to the current UTC time when the value is missing or
        cannot be parsed; a value without an offset is assumed to be UTC.
        """
        if value:
            try:
                parsed = datetime.fromisoformat(value.replace('Z', '+00:00'))
            except ValueError:
                parsed = None
            if parsed is not None:
                return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)
        return datetime.now(timezone.utc)

    def _generate_id(self, title: str, url: str) -> str:
        """Generate unique article ID from title and URL"""
        # MD5 is used only as a fast fingerprint for deduplication.
        content = f"{title}|{url}"
        return hashlib.md5(content.encode()).hexdigest()
133
134
# Example usage
if __name__ == "__main__":
    engine = NewsIngestionEngine()

    # Pull one RSS feed (example: Reuters business news)
    reuters_feed = 'https://www.reutersagency.com/feed/?taxonomy=best-topics&post_type=best'
    rss_articles = engine.fetch_from_rss(reuters_feed)

    print(f"Fetched {len(rss_articles)} articles from RSS")

    # Show the first article, if the feed returned anything
    if rss_articles:
        sample = rss_articles[0]
        print(f"\nSample article:")
        print(f"Title: {sample.title}")
        print(f"Source: {sample.source}")
        print(f"Published: {sample.published_at}")
Traditional sentiment analysis (VADER, TextBlob) fails on financial news:
Example: "The company beat earnings but missed revenue guidance"
Solution: Use FinBERT - BERT fine-tuned on financial text.
1from transformers import AutoTokenizer, AutoModelForSequenceClassification
2import torch
3import numpy as np
4
class FinancialSentimentAnalyzer:
    """
    Financial sentiment analysis using FinBERT

    FinBERT is BERT fine-tuned on financial news and analyst reports.

    The order of the model's output classes is read from the checkpoint's
    own ``id2label`` config rather than assumed, because different FinBERT
    checkpoints order their classes differently. Results are always reported
    under the names 'negative', 'neutral' and 'positive'.
    """

    # Names (and order) used for the 'probabilities' dict in every result.
    _REPORTED_LABELS = ('negative', 'neutral', 'positive')

    def __init__(self, model_name: str = 'ProsusAI/finbert'):
        """
        Args:
            model_name: HuggingFace model name
                - 'ProsusAI/finbert': General financial sentiment
                - 'yiyanghkust/finbert-tone': Tone analysis

        Raises:
            ValueError: If the checkpoint does not expose negative, neutral
                and positive classes in its ``id2label`` config.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.eval()

        # Label mapping: index i of the logits corresponds to self.labels[i].
        id2label = getattr(self.model.config, 'id2label', None) or {}
        self.labels = [str(id2label[idx]).lower() for idx in sorted(id2label, key=int)]

        missing = [name for name in self._REPORTED_LABELS if name not in self.labels]
        if missing:
            raise ValueError(
                f"Model '{model_name}' does not define sentiment labels {missing}; "
                f"found {self.labels}"
            )

    def analyze(self, text: str) -> Dict[str, float]:
        """
        Analyze sentiment of text

        Args:
            text: Input text (headline or article)

        Returns:
            Dict with 'label', 'score' (positive minus negative probability,
            in -1..+1), 'probabilities' and 'confidence'
        """
        result = self.analyze_batch([text])[0]
        # The single-text result has never included the input text itself.
        del result['text']
        return result

    def analyze_batch(self, texts: List[str], batch_size: int = 32) -> List[Dict]:
        """
        Analyze multiple texts efficiently

        Args:
            texts: Input texts; may be empty
            batch_size: Maximum number of texts sent through the model at once

        Returns:
            One result dict per input text, in input order. Each has the
            keys of ``analyze`` plus 'text'.

        Raises:
            ValueError: If batch_size is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        results = []

        # An empty input never reaches the tokenizer (the loop body is skipped).
        for start in range(0, len(texts), batch_size):
            chunk = list(texts[start:start + batch_size])

            inputs = self.tokenizer(
                chunk,
                return_tensors='pt',
                truncation=True,
                max_length=512,
                padding=True
            )

            with torch.no_grad():
                logits = self.model(**inputs).logits
                probs = torch.softmax(logits, dim=1).numpy()

            for text, prob in zip(chunk, probs):
                results.append({'text': text, **self._summarize(prob)})

        return results

    def _summarize(self, prob) -> Dict:
        """Turn one row of class probabilities into a result dict.

        ``prob`` is indexed in the same order as ``self.labels``.
        """
        index_of = {name: self.labels.index(name) for name in self._REPORTED_LABELS}
        top = int(np.argmax(prob))

        return {
            'label': self.labels[top],
            # Sentiment score (-1 to +1): positive minus negative probability
            'score': float(prob[index_of['positive']] - prob[index_of['negative']]),
            'probabilities': {
                name: float(prob[index_of[name]]) for name in self._REPORTED_LABELS
            },
            'confidence': float(prob[top])
        }
107
108
# Example usage
if __name__ == "__main__":
    analyzer = FinancialSentimentAnalyzer()

    # Headlines covering positive, negative and neutral cases
    headlines = [
        "Apple beats earnings expectations, stock surges",
        "Tesla misses revenue guidance, shares tumble",
        "Fed raises interest rates by 25 basis points as expected",
        "Amazon announces massive layoffs amid cost-cutting measures",
        "Microsoft acquires gaming company for $69 billion",
    ]

    print("Financial Sentiment Analysis:")
    print("=" * 80)

    # Score all headlines in one batch, then report each one
    for scored in analyzer.analyze_batch(headlines):
        print(f"\nHeadline: {scored['text']}")
        print(f"Sentiment: {scored['label']} (score: {scored['score']:.3f}, confidence: {scored['confidence']:.3f})")
News mentions companies, but we need to map them to ticker symbols:
Example: "Apple announces new iPhone"
1import spacy
2from typing import List, Tuple
3import re
4
class FinancialNER:
    """
    Named Entity Recognition for financial news

    Extracts:
    - Companies (ORG)
    - People (PERSON)
    - Locations (GPE)
    - Monetary values (MONEY)
    - Dates (DATE)

    Ticker lists returned by this class keep first-seen order, so the
    "primary" ticker is deterministic across runs.
    """

    # Company ticker mapping (in production, use comprehensive database)
    DEFAULT_TICKER_MAP = {
        'apple': 'AAPL',
        'microsoft': 'MSFT',
        'amazon': 'AMZN',
        'google': 'GOOGL',
        'alphabet': 'GOOGL',
        'tesla': 'TSLA',
        'meta': 'META',
        'facebook': 'META',
        'nvidia': 'NVDA',
        'netflix': 'NFLX',
        'jpmorgan': 'JPM',
        'goldman sachs': 'GS',
        'morgan stanley': 'MS',
    }

    # spaCy entity label -> key in the dict returned by extract_entities
    _ENTITY_BUCKETS = {
        'ORG': 'companies',
        'PERSON': 'people',
        'GPE': 'locations',
        'MONEY': 'money',
        'DATE': 'dates',
    }

    # $TICKER or TICKER:EXCHANGE; word boundaries stop matches inside longer
    # upper-case runs (e.g. "TOOLONGX:US" must not yield "LONGX").
    _TICKER_PATTERN = re.compile(r'\$([A-Z]{1,5})\b|\b([A-Z]{1,5}):[A-Z]{2}\b')

    def __init__(self):
        # Load spaCy model
        self.nlp = spacy.load('en_core_web_sm')

        # Per-instance copy so callers can extend it without side effects
        self.ticker_map = dict(self.DEFAULT_TICKER_MAP)

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """
        Extract named entities from text

        Args:
            text: Input text

        Returns:
            Dict mapping entity type ('companies', 'people', 'locations',
            'money', 'dates') to list of entity strings
        """
        doc = self.nlp(text)

        entities = {bucket: [] for bucket in self._ENTITY_BUCKETS.values()}

        for ent in doc.ents:
            bucket = self._ENTITY_BUCKETS.get(ent.label_)
            if bucket is not None:
                entities[bucket].append(ent.text)

        return entities

    def map_to_tickers(self, companies: List[str]) -> List[str]:
        """
        Map company names to ticker symbols

        A name maps to a ticker when it equals a key of ``ticker_map``
        (case-insensitive), or when one contains the other as a whole word
        or phrase ("Apple Inc." -> AAPL, "Goldman" -> GS). Plain substring
        hits such as "Metallica" -> META are not accepted, and blank names
        are ignored.

        Args:
            companies: List of company names

        Returns:
            List of unique ticker symbols in first-seen order
        """
        tickers = []

        for company in companies:
            name = company.strip().lower()
            if not name:
                continue

            ticker = self.ticker_map.get(name) or self._fuzzy_ticker(name)
            if ticker:
                tickers.append(ticker)

        return list(dict.fromkeys(tickers))  # Remove duplicates, keep order

    def _fuzzy_ticker(self, name: str) -> Optional[str]:
        """Return the first ticker whose key shares a whole-word match with name."""
        for key, ticker in self.ticker_map.items():
            if self._has_phrase(name, key) or self._has_phrase(key, name):
                return ticker
        return None

    @staticmethod
    def _has_phrase(text: str, phrase: str) -> bool:
        """True when phrase occurs in text and is not part of a longer word."""
        pattern = r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'
        return re.search(pattern, text) is not None

    def extract_tickers_from_text(self, text: str) -> List[str]:
        """
        Extract ticker symbols directly from text

        Looks for patterns like $AAPL, AAPL:US, etc.

        Returns:
            List of unique ticker symbols in order of first appearance
        """
        tickers = []
        for dollar_form, exchange_form in self._TICKER_PATTERN.findall(text):
            ticker = dollar_form or exchange_form
            if ticker:
                tickers.append(ticker)

        return list(dict.fromkeys(tickers))

    def analyze_article(self, article: "NewsArticle") -> Dict:
        """
        Complete NER analysis of article

        Args:
            article: NewsArticle object

        Returns:
            Dict with 'entities', 'tickers' (tickers mapped from company
            names first, then explicit tickers found in the text) and
            'primary_ticker' (first of 'tickers', or None)
        """
        # Combine title and content
        full_text = f"{article.title}. {article.content}"

        entities = self.extract_entities(full_text)

        candidates = (
            self.map_to_tickers(entities['companies'])
            + self.extract_tickers_from_text(full_text)
        )
        all_tickers = list(dict.fromkeys(candidates))

        return {
            'entities': entities,
            'tickers': all_tickers,
            'primary_ticker': all_tickers[0] if all_tickers else None
        }
148
149
# Example usage
if __name__ == "__main__":
    ner = FinancialNER()

    # Build a sample article to run through the NER pipeline
    sample_content = (
        'Apple Inc. reported record quarterly revenue of $120 billion, '
        'driven by strong iPhone 15 sales in China and the United States. '
        'CEO Tim Cook praised the team for the achievement.'
    )
    article = NewsArticle(
        article_id='test123',
        title='Apple CEO Tim Cook announces record iPhone sales',
        content=sample_content,
        source='Reuters',
        published_at=datetime.now(),
        url='https://example.com/article',
    )

    # Run entity extraction and ticker mapping
    result = ner.analyze_article(article)
    found = result['entities']

    print("NER Analysis:")
    print("=" * 60)
    print(f"Companies: {found['companies']}")
    print(f"People: {found['people']}")
    print(f"Locations: {found['locations']}")
    print(f"Money: {found['money']}")
    print(f"Tickers: {result['tickers']}")
    print(f"Primary Ticker: {result['primary_ticker']}")
Not all news is equal. Focus on high-impact events:
Event Types:
1from typing import Optional
2import re
3
class EventExtractor:
    """
    Extract structured events from financial news

    Identifies:
    - Earnings events
    - M&A events
    - Regulatory events
    - Management changes

    Detection is keyword/regex based. Patterns are anchored on word
    boundaries so that unrelated words containing a keyword ("emerging",
    "playoff", "buyback", "disappoint ... CEO") do not trigger an event.
    """

    # Scale factors for the unit words recognised by extract_deal_value
    _UNIT_MULTIPLIER = {
        'million': 1e6,
        'billion': 1e9,
        'trillion': 1e12,
    }

    def __init__(self):
        # Event patterns (simplified - in production, use ML models).
        # All patterns are matched case-insensitively.
        self.patterns = {
            'earnings_beat': [
                r'\bbeat.*earnings',
                r'\bexceed.*expectations',
                r'\bbetter than expected',
                r'\bearnings surprise'
            ],
            'earnings_miss': [
                r'\bmiss(?:es|ed|ing)?\b.*earnings',
                r'\bbelow\b.*expectations',
                r'\bdisappoint',
                r'\bearnings shortfall'
            ],
            'acquisition': [
                r'\bacqui(?:re|sition)',
                r'\b(?:buy|buys|buying|bought|buyout)\b',
                r'\bpurchas(?:e|ing)',
                r'\btakeover'
            ],
            'merger': [
                r'\bmerge',
                r'\bcombination\b',
                r'\bjoin forces'
            ],
            'ceo_change': [
                r'\bCEO\b.*\bresign',
                r'\bCEO\b.*\bstep(?:s|ped|ping)? down',
                r'\bnew CEO\b',
                r'\bappoint.*\bCEO\b'
            ],
            'layoffs': [
                r'\blayoff|\blay off',
                r'\bjob cuts',
                r'\bworkforce reduction',
                r'\beliminate.*positions'
            ],
            'product_launch': [
                r'\blaunch',
                r'\bintroduce',
                r'\bunveil',
                r'\bannounce.*product'
            ],
            'fda_approval': [
                r'\bFDA\b.*\bapprov',
                r'\bregulatory approval',
                r'\bclearance\b.*\bFDA\b'
            ]
        }

    def extract_events(self, article: "NewsArticle") -> List[Dict]:
        """
        Extract events from article

        Args:
            article: NewsArticle object

        Returns:
            List of detected events, at most one per event type, each with
            'type', 'confidence', 'article_id' and 'timestamp'
        """
        full_text = f"{article.title}. {article.content}"

        return [
            {
                'type': event_type,
                'confidence': 0.8,  # Simplified - use ML for real confidence
                'article_id': article.article_id,
                'timestamp': article.published_at
            }
            for event_type, patterns in self.patterns.items()
            # any() stops at the first hit: one match per event type
            if any(re.search(p, full_text, re.IGNORECASE) for p in patterns)
        ]

    def extract_earnings_numbers(self, text: str) -> Optional[Dict]:
        """
        Extract actual earnings numbers from text

        Example: "EPS of $2.50 vs. $2.30 expected"

        Returns:
            Dict with 'metric', 'actual', 'expected', 'surprise' and
            'surprise_pct', or None when no EPS comparison is found
        """
        # A number must contain a digit: a bare "." (as in "vs.") is not one.
        number = r'(\d+(?:\.\d+)?)'
        # \bEPS\b keeps words such as "steps" from matching.
        eps_pattern = (
            r'\bEPS\b.*?\$?' + number +
            r'.*?(?:vs|versus|expected|expectations?).*?\$?' + number
        )

        match = re.search(eps_pattern, text, re.IGNORECASE)
        if not match:
            return None

        actual = float(match.group(1))
        expected = float(match.group(2))
        surprise = actual - expected

        return {
            'metric': 'EPS',
            'actual': actual,
            'expected': expected,
            'surprise': surprise,
            'surprise_pct': (surprise / expected * 100) if expected != 0 else 0
        }

    def extract_deal_value(self, text: str) -> Optional[float]:
        """
        Extract M&A deal value

        Example: "acquired for $10 billion"

        Returns:
            The first dollar amount followed by million/billion/trillion,
            converted to dollars, or None when there is none. Thousands
            separators ("$1,200 million") are accepted.
        """
        pattern = r'\$\s*(\d[\d,]*(?:\.\d+)?)\s*(billion|million|trillion)'

        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            return None

        value = float(match.group(1).replace(',', ''))
        unit = match.group(2).lower()

        return value * self._UNIT_MULTIPLIER[unit]
142
143
# Example usage
if __name__ == "__main__":
    extractor = EventExtractor()

    # Sample articles: (id, title, content, source), one per event family
    samples = [
        (
            '1',
            'Apple beats earnings expectations with strong iPhone sales',
            'Apple reported EPS of $2.50 vs. $2.30 expected, driven by iPhone 15 demand.',
            'Reuters',
        ),
        (
            '2',
            'Microsoft acquires gaming company for $69 billion',
            'Microsoft announced the acquisition of Activision Blizzard for $69 billion.',
            'Bloomberg',
        ),
        (
            '3',
            'Tesla CEO Elon Musk steps down',
            'Tesla announced that CEO Elon Musk will step down effective immediately.',
            'CNBC',
        ),
    ]
    articles = [
        NewsArticle(
            article_id=sample_id,
            title=title,
            content=content,
            source=source,
            published_at=datetime.now(),
            url=f'https://example.com/{sample_id}',
        )
        for sample_id, title, content, source in samples
    ]

    print("Event Extraction:")
    print("=" * 80)

    for article in articles:
        events = extractor.extract_events(article)
        event_types = [e['type'] for e in events]

        print(f"\nArticle: {article.title}")
        print(f"Events detected: {event_types}")

        # Report earnings figures when the text contains them
        earnings = extractor.extract_earnings_numbers(article.content)
        if earnings:
            print(f"Earnings: {earnings}")

        # Report the deal size when the text contains one
        deal_value = extractor.extract_deal_value(article.content)
        if deal_value:
            print(f"Deal value: ${deal_value:,.0f}")
class NewsSignalGenerator:
    """
    Generate trading signals from news analysis

    Combines:
    - Sentiment
    - Events
    - Entities

    NOTE(review): the combined strength adds a weighted sentiment score
    (unitless, up to +/- sentiment_weight) to per-event return estimates
    (fractions such as 0.02). It is reported as 'expected_return', but with
    the default weight the sentiment term dominates, so treat that number as
    a relative score rather than a calibrated return forecast.
    """

    def __init__(self,
                 sentiment_threshold: float = 0.3,
                 confidence_threshold: float = 0.7,
                 sentiment_weight: float = 0.5,
                 min_strength: float = 0.01):
        """
        Args:
            sentiment_threshold: Minimum absolute sentiment score for the
                sentiment to contribute to the signal
            confidence_threshold: Minimum sentiment confidence for the
                sentiment to contribute to the signal
            sentiment_weight: Multiplier applied to the sentiment score
                before it is added to the signal strength
            min_strength: Signals whose absolute strength is below this
                value are discarded
        """
        self.sentiment_threshold = sentiment_threshold
        self.confidence_threshold = confidence_threshold
        self.sentiment_weight = sentiment_weight
        self.min_strength = min_strength

        # Event impact mapping (simplified)
        self.event_impacts = {
            'earnings_beat': 0.02,   # +2% expected return
            'earnings_miss': -0.02,
            # Positive as for a target's premium. NOTE(review): the acquirer
            # often drops instead, and the signal does not know which side
            # the ticker is on - confirm the intended sign.
            'acquisition': 0.05,
            'merger': 0.03,
            'ceo_change': -0.01,     # Uncertainty
            'layoffs': -0.015,
            'product_launch': 0.01,
            'fda_approval': 0.10     # Biotech stocks
        }

    def generate_signal(self,
                        article: "NewsArticle",
                        sentiment: Dict,
                        entities: Dict,
                        events: List[Dict]) -> Optional[Dict]:
        """
        Generate trading signal from news analysis

        Args:
            article: NewsArticle object
            sentiment: Sentiment analysis result ('score', 'confidence')
            entities: NER result ('tickers')
            events: Event extraction result (dicts with 'type')

        Returns:
            Trading signal dict, or None when there is no ticker or the
            combined strength is below ``min_strength``
        """
        tickers = entities.get('tickers')
        if not tickers:
            return None

        ticker = tickers[0]  # Primary ticker

        signal_strength = 0.0
        reasons = []

        # 1. Sentiment component: only confident, clearly non-neutral scores
        score = sentiment['score']
        if (sentiment['confidence'] >= self.confidence_threshold
                and abs(score) > self.sentiment_threshold):
            signal_strength += score * self.sentiment_weight
            polarity = 'Positive' if score > 0 else 'Negative'
            reasons.append(f"{polarity} sentiment ({score:.2f})")

        # 2. Event component: unknown event types are ignored
        for event in events:
            event_type = event['type']
            impact = self.event_impacts.get(event_type)
            if impact is not None:
                signal_strength += impact
                reasons.append(f"{event_type} (impact: {impact:+.2%})")

        # 3. Generate signal
        if abs(signal_strength) < self.min_strength:
            return None  # Signal too weak

        return {
            'ticker': ticker,
            'direction': 'BUY' if signal_strength > 0 else 'SELL',
            'strength': abs(signal_strength),
            'expected_return': signal_strength,
            'reasons': reasons,
            'article_id': article.article_id,
            'timestamp': article.published_at,
            'confidence': sentiment['confidence']
        }
90
91
# Example: End-to-end pipeline
if __name__ == "__main__":
    # Build each stage of the pipeline
    sentiment_analyzer = FinancialSentimentAnalyzer()
    ner = FinancialNER()
    event_extractor = EventExtractor()
    signal_generator = NewsSignalGenerator()

    # Sample article to push through the pipeline
    sample_content = (
        'Apple Inc. reported quarterly EPS of $2.50, beating analyst expectations '
        'of $2.30. Revenue came in at $120 billion, up 15% year-over-year. '
        'CEO Tim Cook attributed the strong performance to iPhone 15 demand.'
    )
    article = NewsArticle(
        article_id='test456',
        title='Apple beats earnings expectations with strong iPhone sales',
        content=sample_content,
        source='Reuters',
        published_at=datetime.now(),
        url='https://example.com/article',
    )

    # Run the three analyses on the article
    full_text = f"{article.title}. {article.content}"
    sentiment = sentiment_analyzer.analyze(full_text)
    entities = ner.analyze_article(article)
    events = event_extractor.extract_events(article)

    # Combine them into a trading signal
    signal = signal_generator.generate_signal(article, sentiment, entities, events)

    print("News Trading Signal:")
    print("=" * 80)
    print(f"Article: {article.title}")
    print(f"\nSentiment: {sentiment['label']} (score: {sentiment['score']:.3f})")
    print(f"Tickers: {entities['tickers']}")
    print(f"Events: {[e['type'] for e in events]}")

    if not signal:
        print("\nNo signal generated (insufficient strength)")
    else:
        print(f"\nTRADING SIGNAL:")
        print(f"Ticker: {signal['ticker']}")
        print(f"Direction: {signal['direction']}")
        print(f"Strength: {signal['strength']:.2%}")
        print(f"Expected Return: {signal['expected_return']:+.2%}")
        print(f"Reasons: {', '.join(signal['reasons'])}")
News trading with NLP/LLMs is powerful but challenging:
Next Steps:
About the Author: This article is part of NordVarg's series on production-grade algorithmic trading. For related content, see our articles on machine learning for trading and market microstructure.
Technical Writer
The NordVarg Team is a group of software engineers at NordVarg specializing in high-performance financial systems and type-safe programming.
Get weekly insights on building high-performance financial systems, latest industry trends, and expert tips delivered straight to your inbox.