|
import torch |
|
import numpy as np |
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification |
|
from scipy.stats import zscore |
|
|
|
class SentimentAnalyzer:
    """Aggregate sentiment probabilities over a list of news items.

    Two pretrained sequence-classification models and their tokenizers are
    loaded on construction and exposed through ``self.models`` and
    ``self.tokenizers``.  ``analyze`` currently scores text with the
    ``'financial_sentiment'`` pair only; the ``'finbert'`` pair is loaded but
    not used by any method of this class.
    """

    # Hugging Face model ids, keyed by the short names used in
    # ``self.models`` and ``self.tokenizers``.
    MODEL_IDS = {
        'finbert': "yiyanghkust/finbert-tone",
        'financial_sentiment': "ahmedrachid/FinancialBERT-Sentiment-Analysis",
    }

    def __init__(self):
        """Load every model/tokenizer pair listed in ``MODEL_IDS``."""
        self.models = {
            name: AutoModelForSequenceClassification.from_pretrained(model_id)
            for name, model_id in self.MODEL_IDS.items()
        }
        self.tokenizers = {
            name: AutoTokenizer.from_pretrained(model_id)
            for name, model_id in self.MODEL_IDS.items()
        }
        # Longest sequence, special tokens included, passed to a model.
        self.max_length = 512

    def chunk_text(self, text, tokenizer):
        """Split ``text`` into consecutive windows of token ids.

        The ids are produced without special tokens, and each window leaves
        room for the special tokens the tokenizer adds around a single
        sequence, so a window still fits in ``self.max_length`` once it is
        wrapped for the model.

        Args:
            text: The string to tokenize.
            tokenizer: A Hugging Face tokenizer.

        Returns:
            A list of lists of token ids; empty when ``text`` yields no tokens.

        Raises:
            ValueError: If ``self.max_length`` leaves no room for content.
        """
        window = self.max_length - tokenizer.num_special_tokens_to_add(pair=False)
        if window < 1:
            raise ValueError(
                f"max_length={self.max_length} leaves no room for content tokens"
            )
        tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
        return [tokens[i:i + window] for i in range(0, len(tokens), window)]

    def preprocess_text(self, item):
        """Join an item's ``'title'`` and ``'content'`` into one string.

        Missing, ``None`` and whitespace-only fields are ignored; other values
        are converted with ``str`` and stripped.

        Args:
            item: A mapping that may hold ``'title'`` and ``'content'``.

        Returns:
            The non-empty fields joined by one space, or ``None`` when both
            are empty.
        """
        parts = []
        for field in ('title', 'content'):
            value = item.get(field)
            if value is None:
                # An explicit None must not become the literal text "None".
                continue
            value = str(value).strip()
            if value:
                parts.append(value)
        return " ".join(parts) or None

    def analyze(self, news):
        """Return the average sentiment probabilities for ``news``.

        Non-dict entries and entries without usable text are skipped.  Each
        remaining item is scored chunk by chunk and its chunk probabilities
        are averaged.  Items whose scores are outliers relative to the other
        items are then dropped before the final average.

        Args:
            news: An iterable of dicts with optional ``'title'`` and
                ``'content'`` entries.

        Returns:
            A dict with float values under ``'negative'``, ``'neutral'`` and
            ``'positive'``.  When nothing could be scored, every value is
            ``0.33``.
        """
        if not news:
            return self._neutral_sentiment()

        tokenizer = self.tokenizers['financial_sentiment']
        model = self.models['financial_sentiment']

        sentiment_scores = []
        for item in news:
            if not isinstance(item, dict):
                continue
            text = self.preprocess_text(item)
            if not text:
                continue
            chunk_scores = self._score_chunks(text, tokenizer, model)
            if chunk_scores:
                sentiment_scores.append(np.mean(chunk_scores, axis=0))

        if not sentiment_scores:
            return self._neutral_sentiment()

        avg_sentiment = self._mean_without_outliers(sentiment_scores)
        # NOTE(review): assumes the model's output order is
        # negative / neutral / positive -- confirm against the model config.
        return {
            'negative': float(avg_sentiment[0]),
            'neutral': float(avg_sentiment[1]),
            'positive': float(avg_sentiment[2]),
        }

    @staticmethod
    def _neutral_sentiment():
        """Return a fresh copy of the fallback result used when nothing is scored."""
        return {'negative': 0.33, 'neutral': 0.33, 'positive': 0.33}

    def _score_chunks(self, text, tokenizer, model):
        """Return one class-probability vector per chunk of ``text``."""
        scores = []
        # Inference only: skip building an autograd graph for every chunk.
        with torch.no_grad():
            for chunk in self.chunk_text(text, tokenizer):
                chunk_str = tokenizer.decode(chunk, skip_special_tokens=True)
                inputs = tokenizer(
                    chunk_str,
                    return_tensors="pt",
                    truncation=True,
                    max_length=self.max_length,
                )
                logits = model(**inputs).logits
                probabilities = torch.nn.functional.softmax(logits, dim=-1)
                scores.append(probabilities[0].cpu().numpy())
        return scores

    @staticmethod
    def _mean_without_outliers(scores, threshold=2.0):
        """Average per-item score vectors, ignoring outlying items.

        Z-scores are computed per class across items (not within a single
        item's vector).  An item is dropped when any of its class
        probabilities lies ``threshold`` or more standard deviations from the
        mean of that class.  With five or fewer items no z-score can reach
        2, so nothing is dropped at the default threshold.
        """
        stacked = np.asarray(scores, dtype=float)
        spread = stacked.std(axis=0)
        # A class with zero spread has zero deviations everywhere; dividing
        # by 1 keeps those z-scores at 0 instead of producing NaN.
        spread[spread == 0] = 1.0
        z_scores = np.abs((stacked - stacked.mean(axis=0)) / spread)
        keep = z_scores.max(axis=1) < threshold
        kept = stacked[keep] if keep.any() else stacked
        return kept.mean(axis=0)
|
|