Add application
app.py CHANGED
@@ -2,10 +2,141 @@ import gradio as gr
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 
 # Put your English model here
-
-
-classifier_en = pipeline("sentiment-analysis", model=model_en, tokenizer=tokenizer_en)
+model = AutoModelForSequenceClassification.from_pretrained("OsBaran/Roberta-Classification-Model")
+tokenizer = AutoTokenizer.from_pretrained("roberta-base")
 
+def predict_with_roberta(model, tokenizer, input_text):
+    # Tokenize the input text and convert it to tensors
+    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
+
+    # Make a prediction with the model
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Get the logits and make the prediction
+    logits = outputs.logits
+    prediction = torch.argmax(logits, dim=-1).item()  # 0: false, 1: true
+    return prediction
+
+def explain_roberta_prediction(model, tokenizer, input_text):
+    # Tokenize
+
+    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
+    # Make a prediction with the model
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Get the logits
+    logits = outputs.logits
+    probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]
+
+    # Get the prediction result and the probabilities
+    predicted_class = torch.argmax(logits, dim=-1).item()
+    result = "Doğru" if predicted_class == 1 else "Yanlış"
+    explanation = f"Modelin tahmini: {result} (Olasılık: {probabilities[predicted_class]:.2f})\n"
+
+    # Extract the important words (as an example)
+    tokenized_input = tokenizer.tokenize(tokenizer.decode(inputs['input_ids'][0]))
+    important_tokens = tokenized_input[:10]  # Take the first 10 tokens
+    explanation += "Modelin kararı aşağıdaki anahtar kelimelere dayanıyor:\n" + ', '.join(important_tokens)
+
+    return explanation
+import shap
+from transformers import (AutoTokenizer,
+                          AutoModelForSequenceClassification,
+                          TextClassificationPipeline)
+pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=device)
+def score_and_visualize(text):
+    prediction = pipe([text])
+    print(prediction[0])
+
+    explainer = shap.Explainer(pipe)
+    shap_values = explainer([text])
+    shap.plots.text(shap_values)
+import requests
+import re
+from collections import Counter
+from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
+from bs4 import BeautifulSoup
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+api_key = '764e3b45715b449a8aedb8cd8018dfed'
+def fetch_news_from_api(api_key, query, page_size=100):
+    url = f'https://newsapi.org/v2/everything?q={query}&pageSize={page_size}&apiKey={api_key}'
+    response = requests.get(url)
+
+    # Check the API response
+    if response.status_code == 200:
+        articles = response.json().get('articles', [])
+        return articles
+    else:
+        print(f"Error: {response}")
+        return []
+def extract_keywords(text, top_n=5):
+    # 1. Clean the text
+    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation and convert to lowercase
+
+    # 2. Tokenization
+    words = text.split()
+
+    # 3. Remove stop words
+    keywords = [word for word in words if word not in ENGLISH_STOP_WORDS]
+
+    # 4. Count the keywords and take the most frequent ones
+    keyword_counts = Counter(keywords)
+    most_common_keywords = keyword_counts.most_common(top_n)
+
+    return [keyword for keyword, _ in most_common_keywords]
+
+from keybert import KeyBERT
+
+kw_model = KeyBERT('all-mpnet-base-v2')  # Load the model using SBERT
+
+def extract_keywords_keybert(text, num_keywords=2):
+    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=num_keywords)
+    return [kw[0] for kw in keywords]
+
+
+def filter_trusted_sources(articles, trusted_sources):
+    trusted_articles = []
+    for article in articles:
+        source_name = article['source']['name'].lower()  # Convert the source name to lowercase
+        if any(trusted_source in source_name for trusted_source in trusted_sources):
+            trusted_articles.append(article)
+    return trusted_articles
+
+def fetch_news_content(link):
+    response = requests.get(link)
+    if response.status_code == 200:
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Extract the title and the content
+        title = soup.find('h1').get_text() if soup.find('h1') else "Başlık bulunamadı"
+        content = ' '.join([p.get_text() for p in soup.find_all('p')])
+        return title, content
+    else:
+        print(f"Error fetching content: {response.status_code}")
+        return "", ""
+def compare_with_thrusted(input_text, bbc_articles):
+    texts = [input_text] + [article[1] for article in bbc_articles]
+    vectorizer = TfidfVectorizer().fit_transform(texts)
+    vectors = vectorizer.toarray()
+    similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()
+    return similarities
+
+from sentence_transformers import SentenceTransformer, util
+def sbert_similarity(input_text, bbc_articles):
+    # Load the SBERT model
+    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+
+    # Vectorize the user text and the texts fetched from the web
+    input_embedding = model.encode(input_text, convert_to_tensor=True)
+    news_embeddings = model.encode([news[1] for news in bbc_articles], convert_to_tensor=True)
+
+    # Compute the similarities
+    cosine_scores = util.pytorch_cos_sim(input_embedding, news_embeddings)
+
+    # The highest similarity score and the corresponding article
+    max_score, most_similar_news = cosine_scores.max(), bbc_articles[cosine_scores.argmax().item()]
+    print(f"En benzer haber skoru: {max_score:.2f}")
 # Load the Turkish model
 model_tr_name = "dbmdz/bert-base-turkish-cased"  # Put your Turkish model here
 model_tr = AutoModelForSequenceClassification.from_pretrained(model_tr_name)
@@ -13,9 +144,61 @@ tokenizer_tr = AutoTokenizer.from_pretrained(model_tr_name)
 classifier_tr = pipeline("sentiment-analysis", model=model_tr, tokenizer=tokenizer_tr)
 
 # Create the API with Gradio
-def predict(
+def predict(input_news, language):
     if language == "en":
-
+        keywords = extract_keywords_keybert(input_news)
+        search_query = ' '.join(keywords)
+        news_articles = fetch_news_from_api(api_key, search_query)
+
+        trusted_sources = [
+            "bbc news",
+            "cnn",
+            "reuters.com",
+            "theguardian.com",
+            "time",
+            # Other trusted sources...
+        ]
+
+        trusted_articles = filter_trusted_sources(news_articles, trusted_sources)
+        # # Print the results
+        trusted_articles_urls = []
+        for i in trusted_articles:
+            trusted_articles_urls.append(i["url"])
+
+        if trusted_articles:
+            print(f"\nGüvenilir kaynaklardan bulunan haberler:\n")
+            print(trusted_articles_urls)
+            bbc_articles = [fetch_news_content(link) for link in trusted_articles_urls]
+            similarities = compare_with_thrusted(input_news, bbc_articles)
+            sbert_similarity(input_news, bbc_articles)
+            print(similarities)
+            max_similarity = max(similarities)
+            threshold = 0.8
+            if max_similarity > threshold:
+                print(f"Sonuç: Doğru (Benzerlik: {max_similarity:.2f})")
+            else:
+                # If no similarity is found, use the prediction algorithm and provide an explanation
+                prediction = predict_with_roberta(model, tokenizer, input_news)
+                explanation = explain_roberta_prediction(model, tokenizer, input_news)
+                # Print the prediction result
+                # result = "Doğru" if prediction == 1 else "Yanlış"
+                # print(f"Haberin durumu: {result}")
+                return explanation
+                print(explanation)
+
+
+
+
+        else:
+            print("Güvenilir kaynaklardan hiç haber bulunamadı.")
+            prediction = predict_with_roberta(model, tokenizer, input_news)
+            explanation = explain_roberta_prediction(model, tokenizer, input_news)
+            # Print the prediction result
+            result = "Doğru" if prediction == 1 else "Yanlış"
+            print(f"Haberin durumu: {result}")
+            print("Haberin açıklaması:")
+            print(explanation)
+            return explanation
     elif language == "tr":
        result = classifier_tr(text)
    else:
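
Note: the hunks above never import torch, never define the device that predict_with_roberta, explain_roberta_prediction, and the TextClassificationPipeline all reference, and never show the interface wiring implied by the "import gradio as gr" at line 1 and the "# Create the API with Gradio" comment. Below is a minimal sketch of that missing glue, assuming predict(input_news, language) returns a text explanation; the widget labels, the CPU/GPU selection, and the demo name are illustrative assumptions, not part of the committed diff.

# Hypothetical glue for app.py (not shown in the diff): torch/device setup and Gradio wiring.
import torch
import gradio as gr

# `device` is used throughout the added code but is never defined in the visible hunks;
# something like this would need to appear near the top of app.py, before the model,
# pipe, and prediction functions are used.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)  # `model` is the RoBERTa classifier loaded earlier in app.py

# Assumed interface: a news text box plus a language selector, returning the explanation string.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=8, label="News text"),
        gr.Radio(choices=["en", "tr"], value="en", label="Language"),
    ],
    outputs=gr.Textbox(label="Explanation"),
    title="Fake news check",
)

if __name__ == "__main__":
    demo.launch()

As written in the diff, only the "en" branch of predict returns a value (and the "tr" branch still references an undefined name, text, from the old signature), so this wiring would only produce useful output for English input unless those branches are finished.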