OsBaran committed
Commit 198afc2 · 1 parent: 409072a

Add application

Files changed (1)
  1. app.py +188 -5
app.py CHANGED
@@ -2,10 +2,141 @@ import gradio as gr
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 
 # Write your English model here
-model_en = AutoModelForSequenceClassification.from_pretrained("OsBaran/Roberta-Classification-Model")
-tokenizer_en = AutoTokenizer.from_pretrained("roberta-base")
-classifier_en = pipeline("sentiment-analysis", model=model_en, tokenizer=tokenizer_en)
+model = AutoModelForSequenceClassification.from_pretrained("OsBaran/Roberta-Classification-Model")
+tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+
+# torch and an explicit device are required by the .to(device) calls below
+import torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+
+def predict_with_roberta(model, tokenizer, input_text):
+    # Tokenize the input text and convert it to tensors
+    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
+
+    # Run the prediction with the model
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Take the logits and derive the prediction
+    logits = outputs.logits
+    prediction = torch.argmax(logits, dim=-1).item()  # 0: false, 1: true
+    return prediction
+
+def explain_roberta_prediction(model, tokenizer, input_text):
+    # Tokenize
+    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
+
+    # Run the prediction with the model
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Take the logits
+    logits = outputs.logits
+    probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]
+
+    # Get the prediction result and the probabilities
+    predicted_class = torch.argmax(logits, dim=-1).item()
+    result = "True" if predicted_class == 1 else "False"
+    explanation = f"Model prediction: {result} (probability: {probabilities[predicted_class]:.2f})\n"
+
+    # Extract the important words (as an example)
+    tokenized_input = tokenizer.tokenize(tokenizer.decode(inputs['input_ids'][0]))
+    important_tokens = tokenized_input[:10]  # take the first 10 tokens
+    explanation += "The model's decision is based on the following key words:\n" + ', '.join(important_tokens)
+
+    return explanation
+
+import shap
+from transformers import (AutoTokenizer,
+                          AutoModelForSequenceClassification,
+                          TextClassificationPipeline)
+
+pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=device)
+
+def score_and_visualize(text):
+    prediction = pipe([text])
+    print(prediction[0])
+
+    explainer = shap.Explainer(pipe)
+    shap_values = explainer([text])
+    shap.plots.text(shap_values)
+
+import requests
+import re
+from collections import Counter
+from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
+from bs4 import BeautifulSoup
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+api_key = '764e3b45715b449a8aedb8cd8018dfed'
+
+def fetch_news_from_api(api_key, query, page_size=100):
+    url = f'https://newsapi.org/v2/everything?q={query}&pageSize={page_size}&apiKey={api_key}'
+    response = requests.get(url)
+
+    # Check the API response
+    if response.status_code == 200:
+        articles = response.json().get('articles', [])
+        return articles
+    else:
+        print(f"Error: {response}")
+        return []
+
+def extract_keywords(text, top_n=5):
+    # 1. Clean the text
+    text = re.sub(r'[^\w\s]', '', text.lower())  # remove punctuation and convert to lowercase
+
+    # 2. Tokenization
+    words = text.split()
+
+    # 3. Remove stop words
+    keywords = [word for word in words if word not in ENGLISH_STOP_WORDS]
+
+    # 4. Count the keywords and take the most frequent ones
+    keyword_counts = Counter(keywords)
+    most_common_keywords = keyword_counts.most_common(top_n)
+
+    return [keyword for keyword, _ in most_common_keywords]
+
+from keybert import KeyBERT
+
+kw_model = KeyBERT('all-mpnet-base-v2')  # load the model using SBERT
+
+def extract_keywords_keybert(text, num_keywords=2):
+    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=num_keywords)
+    return [kw[0] for kw in keywords]
+
+def filter_trusted_sources(articles, trusted_sources):
+    trusted_articles = []
+    for article in articles:
+        source_name = article['source']['name'].lower()  # convert the source name to lowercase
+        if any(trusted_source in source_name for trusted_source in trusted_sources):
+            trusted_articles.append(article)
+    return trusted_articles
+
+def fetch_news_content(link):
+    response = requests.get(link)
+    if response.status_code == 200:
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Extract the title and content
+        title = soup.find('h1').get_text() if soup.find('h1') else "Title not found"
+        content = ' '.join([p.get_text() for p in soup.find_all('p')])
+        return title, content
+    else:
+        print(f"Error fetching content: {response.status_code}")
+        return "", ""
+
+def compare_with_thrusted(input_text, bbc_articles):
+    texts = [input_text] + [article[1] for article in bbc_articles]
+    vectorizer = TfidfVectorizer().fit_transform(texts)
+    vectors = vectorizer.toarray()
+    similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()
+    return similarities
+
+from sentence_transformers import SentenceTransformer, util
+
+def sbert_similarity(input_text, bbc_articles):
+    # Load the SBERT model
+    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+
+    # Vectorize the user text and the texts fetched from the web
+    input_embedding = model.encode(input_text, convert_to_tensor=True)
+    news_embeddings = model.encode([news[1] for news in bbc_articles], convert_to_tensor=True)
+
+    # Compute the similarities
+    cosine_scores = util.pytorch_cos_sim(input_embedding, news_embeddings)
+
+    # Highest similarity score and the corresponding article
+    max_score, most_similar_news = cosine_scores.max(), bbc_articles[cosine_scores.argmax().item()]
+    print(f"Most similar article score: {max_score:.2f}")
 # Load the Turkish model
 model_tr_name = "dbmdz/bert-base-turkish-cased"  # Write your Turkish model here
 model_tr = AutoModelForSequenceClassification.from_pretrained(model_tr_name)
@@ -13,9 +144,61 @@ tokenizer_tr = AutoTokenizer.from_pretrained(model_tr_name)
 classifier_tr = pipeline("sentiment-analysis", model=model_tr, tokenizer=tokenizer_tr)
 
 # Create the API with Gradio
-def predict(text, language):
+def predict(input_news, language):
     if language == "en":
-        result = classifier_en(text)
+        keywords = extract_keywords_keybert(input_news)
+        search_query = ' '.join(keywords)
+        news_articles = fetch_news_from_api(api_key, search_query)
+
+        trusted_sources = [
+            "bbc news",
+            "cnn",
+            "reuters.com",
+            "theguardian.com",
+            "time",
+            # Other trusted sources...
+        ]
+
+        trusted_articles = filter_trusted_sources(news_articles, trusted_sources)
+        # Collect the URLs of the results
+        trusted_articles_urls = []
+        for i in trusted_articles:
+            trusted_articles_urls.append(i["url"])
+
+        if trusted_articles:
+            print("\nArticles found from trusted sources:\n")
+            print(trusted_articles_urls)
+            bbc_articles = [fetch_news_content(link) for link in trusted_articles_urls]
+            similarities = compare_with_thrusted(input_news, bbc_articles)
+            sbert_similarity(input_news, bbc_articles)
+            print(similarities)
+            max_similarity = max(similarities)
+            threshold = 0.8
+            if max_similarity > threshold:
+                print(f"Result: True (similarity: {max_similarity:.2f})")
+                # Return the verdict so the Gradio caller receives a response
+                return f"Result: True (similarity: {max_similarity:.2f})"
+            else:
+                # If no similarity is found, use the prediction algorithm and provide an explanation
+                prediction = predict_with_roberta(model, tokenizer, input_news)
+                explanation = explain_roberta_prediction(model, tokenizer, input_news)
+                # Print the prediction result
+                # result = "True" if prediction == 1 else "False"
+                # print(f"Status of the article: {result}")
+                return explanation
+
+        else:
+            print("No articles were found from trusted sources.")
+            prediction = predict_with_roberta(model, tokenizer, input_news)
+            explanation = explain_roberta_prediction(model, tokenizer, input_news)
+            # Print the prediction result
+            result = "True" if prediction == 1 else "False"
+            print(f"Status of the article: {result}")
+            print("Explanation of the article:")
+            print(explanation)
+            return explanation
     elif language == "tr":
         result = classifier_tr(text)
     else:
 