import re
from collections import Counter

import gradio as gr
import requests
import shap
import torch
import torch.nn.functional as F
from bs4 import BeautifulSoup
from deep_translator import DeeplTranslator
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer,
                          TextClassificationPipeline)
api_key_deepl = "69f73328-5f95-4eda-813a-16af8c688404:fx"  # DeepL API key (ideally loaded from an environment variable)

# English classification model
model = AutoModelForSequenceClassification.from_pretrained("OsBaran/Roberta-Classification-Model")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # check which device is available
model.to(device)  # keep the model on the same device as the inputs
def predict_with_roberta(model, tokenizer, input_text):
    # Tokenize the input text and convert it to tensors
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

    # Run the model without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Take the logits and pick the most likely class
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=-1).item()  # 0: fake, 1: real
    return prediction
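
# Usage sketch (hypothetical input; the 0 = fake / 1 = real mapping is assumed
# from how the rest of this file interprets the classes):
# pred = predict_with_roberta(model, tokenizer, "NASA confirms water on the Moon.")
# print(pred)  # -> 0 or 1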
def explain_roberta_prediction(model, tokenizer, input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

    # Run the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert the logits to class probabilities
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    # Predicted class and its probability
    predicted_class = torch.argmax(logits, dim=-1).item()
    result = "Doğru" if predicted_class == 1 else "Yanlış"
    explanation = f"Modelin tahmini: {result} (Olasılık: {probabilities[predicted_class]:.2f})\n"

    # Extract "important" words (placeholder: simply the first 10 tokens)
    tokenized_input = tokenizer.tokenize(tokenizer.decode(inputs['input_ids'][0]))
    important_tokens = tokenized_input[:10]
    explanation += "Modelin kararı aşağıdaki anahtar kelimelere dayanıyor:\n" + ', '.join(important_tokens)

    return explanation

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=device)

def score_and_visualize(text):
    prediction = pipe([text])
    print(prediction[0])

    # SHAP explanation over the classification pipeline
    explainer = shap.Explainer(pipe)
    shap_values = explainer([text])
    shap.plots.text(shap_values)
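
# Note: shap.plots.text renders inline in notebooks. In a plain script the
# visualization can be captured as HTML instead (assuming a shap version whose
# text plot supports display=False):
# html = shap.plots.text(shap_values, display=False)
# with open("shap_explanation.html", "w") as f:
#     f.write(html)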

api_key = '764e3b45715b449a8aedb8cd8018dfed'  # NewsAPI key (ideally loaded from an environment variable)
def fetch_news_from_api(api_key, query, page_size=100):
    url = f'https://newsapi.org/v2/everything?q={query}&pageSize={page_size}&apiKey={api_key}'
    response = requests.get(url)

    # Check the API response
    if response.status_code == 200:
        articles = response.json().get('articles', [])
        return articles
    else:
        print(f"Error: {response.status_code}")
        return []
def extract_keywords(text, top_n=5):
    # 1. Clean the text: strip punctuation and lowercase
    text = re.sub(r'[^\w\s]', '', text.lower())

    # 2. Tokenize
    words = text.split()

    # 3. Remove stop words
    keywords = [word for word in words if word not in ENGLISH_STOP_WORDS]

    # 4. Count the keywords and keep the most frequent ones
    keyword_counts = Counter(keywords)
    most_common_keywords = keyword_counts.most_common(top_n)

    return [keyword for keyword, _ in most_common_keywords]



kw_model = KeyBERT('all-mpnet-base-v2')  # load KeyBERT on top of an SBERT model

def extract_keywords_keybert(text, num_keywords=2):
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=num_keywords)
    return [kw[0] for kw in keywords]
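
# Usage sketch (output is illustrative; the actual keyphrases depend on the model).
# The returned phrases are joined into the NewsAPI search query below.
# extract_keywords_keybert("NASA confirms water on the sunlit surface of the Moon")
# -> e.g. ['nasa water', 'moon surface']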


def filter_trusted_sources(articles, trusted_sources):
    trusted_articles = []
    for article in articles:
        source_name = article['source']['name'].lower()  # lowercase the source name
        if any(trusted_source in source_name for trusted_source in trusted_sources):
            trusted_articles.append(article)
    return trusted_articles

def fetch_news_content(link):
    response = requests.get(link)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Extract the title and body text
        title = soup.find('h1').get_text() if soup.find('h1') else "Başlık bulunamadı"
        content = ' '.join([p.get_text() for p in soup.find_all('p')])
        return title, content
    else:
        print(f"Error fetching content: {response.status_code}")
        return "", ""
def compare_with_trusted(input_text, bbc_articles):
    # TF-IDF vectorize the input together with the trusted articles, then
    # compute the cosine similarity of the input against each article body
    texts = [input_text] + [article[1] for article in bbc_articles]
    vectorizer = TfidfVectorizer().fit_transform(texts)
    vectors = vectorizer.toarray()
    similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()
    return similarities
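
# Usage sketch: returns one cosine-similarity score per trusted article, e.g.
# compare_with_trusted("claim text", [("title1", "body1"), ("title2", "body2")])
# -> array([0.12, 0.85])  (values are illustrative)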

def sbert_similarity(input_text, bbc_articles):
    # Load the SBERT model
    sbert_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

    # Embed the user text and the articles fetched from the web
    input_embedding = sbert_model.encode(input_text, convert_to_tensor=True)
    news_embeddings = sbert_model.encode([news[1] for news in bbc_articles], convert_to_tensor=True)

    # Compute the cosine similarities
    cosine_scores = util.pytorch_cos_sim(input_embedding, news_embeddings)

    # Highest similarity score and the corresponding article
    max_score, most_similar_news = cosine_scores.max(), bbc_articles[cosine_scores.argmax().item()]
    print(f"En benzer haber skoru: {max_score:.2f}")

def translate_text(text, source_lang='tr', target_lang='en'):
    translated = DeeplTranslator(api_key=api_key_deepl, source=source_lang, target=target_lang).translate(text)
    return translated
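
# Usage sketch (requires a valid DeepL key; the output is illustrative):
# translate_text("Ay yüzeyinde su bulundu")  # -> "Water was found on the lunar surface"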
# Load the Turkish model
# model_tr_name = "dbmdz/bert-base-turkish-cased"  # put your Turkish model here
# model_tr = AutoModelForSequenceClassification.from_pretrained(model_tr_name)
# tokenizer_tr = AutoTokenizer.from_pretrained(model_tr_name)
# classifier_tr = pipeline("sentiment-analysis", model=model_tr, tokenizer=tokenizer_tr)

tokenizer_tr = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
model_tr = AutoModelForSequenceClassification.from_pretrained("OsBaran/Bert-Classification-Model-Tr-3", num_labels=2)
model_tr.to(device)  # keep the Turkish model on the same device as the inputs
def trModelPredictAlgo(input_news):
    # Use the Turkish tokenizer and model (not the English ones)
    inputs = tokenizer_tr(input_news, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Run the model
    with torch.no_grad():
        outputs = model_tr(**inputs)
        logits = outputs.logits

    # Apply softmax to get class probabilities
    probabilities = F.softmax(logits, dim=-1)

    # Find the most likely class and its probability
    predicted_class = torch.argmax(probabilities, dim=-1)
    predicted_probability = probabilities[0, predicted_class].item()
    sonuc = "Yanlış" if predicted_class.item() == 0 else "Doğru"

    # Print the result
    print(f"Predicted class: {predicted_class.item()}")
    print(f"Prediction probability: {predicted_probability * 100:.2f}%")
    return f"Dogruluk tahmini: {sonuc}, Tahmin olasılığı: {predicted_probability * 100:.2f}%"
def enModelPredictAlgo(input_news):
    keywords = extract_keywords_keybert(input_news)
    search_query = ' '.join(keywords)
    news_articles = fetch_news_from_api(api_key, search_query)

    trusted_sources = [
        "bbc news",
        "cnn",
        "reuters.com",
        "theguardian.com",
        "time",
        # other trusted sources...
    ]

    trusted_articles = filter_trusted_sources(news_articles, trusted_sources)
    trusted_articles_urls = [article["url"] for article in trusted_articles]

    if trusted_articles:
        print("\nGüvenilir kaynaklardan bulunan haberler:\n")
        print(trusted_articles_urls)
        bbc_articles = [fetch_news_content(link) for link in trusted_articles_urls]
        similarities = compare_with_trusted(input_news, bbc_articles)
        sbert_similarity(input_news, bbc_articles)
        print(similarities)
        max_similarity = max(similarities)
        threshold = 0.8
        if max_similarity > threshold:
            # A sufficiently similar trusted article was found
            result = f"Sonuç: Doğru (Benzerlik: {max_similarity:.2f})"
            print(result)
            return result
        else:
            # No close match: fall back to the classifier and return its explanation
            prediction = predict_with_roberta(model, tokenizer, input_news)
            explanation = explain_roberta_prediction(model, tokenizer, input_news)
            print(explanation)
            return explanation
    else:
        print("Güvenilir kaynaklardan hiç haber bulunamadı.")
        prediction = predict_with_roberta(model, tokenizer, input_news)
        explanation = explain_roberta_prediction(model, tokenizer, input_news)
        # Print the prediction result
        result = "Doğru" if prediction == 1 else "Yanlış"
        print(f"Haberin durumu: {result}")
        print("Haberin açıklaması:")
        print(explanation)
        return explanation
# Build the API with Gradio
def predict(input_news, language):
    if language == "en":
        result = enModelPredictAlgo(input_news=input_news)
        return {"Sonuç": result}
    elif language == "tr":
        # Translate to English, run both models, and return both results
        input_news_en = translate_text(input_news)
        result1 = enModelPredictAlgo(input_news_en)
        result2 = trModelPredictAlgo(input_news=input_news)
        return {"İngilizce Model Sonucu": result1, "Türkçe Model Sonucu": result2}
    else:
        return {"error": "Unsupported language"}

# Interface
gr.Interface(fn=predict,
             inputs=[gr.Textbox(label="Text Input"), gr.Dropdown(["en", "tr"], label="Language")],
             outputs="json").launch()