# wide_analysis_space/model_predict.py
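"""Prediction utilities for the wide_analysis space.

Wraps Hugging Face text-classification pipelines for five tasks on Wikimedia
deletion discussions: outcome prediction (multi-platform and multi-lingual),
sentence-level stance detection, policy prediction, sentiment analysis, and
offensive-language detection. Inputs are truncated to 512 tokens where needed,
and multi-sentence inputs are segmented with pysbd.
"""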
from transformers import pipeline, AutoTokenizer
import pysbd
import torch
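# Human-readable outcome classes mapped to the LABEL_<i> ids emitted by each
# fine-tuned classifier, one mapping per platform / language variant.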
label_mapping_wikipedia_en = {
'delete': [0, 'LABEL_0'],
'keep': [1, 'LABEL_1'],
'merge': [2, 'LABEL_2'],
'no consensus': [3, 'LABEL_3'],
'speedy keep': [4, 'LABEL_4'],
'speedy delete': [5, 'LABEL_5'],
'redirect': [6, 'LABEL_6'],
'withdrawn': [7, 'LABEL_7']
}
label_mapping_es = {
'Borrar': [0, 'LABEL_0'],
'Mantener': [1, 'LABEL_1'],
'Fusionar': [2, 'LABEL_2'],
'Otros': [3, 'LABEL_3']
}
label_mapping_gr = {
'Διαγραφή': [0, 'LABEL_0'],
'Δεν υπάρχει συναίνεση': [1, 'LABEL_1'],
'Διατήρηση': [2, 'LABEL_2'],
'συγχώνευση': [3, 'LABEL_3']
}
label_mapping_wikidata_ent = {
'delete': [0, 'LABEL_0'],
'no_consensus': [1, 'LABEL_1'],
'merge': [2, 'LABEL_2'],
'keep': [3, 'LABEL_3'],
'comment': [4, 'LABEL_4'],
'redirect': [5, 'LABEL_5']
}
label_mapping_wikidata_prop = {
'deleted': [0, 'LABEL_0'],
'keep': [1, 'LABEL_1'],
'no_consensus': [2, 'LABEL_2']
}
label_mapping_wikinews = {
'delete': [0, 'LABEL_0'],
'no_consensus': [1, 'LABEL_1'],
'speedy delete': [2, 'LABEL_2'],
'keep': [3, 'LABEL_3'],
'redirect': [4, 'LABEL_4'],
'comment': [5, 'LABEL_5'],
'merge': [6, 'LABEL_6'],
'withdrawn': [7, 'LABEL_7']
}
label_mapping_wikiquote = {
'merge': [0, 'LABEL_0'],
'keep': [1, 'LABEL_1'],
'no_consensus': [2, 'LABEL_2'],
'redirect': [3, 'LABEL_3'],
'delete': [4, 'LABEL_4']
}
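# Fine-tuned checkpoints on the Hugging Face Hub, keyed by English-language
# platform and by Wikipedia language edition.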
best_models_tasks = {
'wikipedia': 'research-dump/roberta-large_deletion_multiclass_complete_final_v2',
'wikidata_entity': 'research-dump/roberta-large_wikidata_ent_outcome_prediction_v1',
'wikidata_property': 'research-dump/roberta-large_wikidata_prop_outcome_prediction_v1',
'wikinews': 'research-dump/all-roberta-large-v1_wikinews_outcome_prediction_v1',
'wikiquote': 'research-dump/roberta-large_wikiquote_outcome_prediction_v1'
}
best_models_langs = {
'en': 'research-dump/roberta-large_deletion_multiclass_complete_final_v2',
'es': 'research-dump/xlm-roberta-large_deletion_multiclass_es',
'gr': 'research-dump/xlm-roberta-large_deletion_multiclass_gr'
}
#-----------------Outcome Prediction-----------------
def outcome(text, lang='en', platform='wikipedia', date='', years=None):
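    """Predict the closing outcome of a deletion discussion.

    Picks a fine-tuned checkpoint from `lang` and `platform`, truncates the
    input to 512 tokens, and returns one dict per outcome class containing the
    truncated text, the class name, and its score. `date` and `years` are
    accepted for interface compatibility but are not used here.
    """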
if lang == 'en':
if platform not in best_models_tasks:
raise ValueError(f"For lang='en', platform must be one of {list(best_models_tasks.keys())}")
model_name = best_models_tasks[platform]
if platform == 'wikipedia':
label_mapping = label_mapping_wikipedia_en
elif platform == 'wikidata_entity':
label_mapping = label_mapping_wikidata_ent
elif platform == 'wikidata_property':
label_mapping = label_mapping_wikidata_prop
elif platform == 'wikinews':
label_mapping = label_mapping_wikinews
elif platform == 'wikiquote':
label_mapping = label_mapping_wikiquote
elif lang in ['es', 'gr']:
if platform != 'wikipedia':
raise ValueError(f"For lang='{lang}', only platform='wikipedia' is supported.")
model_name = best_models_langs[lang]
label_mapping = label_mapping_es if lang == 'es' else label_mapping_gr
else:
raise ValueError("Invalid lang. Use 'en', 'es', or 'gr'.")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = pipeline("text-classification", model=model_name, return_all_scores=True, device=device)
tokens = tokenizer(text, truncation=True, max_length=512)
truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)
results = model(truncated_text)
res_list = []
for result in results[0]:
for key, value in label_mapping.items():
if result['label'] == value[1]:
res_list.append({'sentence': truncated_text, 'outcome': key, 'score': result['score']})
break
return res_list
def extract_response(text, model_name, label_mapping):
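    """Score `text` against every label in `label_mapping`.

    Returns a dict mapping each label name to the model's probability for it
    (0.0 if a label is missing from the pipeline output).
    """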
tokenizer = AutoTokenizer.from_pretrained(model_name)
pipe = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None)
tokens = tokenizer(text, truncation=True, max_length=512)
truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)
results = pipe(truncated_text)
final_scores = {key: 0.0 for key in label_mapping}
for result in results[0]:
for key, value in label_mapping.items():
if result['label'] == f'LABEL_{value}':
final_scores[key] = result['score']
break
return final_scores
#-----------------Stance Detection-----------------
def get_stance(text):
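    """Sentence-level stance detection (delete / keep / merge / comment).

    Segments `text` with pysbd and returns, for each sentence, the
    highest-scoring stance label and its score.
    """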
label_mapping = {
'delete': 0,
'keep': 1,
'merge': 2,
'comment': 3
}
seg = pysbd.Segmenter(language="en", clean=False)
text_list = seg.segment(text)
model = 'research-dump/bert-large-uncased_wikistance_v1'
res_list = []
for t in text_list:
        res = extract_response(t, model, label_mapping)
highest_key = max(res, key=res.get)
highest_score = res[highest_key]
result = {'sentence':t,'stance': highest_key, 'score': highest_score}
res_list.append(result)
return res_list
#-----------------Policy Prediction-----------------
def get_policy(text):
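    """Predict which Wikipedia policy each sentence of `text` invokes.

    Segments `text` with pysbd and returns, for each sentence, the
    highest-scoring policy label and its score.
    """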
label_mapping = {'Wikipedia:Notability': 0,
'Wikipedia:What Wikipedia is not': 1,
'Wikipedia:Neutral point of view': 2,
'Wikipedia:Verifiability': 3,
'Wikipedia:Wikipedia is not a dictionary': 4,
'Wikipedia:Wikipedia is not for things made up one day': 5,
'Wikipedia:Criteria for speedy deletion': 6,
'Wikipedia:Deletion policy': 7,
'Wikipedia:No original research': 8,
'Wikipedia:Biographies of living persons': 9,
'Wikipedia:Arguments to avoid in deletion discussions': 10,
'Wikipedia:Conflict of interest': 11,
'Wikipedia:Articles for deletion': 12
}
seg = pysbd.Segmenter(language="en", clean=False)
text_list = seg.segment(text)
model = 'research-dump/bert-large-uncased_wikistance_policy_v1'
res_list = []
for t in text_list:
        res = extract_response(t, model, label_mapping)
highest_key = max(res, key=res.get)
highest_score = res[highest_key]
result = {'sentence': t, 'policy': highest_key, 'score': highest_score}
res_list.append(result)
return res_list
#-----------------Sentiment Analysis-----------------
def extract_highest_score_label(res):
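    """Flatten nested pipeline output and return the top (label, score) pair."""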
flat_res = [item for sublist in res for item in sublist]
highest_score_item = max(flat_res, key=lambda x: x['score'])
highest_score_label = highest_score_item['label']
highest_score_value = highest_score_item['score']
return highest_score_label, highest_score_value
def get_sentiment(text):
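    """Sentence-level sentiment analysis with cardiffnlp/twitter-roberta-base-sentiment-latest."""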
#sentiment analysis
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None)
#sentence tokenize the text using pysbd
seg = pysbd.Segmenter(language="en", clean=False)
text_list = seg.segment(text)
res = []
for t in text_list:
results = model(t)
highest_label, highest_score = extract_highest_score_label(results)
result = {'sentence': t,'sentiment': highest_label, 'score': highest_score}
res.append(result)
return res
#-----------------Toxicity Prediction-----------------
def get_offensive_label(text):
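    """Sentence-level offensive-language detection with cardiffnlp/twitter-roberta-base-offensive."""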
#offensive language detection model
model_name = "cardiffnlp/twitter-roberta-base-offensive"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None)
#sentence tokenize the text using pysbd
seg = pysbd.Segmenter(language="en", clean=False)
text_list = seg.segment(text)
res = []
for t in text_list:
results = model(t)
highest_label, highest_score = extract_highest_score_label(results)
result = {'sentence': t,'offensive_label': highest_label, 'score': highest_score}
res.append(result)
return res
def predict_text(text, model_name, lang='en', platform='wikipedia', date='', years=None):
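    """Dispatch `text` to one of the five tasks.

    `model_name` is one of 'outcome', 'stance', 'policy', 'sentiment', or
    'offensive'; `lang`, `platform`, `date`, and `years` are only used by the
    'outcome' task. Unknown task names return the string 'Invalid model name'.
    """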
if model_name == 'outcome':
return outcome(text, lang=lang, platform=platform, date=date, years=years)
elif model_name == 'stance':
return get_stance(text)
elif model_name == 'policy':
return get_policy(text)
elif model_name == 'sentiment':
return get_sentiment(text)
elif model_name == 'offensive':
return get_offensive_label(text)
else:
return "Invalid model name"