Spaces:
Running
Running
import pyarabic.araby as araby | |
import pandas as pd | |
import numpy as np | |
import re | |
from datasets import load_dataset | |
from datasets import Features | |
from datasets import Value | |
from datasets import Dataset | |
from sentence_transformers import SentenceTransformer | |
from sentence_transformers.cross_encoder import CrossEncoder | |
from sklearn.metrics.pairwise import cosine_similarity | |
import os | |
import gradio as gr | |
# Custom stylesheet handed to gr.Blocks: pins elements with the
# ``table-wrap`` class to a fixed height of 300px (min == max).
css = """
.table-wrap {
min-height: 300px;
max-height: 300px;
}
"""
# Hugging Face access token, read from the HF_token environment variable.
Secret_token = os.getenv('HF_token')

# Embedding table ("train" split of FDSRashid/embed_matn) as a DataFrame.
dataset = load_dataset("FDSRashid/embed_matn", token=Secret_token)
df = dataset["train"].to_pandas()

# Book metadata; joined onto the matn table through ``Book_ID`` below.
books = load_dataset(
    'FDSRashid/Hadith_info',
    data_files='Books.csv',
    token=Secret_token,
)['train'].to_pandas()

# Matn table: every column is read as a string first and converted afterwards.
features = Features(
    {column: Value('string') for column in ('matn', 'taraf_ID', 'bookid_hadithid')}
)
dataset = load_dataset(
    "FDSRashid/hadith_info",
    data_files='All_Matns.csv',
    token=Secret_token,
    features=features,
)
matn_info = dataset['train'].to_pandas()

# Two rows are removed by label; the reason is not recorded in this file.
matn_info = matn_info.drop([97550, 307206])

# 'KeyAbsent' marks a missing taraf id; map it to -1 so the column can be int.
matn_info['taraf_ID'] = matn_info['taraf_ID'].replace('KeyAbsent', -1).astype(int)

# ``bookid_hadithid`` has the form "<book id>_<hadith number>".
id_pairs = [key.split('_') for key in matn_info['bookid_hadithid']]
matn_info['Book_ID'] = [int(parts[0]) for parts in id_pairs]
matn_info['Hadith Number'] = [int(parts[1]) for parts in id_pairs]

# Attach book metadata, then renumber rows 0..N-1 so that the positional join
# with the embedding table below lines up row for row.
matn_info = matn_info.merge(books, on='Book_ID').reset_index()
df = df.reset_index()

# Bring over only the embedding-table columns that matn_info does not already
# have, joining on the (freshly reset) row index.
cols_to_use = df.columns.difference(matn_info.columns)
joined_df = matn_info.merge(df[cols_to_use], left_index=True, right_index=True)
df = joined_df.copy()
# Bi-encoder used to embed the query text.
model = SentenceTransformer('FDSRashid/QulBERT', token=Secret_token)

# Cross-encoder used to re-score (candidate, query) pairs.
model_CE = CrossEncoder(
    'FDSRashid/QulBERT-CE-2.0',
    automodel_args={'token': Secret_token},
    max_length=512,
)

# Stack the per-row ``embed`` values of ``df`` into a single NumPy array so
# the query can be compared against all rows in one call.
arr = np.array(df['embed'].tolist())
def find_most_similar_matn(text, n):
    """Retrieve the stored matns closest to ``text`` and filter them.

    The query is stripped of diacritics, embedded with ``model`` and compared
    by cosine similarity against every row of ``arr``.  The ``n`` rows of
    ``df`` with the highest similarity are then scored pairwise against the
    query with ``model_CE``; only rows whose score exceeds 0.5 are returned.

    Args:
        text: Query text; diacritics are removed before encoding.
        n: Number of nearest rows to retrieve before the cross-encoder
            filter.  Coerced to ``int``; a value below 1 yields an empty
            result.

    Returns:
        A DataFrame with the columns ``Book_Name``, ``matn``, ``taraf_ID``,
        ``Book_ID``, ``Hadith Number``, ``Author`` and ``Similarity``, in
        ascending order of similarity.  It may hold fewer than ``n`` rows
        because of the cross-encoder filter.
    """
    result_columns = [
        'Book_Name', 'matn', 'taraf_ID', 'Book_ID',
        'Hadith Number', 'Author', 'Similarity',
    ]

    n = int(n)
    if n <= 0:
        # ``seq[-0:]`` selects *every* element, which would push the whole
        # corpus through the cross-encoder; return an empty table instead.
        return pd.DataFrame(columns=result_columns)

    prep_text = araby.strip_diacritics(text)
    embed_text = model.encode(prep_text)
    cos_sim = cosine_similarity(embed_text.reshape(1, -1), arr)[0]

    # argsort is ascending, so the last ``n`` positions are the best matches.
    indices = np.argsort(cos_sim)[-n:]

    # Work on a copy: adding a column to a bare ``iloc`` selection triggers
    # pandas' chained-assignment warning.
    matns = df.iloc[indices].copy()
    matns['Similarity'] = cos_sim[indices]

    # The cross-encoder receives (candidate, query) pairs, both without
    # diacritics.
    candidates = [araby.strip_diacritics(matn) for matn in matns['matn']]
    to_compare = [(candidate, prep_text) for candidate in candidates]
    is_taraf = model_CE.predict(to_compare)

    matns = matns[is_taraf > 0.5]
    return matns[result_columns]
# Single-page UI: a query box, a slider for the number of candidates, a
# results table, and a button that runs find_most_similar_matn.
with gr.Blocks(css=css) as demo:
    text_input = gr.Textbox()
    num_hadith = gr.Slider(
        1,
        50,
        value=5,
        label='Num Hadith',
        info='Choose the number of Hadith to Return',
        step=1,
    )
    text_output = gr.DataFrame(wrap=True)
    text_button = gr.Button("Retrieve")
    # Clicking the button sends (query text, slider value) to the search
    # function and shows the returned DataFrame in the table.
    text_button.click(
        find_most_similar_matn,
        inputs=[text_input, num_hadith],
        outputs=text_output,
    )

demo.launch()