|
|
|
import numpy as np |
|
import gradio as gr |
|
import os |
|
import pandas as pd |
|
from datasets import load_dataset |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
from datasets import Features, Value |
|
import plotly.express as px |
|
|
|
# Explicit schema used when All_Matns.csv is loaded: all three columns are
# read as strings.
features = Features(
    {
        'matn': Value('string'),
        'taraf_ID': Value('string'),
        'bookid_hadithid': Value('string'),
    }
)

# Hugging Face access token taken from the environment (None when unset).
Secret_token = os.getenv('HF_token')

# The embed_matn dataset, materialised as a DataFrame.
dataset = load_dataset("FDSRashid/embed_matn", token=Secret_token)
df = dataset["train"].to_pandas()

# Books.csv table; it is joined onto the matn rows by Book_ID.
books = load_dataset(
    'FDSRashid/Hadith_info',
    data_files='Books.csv',
    token=Secret_token,
)['train'].to_pandas()
|
|
|
# Load All_Matns.csv with the explicit `features` schema.
dataset = load_dataset(
    "FDSRashid/hadith_info",
    data_files='All_Matns.csv',
    token=Secret_token,
    features=features,
)
matn_info = dataset['train'].to_pandas()

# Remove two rows, identified by index label.
matn_info = matn_info.drop([97550, 307206])

# Replace the 'KeyAbsent' placeholder with -1 so the column can be cast to int.
# NOTE(review): presumably 'KeyAbsent' marks rows with no taraf - confirm.
matn_info['taraf_ID'] = (
    matn_info['taraf_ID'].replace('KeyAbsent', -1).astype(int)
)

# bookid_hadithid is split on '_' into an integer book ID (first part) and
# an integer hadith number (second part).
matn_info['Book_ID'] = matn_info['bookid_hadithid'].map(
    lambda key: int(key.split('_')[0])
)
matn_info['Hadith Number'] = matn_info['bookid_hadithid'].map(
    lambda key: int(key.split('_')[1])
)

# Attach the book columns to every matn row.
matn_info = matn_info.merge(books, on='Book_ID')
|
|
|
|
|
# Give both frames a fresh 0..n-1 index (the old index becomes an 'index'
# column in each).
matn_info = matn_info.reset_index()
df = df.reset_index()

# Take from the embedding frame only the columns matn_info does not already
# have, then join the two frames row-by-row on their index.
# NOTE(review): this assumes row i of matn_info corresponds to row i of the
# embedding frame after the row drops and the book merge - confirm.
cols_to_use = df.columns.difference(matn_info.columns)
joined_df = matn_info.merge(
    df[cols_to_use],
    left_index=True,
    right_index=True,
)
df = joined_df.copy()

# Largest taraf ID present; used as the upper bound of the UI slider.
taraf_max = df['taraf_ID'].unique().max()
|
|
|
def plot_similarity_score(taraf_num):
    """Build the similarity plots and the matn listing for one taraf.

    Selects the rows of the module-level ``df`` whose ``taraf_ID`` equals
    ``taraf_num``, computes the pairwise cosine similarity of their ``embed``
    values, and returns a heatmap of that matrix, a histogram of its
    below-diagonal entries, and a table of the selected rows.

    Args:
        taraf_num: Taraf ID to visualise (the slider value from the UI).

    Returns:
        A tuple ``(fig, fig_dis, table)`` where ``fig`` is the
        ``px.imshow`` heatmap of the similarity matrix, ``fig_dis`` is the
        ``px.histogram`` of the pairwise scores, and ``table`` is a
        DataFrame with the columns ``matn``, ``Number``, ``Book_Name``,
        ``Author`` and ``Hadith Number``.

    Raises:
        gr.Error: If no row of ``df`` has the requested taraf ID.
    """
    # Work on an explicit copy: adding the 'Number' column to a filtered
    # slice of the shared frame triggers pandas' SettingWithCopyWarning.
    taraf_df = df[df['taraf_ID'] == taraf_num].copy()

    # The slider allows every integer in its range, so the ID may match no
    # rows; report that clearly instead of letting cosine_similarity fail
    # on an empty input.
    if taraf_df.empty:
        raise gr.Error(f'No hadith found for Taraf {taraf_num}')

    # Row labels 0..n-1, matching the axes of the similarity matrix.
    taraf_df['Number'] = np.arange(len(taraf_df))

    cos_score = cosine_similarity(taraf_df['embed'].to_list())
    fig = px.imshow(cos_score)

    # Keep only the entries strictly below the diagonal: the matrix is
    # symmetric and the diagonal is each row compared with itself, so this
    # counts every distinct pair exactly once.
    mask = np.tril(np.ones(cos_score.shape, dtype=bool), k=-1)
    lower_triangle = cos_score[mask]

    fig_dis = px.histogram(
        x=lower_triangle,
        title=f'Similarity Distribution for Taraf {taraf_num}',
        labels={'x': 'Similarity Score'},
        nbins=20,
        template='ggplot2',
    )
    return fig, fig_dis, taraf_df[['matn', 'Number', 'Book_Name', 'Author', 'Hadith Number']]
|
|
|
# Gradio front end: a taraf slider and a submit button that runs
# plot_similarity_score and shows its three results.
with gr.Blocks() as demo:
    gr.Markdown('# Semantic Similarity Visualizer')
    taraf_number = gr.Slider(
        1,
        taraf_max,
        value=10000,
        label="Taraf",
        info="Choose the Taraf to Input",
        step=1,
    )
    btn = gr.Button('Submit')

    # Output components, created in the order the callback returns its
    # values: similarity heatmap, score histogram, matn table.
    heatmap_plot = gr.Plot()
    histogram_plot = gr.Plot()
    matn_table = gr.DataFrame(wrap=True)

    btn.click(
        fn=plot_similarity_score,
        inputs=[taraf_number],
        outputs=[heatmap_plot, histogram_plot, matn_table],
    )

demo.launch()
|
|