"""Gradio app that draws isnad (narrator transmission) networks for hadith tarafs.

At import time the module downloads several CSV/JSON files from the
``FDSRashid/hadith_info`` Hugging Face dataset, builds lookup tables, and then
launches a Gradio interface with five tabs.
"""
import json
import os
import re
from collections import defaultdict

import gradio as gr
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from datasets import Dataset, Features, Value, load_dataset
from huggingface_hub import hf_hub_download
from pyvis.network import Network

pattern = r'"(.*?)"'  # this pattern captures anything in a double quotes.
Secret_token = os.getenv('HF_token')

dataset = load_dataset('FDSRashid/hadith_info', data_files='Basic_Edge_Information.csv', token=Secret_token, split='train')
edge_info = dataset.to_pandas()

features = Features({'Rawi ID': Value('int32'), 'Famous Name': Value('string'), 'Narrator Rank': Value('string'), 'Number of Narrations': Value('string'), 'Generation': Value('string')})
narrator_bios = load_dataset("FDSRashid/hadith_info", data_files='Teacher_Bios.csv', token=Secret_token, features=features)
narrator_bios = narrator_bios['train'].to_pandas()
narrator_bios.loc[49845, 'Narrator Rank'] = 'رسول الله'
# Row 49845 is zeroed before the int cast and given its real count afterwards.
narrator_bios.loc[49845, 'Number of Narrations'] = 0
narrator_bios['Number of Narrations'] = narrator_bios['Number of Narrations'].astype(int)
narrator_bios.loc[49845, 'Number of Narrations'] = 327512
# 8125 Narrators have no Generation, listed in dataset as None
narrator_bios['Generation'] = narrator_bios['Generation'].replace([None], [-1])
narrator_bios['Generation'] = narrator_bios['Generation'].astype(int)

features = Features({'matn': Value('string'), 'taraf_ID': Value('string'), 'bookid_hadithid': Value('string')})
dataset = load_dataset("FDSRashid/hadith_info", data_files='All_Matns.csv', token=Secret_token, features=features)
matn_info = dataset['train'].to_pandas()
matn_info = matn_info.drop(97550)
matn_info = matn_info.drop(307206)
matn_info['taraf_ID'] = matn_info['taraf_ID'].replace('KeyAbsent', -1)
matn_info['taraf_ID'] = matn_info['taraf_ID'].astype(int)

# Isnad Info Hadiths column is structured like {"BookNum_HadithNum", ...} for each edge
isnad_info = load_dataset('FDSRashid/hadith_info', token=Secret_token, data_files='isnad_info.csv', split='train').to_pandas()
isnad_info['Hadiths Cleaned'] = isnad_info['Hadiths'].apply(lambda x: [re.findall(pattern, string)[0].split("_") for string in x[1:-1].split(',')])
# Hadiths Cleaned is a list of lists, each sub-list is Book Id, Hadith ID
taraf_max = int(matn_info['taraf_ID'].max())
isnad_info['Tarafs Cleaned'] = isnad_info['Tarafs'].apply(lambda x: np.array([int(i.strip(' ')) for i in x[1:-1].split(',')]))

cmap = plt.colormaps['cool']

books = load_dataset('FDSRashid/Hadith_info', data_files='Books.csv', token=Secret_token)['train'].to_pandas()
matn_info['Book_ID'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[0]))
matn_info['Hadith Number'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[1]))
matn_info = pd.merge(matn_info, books, on='Book_ID')

# Preprocess narrator_bios into a dictionary keyed by Rawi ID
narrator_info = narrator_bios.set_index('Rawi ID').to_dict(orient='index')

# Download and read the fast lookup data structure
file_path = hf_hub_download(
    repo_id="FDSRashid/hadith_info",
    filename="hadith_lookup.json",
    repo_type="dataset",
    token=Secret_token,
)
with open(file_path, 'r', encoding='utf-8') as f:
    hadith_lookup_dict = json.load(f)
HADITH_LOOKUP = defaultdict(list, hadith_lookup_dict)

ROOT_NODE_ID = '99999'  # node id that receives distinct (larger, black) styling in every graph
UNKNOWN_NAME = 'فلان'  # placeholder used when a narrator has no bio entry
X_STRETCH = 4  # horizontal scale applied to graphviz coordinates
Y_STRETCH = 4  # vertical scale applied to graphviz coordinates


def value_to_hex(value):
    """Map ``value`` through the module colormap and return an ``#RRGGBB`` string."""
    rgba_color = cmap(value)
    red, green, blue = (int(channel * 255) for channel in rgba_color[:3])
    return "#{:02X}{:02X}{:02X}".format(red, green, blue)


def get_node_info(node):
    """Return ``(info, narrations, generation, rank, name)`` for a narrator id.

    ``node`` is coerced to ``int`` before the lookup in ``narrator_info``.
    Missing narrators yield an empty ``info`` dict and the defaults
    ``1``, ``-1``, ``'فلان'`` and ``'فلان'``.
    """
    node = int(node)  # Ensure node is an integer
    info = narrator_info.get(node, {})
    student_narrations = info.get('Number of Narrations', 1)
    student_gen = info.get('Generation', -1)
    student_rank = info.get('Narrator Rank', UNKNOWN_NAME)
    node_name = info.get('Famous Name', UNKNOWN_NAME)
    return info, student_narrations, student_gen, student_rank, node_name


def lookup_hadith(taraf_hadith, hadith_lookup):
    """Return the unique elements stored in ``hadith_lookup`` for the given keys.

    Parameters:
        taraf_hadith (str or list of str): A key or list of keys to look up.
        hadith_lookup (dict or defaultdict): Mapping from key to a list of elements.

    Returns:
        list: The unique elements found (order is not guaranteed). Keys that
        are absent contribute nothing and are not inserted into the mapping.
    """
    # Ensure taraf_hadith is always a list
    if isinstance(taraf_hadith, str):
        taraf_hadith = [taraf_hadith]
    # .get avoids growing a defaultdict with an empty list per unknown key
    unique_elements = {elem for key in taraf_hadith for elem in hadith_lookup.get(key, ())}
    return list(unique_elements)


def _edge_ids(edges):
    """Return only Source/Destination of ``edges`` with ids normalised to strings."""
    return edges[['Source', 'Destination']].astype(int).astype(str)


def _with_narrator_names(edges):
    """Return a copy of ``edges`` with Teacher/Student names and string node ids."""
    edges = edges.copy()  # never write into a slice of isnad_info
    edges[['Source', 'Destination']] = edges[['Source', 'Destination']].astype(int)
    names = narrator_bios[['Rawi ID', 'Famous Name']]
    edges = edges.merge(names, left_on='Source', right_on='Rawi ID', how='left').rename(columns={'Famous Name': 'Teacher'})
    edges = edges.merge(names, left_on='Destination', right_on='Rawi ID', how='left').rename(columns={'Famous Name': 'Student'})
    edges[['Source', 'Destination']] = edges[['Source', 'Destination']].astype(str)
    return edges


def _layout(edges):
    """Build a directed graph from ``edges`` and return ``(graph, dot_positions)``."""
    graph = nx.from_pandas_edgelist(edges, source='Source', target='Destination', create_using=nx.DiGraph())
    return graph, nx.nx_agraph.graphviz_layout(graph, prog='dot')


def _new_network():
    """Create the pyvis network used by every visualizer."""
    return Network(directed=True, select_menu=True, cdn_resources='remote')


def _place_node(net, node, pos, *, label, font_size, font_color, color, size):
    """Add one node to ``net`` at the stretched (and vertically flipped) layout position."""
    net.add_node(node, font={'size': font_size, 'color': font_color}, color=color, label=label,
                 x=pos[0] * X_STRETCH, y=-pos[1] * Y_STRETCH, size=size)


def _add_standard_nodes(net, isnad_pos):
    """Add every laid-out node using the name/rank/id/generation label style."""
    for node, pos in isnad_pos.items():
        _, narrations, gen, rank, name = get_node_info(node)
        if node == ROOT_NODE_ID:
            _place_node(net, node, pos, label=f'{name} \n ID: {node} - Gen {gen}',
                        font_size=50, font_color='black', color='#000000', size=70)
        else:
            _place_node(net, node, pos, label=f'{name} \n {rank} \n ID: {node} - Gen {gen}',
                        font_size=30, font_color='red', color=value_to_hex(narrations), size=50)


def _add_count_edges(net, edges, yaxis):
    """Add one edge per row of ``edges``, coloured and labelled by ``'<yaxis> Count'``."""
    rows = edges[['Source', 'Destination', f'{yaxis} Count']].itertuples(index=False, name=None)
    for source, target, count in rows:
        net.add_edge(source, target, color=value_to_hex(int(count)), label=f"{count}")


def _render(net):
    """Return an iframe snippet embedding the pyvis network for a ``gr.HTML`` output.

    The source previously returned an empty string here, discarding the
    generated page. Single quotes are swapped for double quotes so the page
    can sit inside the single-quoted ``srcdoc`` attribute.
    """
    net.toggle_physics(False)
    html = net.generate_html()
    html = html.replace("'", "\"")
    return f"""<iframe style="width: 100%; height: 600px; margin: 0 auto" frameborder="0" srcdoc='{html}'></iframe>"""


def _parse_hadith_selection(hadith_str):
    """Parse a selection such as ``"1, 2, 4-7"`` into a sorted list of unique ints.

    Raises:
        gr.Error: if a token is not a non-negative integer or a single
        ``start-end`` range.
    """
    selected = set()
    for token in (part.strip() for part in hadith_str.split(',')):
        if '-' in token:
            if token.count('-') > 1:
                raise gr.Error('Please use only one Dash mark!')
            bounds = [bound.strip() for bound in token.split('-')]
            # isdecimal (not isnumeric) so every accepted string is valid for int()
            if not all(bound.isdecimal() for bound in bounds):
                raise gr.Error('Invalid Begining')
            start, stop = (int(bound) for bound in bounds)
            selected.update(range(start, stop + 1))
        elif token.isdecimal():
            selected.add(int(token))
        else:
            raise gr.Error("Invalid Data format!")
    return sorted(selected)


def visualize_isnad(taraf_num, yaxis):
    """Draw the full isnad network of one taraf.

    Parameters:
        taraf_num: taraf id matched against ``matn_info['taraf_ID']`` and
            ``isnad_info['Tarafs Cleaned']``.
        yaxis (str): prefix of the ``'<yaxis> Count'`` column used for edge
            colour and label.

    Returns:
        tuple: ``(html, dataframe)`` where the dataframe lists, per hadith of
        the taraf, each node with no outgoing edge in that hadith's chain.
    """
    taraf = matn_info[matn_info['taraf_ID'] == taraf_num]
    in_taraf = isnad_info['Tarafs Cleaned'].apply(lambda x: taraf_num in x)
    isnad_hadith = _with_narrator_names(isnad_info[in_taraf])

    lst_hadith = []
    end_node_hadiths = defaultdict(list)  # end narrator id -> hadith indices (as str)
    for i, hadith_id in enumerate(taraf['bookid_hadithid'].to_list()):
        # look up the edges of this bookid_hadithid and find its terminal nodes
        chain = isnad_info.iloc[lookup_hadith(hadith_id, HADITH_LOOKUP)][['Source', 'Destination']]
        chain_graph = nx.from_pandas_edgelist(chain, source='Source', target='Destination', create_using=nx.DiGraph())
        hadith_row = taraf.iloc[i]
        for n in (int(node) for node, degree in chain_graph.out_degree() if degree == 0):
            # Dict lookup with defaults: narrators absent from the bios no
            # longer crash on an empty selection.
            bio = narrator_info.get(n, {})
            gen_node = bio.get('Generation', -1)
            gen_node = gen_node if pd.notna(gen_node) else -1
            name_node = bio.get('Famous Name', UNKNOWN_NAME)
            name_node = name_node if pd.notna(name_node) else UNKNOWN_NAME
            lst_hadith.append([
                hadith_row['matn'], gen_node, name_node, hadith_row['Book_Name'],
                hadith_row['Author'], hadith_row['Hadith Number'], n, i,
            ])
            end_node_hadiths[n].append(str(i))

    df = pd.DataFrame(lst_hadith, columns=['Matn', 'Generation', 'Name', 'Book_Name', 'Author', 'Book Hadith Number', 'End Transmitter ID', 'Hadith Number'])

    _, isnad_pos = _layout(isnad_hadith)
    net = _new_network()
    for node, pos in isnad_pos.items():
        _, student_narrations, student_gen, student_rank, node_name = get_node_info(node)
        label = f'{node_name} \n {student_rank} \n ID: {node} - Gen {student_gen}'
        size = 50
        font_color = 'red'
        if node == ROOT_NODE_ID:
            label = f'{node_name} \n ID: {node} - Gen {student_gen}'
            size = 70
            font_color = 'black'
        elif int(node) in end_node_hadiths:
            hadith_numbers = " ".join(end_node_hadiths[int(node)])
            label += f' \n Hadith {hadith_numbers}'
        _place_node(net, node, pos, label=label, font_size=30, font_color=font_color,
                    color=value_to_hex(student_narrations), size=size)

    _add_count_edges(net, isnad_hadith, yaxis)

    df = df.rename(columns={'Generation': 'Gen.', 'Book Hadith Number': 'Hdth Num', 'End Transmitter ID': 'End Narrator ID', 'Hadith Number': 'Index', 'Book_Name': 'Book', 'Name': 'Final Narrator'})
    return _render(net), df.drop('Hdth Num', axis=1)


def visualize_subTaraf(taraf_num, hadith_str, yaxis):
    """Draw the isnad network for a chosen subset of a taraf's hadiths.

    Parameters:
        taraf_num: taraf id matched against ``matn_info['taraf_ID']``.
        hadith_str (str): comma-separated 0-based indices and ``a-b`` ranges
            (the indices shown by ``taraf_booknum``).
        yaxis (str): prefix of the ``'<yaxis> Count'`` edge column.

    Returns:
        tuple: ``(html, dataframe)`` with the selected hadith rows.

    Raises:
        gr.Error: on a malformed or empty selection, or an index outside the taraf.
    """
    hadiths = _parse_hadith_selection(hadith_str)
    if not hadiths:
        # e.g. a reversed range such as "7-4" selects nothing
        raise gr.Error("Invalid Data format!")

    taraf = matn_info[matn_info['taraf_ID'] == taraf_num].copy()
    num_hadith = taraf.shape[0]
    # Index runs 0..num_hadith-1, so num_hadith itself is already out of range.
    if hadiths[-1] >= num_hadith:
        raise gr.Error(f'Hadith index outside of range. Total Number of Hadith in this Taraf: {num_hadith}')
    taraf['Index'] = np.arange(num_hadith)
    sub_taraf = taraf[taraf['Index'].isin(hadiths)]

    isnad_hadith = _with_narrator_names(
        isnad_info.iloc[lookup_hadith(sub_taraf['bookid_hadithid'].to_list(), HADITH_LOOKUP)]
    )
    _, isnad_pos = _layout(isnad_hadith)
    net = _new_network()
    _add_standard_nodes(net, isnad_pos)
    _add_count_edges(net, isnad_hadith, yaxis)
    return _render(net), sub_taraf[['matn', 'Book_Name', 'Author', 'Book_ID', 'Hadith Number']]


def taraf_booknum(taraf_num):
    """Return the hadiths of a taraf with a 0-based ``Index`` column."""
    taraf = matn_info[matn_info['taraf_ID'] == taraf_num].copy()
    taraf['Index'] = np.arange(taraf.shape[0])
    return taraf[['matn', 'Book_ID', 'Hadith Number', 'Book_Name', 'Author', 'Index']]


def visualize_hadith_isnad(df, yaxis):
    """Draw the combined isnad network for user-entered (Book_ID, Hadith Number) rows.

    Parameters:
        df (pandas.DataFrame): must contain ``Book_ID`` and ``Hadith Number``
            columns; rows with blank or non-numeric cells are ignored.
        yaxis (str): prefix of the ``'<yaxis> Count'`` edge column.

    Returns:
        tuple: ``(html, dataframe)`` with the matching rows of ``matn_info``.

    Raises:
        gr.Error: if no row contains a usable pair.
    """
    # Work on a coerced copy: the caller's frame is left untouched and float
    # cells such as 1.0 still produce the "1_2" style key.
    selection = df[['Book_ID', 'Hadith Number']].apply(pd.to_numeric, errors='coerce').dropna().astype(int)
    if selection.empty:
        raise gr.Error("Invalid Data format!")
    taraf_hadith = (selection['Book_ID'].astype(str) + '_' + selection['Hadith Number'].astype(str)).to_list()
    hadith = matn_info[matn_info['bookid_hadithid'].isin(taraf_hadith)]

    isnad_hadith = _with_narrator_names(isnad_info.iloc[lookup_hadith(taraf_hadith, HADITH_LOOKUP)])
    _, isnad_pos = _layout(isnad_hadith)
    net = _new_network()
    _add_standard_nodes(net, isnad_pos)
    _add_count_edges(net, isnad_hadith, yaxis)
    return _render(net), hadith[['matn', 'Book_ID', 'Hadith Number', 'Book_Name', 'Author', 'taraf_ID']]


def visualize_narrator_taraf(taraf_num, narrator, yaxis):
    """Draw the isnad network of the hadiths in a taraf that pass through one narrator.

    Parameters:
        taraf_num: taraf id matched against ``matn_info['taraf_ID']``.
        narrator (str): narrator id as text (compared with string node ids).
        yaxis (str): prefix of the ``'<yaxis> Count'`` edge column.

    Returns:
        tuple: ``(html, dataframe)``; the dataframe holds the matching hadiths
        with their taraf ``Index`` and a fresh ``Subset Index``.

    Raises:
        gr.Error: if the narrator does not occur in the taraf's isnad.
    """
    narrator = str(narrator).strip()
    taraf = matn_info[matn_info['taraf_ID'] == taraf_num].copy()
    taraf['Index'] = np.arange(len(taraf))

    # Node set of the whole taraf, using the same string ids as the drawn graph
    in_taraf = isnad_info['Tarafs Cleaned'].apply(lambda x: taraf_num in x)
    taraf_edges = _edge_ids(isnad_info[in_taraf])
    if narrator not in set(taraf_edges['Source']) | set(taraf_edges['Destination']):
        raise gr.Error('Narrator not in Isnad of Taraf!')

    matns_with_narrator = []
    end_node = {}  # terminal node id -> taraf indices (as str) of hadiths ending there
    for idx, hadith_id in enumerate(taraf['bookid_hadithid'].to_list()):
        chain = _edge_ids(isnad_info.iloc[lookup_hadith(hadith_id, HADITH_LOOKUP)])
        chain_graph = nx.from_pandas_edgelist(chain, source='Source', target='Destination', create_using=nx.DiGraph())
        if narrator not in chain_graph:
            continue
        matns_with_narrator.append(hadith_id)
        for node, degree in chain_graph.out_degree():
            if degree == 0:
                end_node.setdefault(node, []).append(str(idx))
    if not matns_with_narrator:
        raise gr.Error('Narrator not in Isnad of Taraf!')

    # Keep every column (the Count columns are needed for the edges) and one
    # row per Source/Destination pair.
    isnad_hadith = _with_narrator_names(
        isnad_info.iloc[lookup_hadith(matns_with_narrator, HADITH_LOOKUP)]
    ).drop_duplicates(subset=['Source', 'Destination'])
    _, isnad_pos = _layout(isnad_hadith)

    narrator_matn_info = taraf[taraf['bookid_hadithid'].isin(matns_with_narrator)].copy()
    narrator_matn_info['Subset Index'] = np.arange(len(narrator_matn_info))

    # Visualization with pyvis
    net = _new_network()
    for node, pos in isnad_pos.items():
        _, student_narrations, student_gen, _, node_name = get_node_info(node)
        label = f'{node_name} \n ID: {node} - Gen {student_gen}'
        is_root = node == ROOT_NODE_ID
        hadiths = f" \n Hadiths {', '.join(end_node[node])}" if node in end_node else ''
        _place_node(net, node, pos, label=f"{label} {hadiths}", font_size=30,
                    font_color='black' if is_root else 'red',
                    color=value_to_hex(student_narrations), size=70 if is_root else 50)
    _add_count_edges(net, isnad_hadith, yaxis)
    return _render(net), narrator_matn_info[['matn', 'Book_Name', 'Author', 'Book_ID', 'Hadith Number', 'Index', 'Subset Index']]


with gr.Blocks() as demo:
    with gr.Tab("Whole Taraf Visualizer"):
        Yaxis = gr.Dropdown(choices=['Taraf', 'Hadith', 'Isnad', 'Book'], value='Taraf', label='Variable to Display', info='Choose the variable to visualize.')
        taraf_number = gr.Slider(1, taraf_max, value=10000, label="Taraf", info="Choose the Taraf to Input", step=1)
        btn = gr.Button('Submit')
        # NOTE(review): in the collapsed source this call sat behind a '#';
        # it is wired up here so the Submit button has a handler - confirm intent.
        btn.click(fn=visualize_isnad, inputs=[taraf_number, Yaxis], outputs=[gr.HTML(), gr.DataFrame(wrap=True, column_widths=[43, 8, 11, 11, 10, 8, 9])])
    with gr.Tab("Book and Hadith Number Retriever"):
        taraf_num = gr.Slider(1, taraf_max, value=10000, label="Taraf", info="Choose the Taraf to Input", step=1)
        btn_num = gr.Button('Retrieve')
        btn_num.click(fn=taraf_booknum, inputs=[taraf_num], outputs=[gr.DataFrame(wrap=True)])
    with gr.Tab('Sub Taraf Visualizer'):
        taraf_num = gr.Slider(1, taraf_max, value=10000, label="Taraf", info="Choose the Taraf to Input", step=1)
        Yaxis = gr.Dropdown(choices=['Taraf', 'Hadith', 'Isnad', 'Book'], value='Taraf', label='Variable to Display', info='Choose the variable to visualize.')
        hadith_str = gr.Textbox(label='Hadith Selection', info='Choose which range of Hadith you would like visualized from the Taraf (eg "1, 2, 4-7")')
        btn_sub = gr.Button('Visualize')
        btn_sub.click(fn=visualize_subTaraf, inputs=[taraf_num, hadith_str, Yaxis], outputs=[gr.HTML(), gr.DataFrame(wrap=True)])
    with gr.Tab('Select Hadith Isnad Visualizer'):
        yyaxis = gr.Dropdown(choices=['Taraf', 'Hadith', 'Isnad', 'Book'], value='Taraf', label='Variable to Display', info='Choose the variable to visualize.')
        hadith_selection = gr.Dataframe(
            headers=["Book_ID", "Hadith Number"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"))
        btn_hadith = gr.Button('Visualize')
        btn_hadith.click(fn=visualize_hadith_isnad, inputs=[hadith_selection, yyaxis], outputs=[gr.HTML(), gr.DataFrame(wrap=True)])
    with gr.Tab('Taraf Narrator Isnad Visualizer'):
        Yaxis = gr.Dropdown(choices=['Taraf', 'Hadith', 'Isnad', 'Book'], value='Taraf', label='Variable to Display', info='Choose the variable to visualize.')
        taraf_number = gr.Slider(1, taraf_max, value=10000, label="Taraf", info="Choose the Taraf to Input", step=1)
        narr = gr.Textbox(label='Narrator', info='Choose a Narrator (Refer to full isnad from previous tab)')
        btn_narr = gr.Button('Visualize')
        btn_narr.click(fn=visualize_narrator_taraf, inputs=[taraf_number, narr, Yaxis], outputs=[gr.HTML(), gr.DataFrame(wrap=True)])

demo.launch()