"""Gradio app that ranks mentor sentences against a free-text query.

The query is embedded with the all-mpnet-base-v2 sentence-transformer model
and compared, by cosine similarity, with the vectors stored in the
``raw_embedding`` column of a pickled DataFrame.
"""

import ast
import json
import os
import pickle
import time
from datetime import datetime
from io import StringIO

import dataframe_image as dfi
import gradio as gr
import nltk
import numpy as np
import pandas as pd
import requests
from nltk.tokenize import sent_tokenize
from numpy import dot
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
PICKLE_PATH = "./Data/master_exploded_current.pkl"

# Loaded once at import time and reused for every query.
model = SentenceTransformer(MODEL_NAME)

tqdm.pandas()  # enables Series.progress_apply used below
nltk.download('punkt')
print("Packages loaded!")


def load_pickle(path=PICKLE_PATH):
    """Load the pickled DataFrame from *path* and report its shape.

    Args:
        path: Location of the pickle file. Defaults to the path the app has
            always used.

    Returns:
        The unpickled object (the rest of this file treats it as a pandas
        DataFrame with ``id``, ``name``, ``tokenized_sentences`` and
        ``raw_embedding`` columns).
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point this at a trusted, locally produced pickle.
    with open(path, 'rb') as handle:
        master_exploded = pickle.load(handle)
    print("Exploded DF Shape:", master_exploded.shape)
    print("Successfully Loaded!")
    return master_exploded


def sentence_embedding_generator(query):
    """Encode *query* with the module-level sentence-transformer model.

    The model is loaded once when the module is imported; it is reused here
    rather than being re-instantiated for every request.

    Args:
        query: Text to embed.

    Returns:
        A ``(embeddings, query)`` tuple: the output of ``model.encode`` and
        the query unchanged.
    """
    print(f'You entered {query}')
    embeddings = model.encode(query)
    return embeddings, query


def cosine_similarity_generator(master_exploded, embeddings, query, filename=None,
                                *, threshold=0.6, k=10):
    """Score every row against the query embedding and summarise the matches.

    Side effect: adds (or overwrites) the ``query`` and ``cos_sim`` columns
    on *master_exploded* in place.

    Args:
        master_exploded: DataFrame with ``raw_embedding``, ``id``, ``name``
            and ``tokenized_sentences`` columns.
        embeddings: Query vector to compare each ``raw_embedding`` with.
        query: The query text; stored in the ``query`` column.
        filename: Label for a per-run CSV export. The export is currently
            disabled, so this value is accepted for caller compatibility but
            not used.
        threshold: Rows with ``cos_sim`` strictly above this value are kept
            for the aggregation.
        k: Number of highest-scoring rows returned as the top matches.

    Returns:
        A tuple ``(above_threshold, top_matches, cosine_sum_by_name)``:
        the rows whose similarity exceeds *threshold*; the *k* rows with the
        highest similarity (sorted descending); and the sum of ``cos_sim``
        per (id, name, sentence) group over the above-threshold rows, with
        columns ``MentorID``, ``Name``, ``Sentences`` and ``cos_sim_sum``.
        The aggregated frame is ordered by group key, not by score.
    """
    print("Current datetime: ", time.strftime("%Y%m%d-%H%M%S"))

    master_exploded['query'] = query

    # The query norm is the same for every row, so compute it only once.
    query_norm = norm(embeddings)
    master_exploded['cos_sim'] = master_exploded['raw_embedding'].progress_apply(
        lambda vec: np.dot(embeddings, vec) / (query_norm * norm(vec))
    )

    above_threshold = master_exploded[master_exploded['cos_sim'] > threshold]
    print(f"The number of results with cosine similarity > {threshold} are: ",
          len(above_threshold))

    top_matches = master_exploded.sort_values(by=['cos_sim'], ascending=False).head(k)
    print(f" The top k={k} results have a min cosine similarity of: ",
          top_matches['cos_sim'].min())

    print(f"Taking sum of cosine similarities above {threshold} threshold...")
    cosine_sum_by_name = (
        above_threshold
        .groupby(["id", "name", "tokenized_sentences"])
        .agg(cos_sim_sum=("cos_sim", "sum"))
        .reset_index()
        .rename(
            columns={"id": "MentorID", "name": "Name", "tokenized_sentences": "Sentences"},
            errors="raise",
        )
    )

    return above_threshold, top_matches, cosine_sum_by_name


def dataframe_output(cosine_sum_by_name):
    """Serialise a DataFrame to a column-oriented JSON string."""
    return cosine_sum_by_name.to_json(orient="columns")


def generate_results(input):
    """Run the full search pipeline for one query (the Gradio callback).

    Args:
        input: The user's query; converted with ``str`` before embedding.
            (The name shadows the builtin but is kept so existing callers
            are unaffected.)

    Returns:
        A tuple ``(subset, sentence_output)``: the 10 aggregated rows with
        the highest ``cos_sim_sum``, and the top individual sentence matches
        with columns ``name``, ``id``, ``tokenized_sentences``, ``cos_sim``.
    """
    master_exploded = load_pickle()
    embeddings, query = sentence_embedding_generator(str(input))
    _, top_10, cosine_sum_by_name = cosine_similarity_generator(
        master_exploded, embeddings, query, time.strftime("%Y%m%d-%H%M%S")
    )
    print(cosine_sum_by_name.columns)

    # Each frame is round-tripped through JSON before being handed to Gradio.
    # The string is wrapped in StringIO because passing literal JSON to
    # pd.read_json is deprecated.
    df_output = pd.read_json(StringIO(dataframe_output(cosine_sum_by_name)))
    print(df_output)

    top_10 = top_10[['name', 'id', 'tokenized_sentences', 'cos_sim']]
    sentence_output = pd.read_json(StringIO(dataframe_output(top_10)))
    print("JSON created...")

    # The aggregated frame arrives ordered by group key, so rank it by score
    # before taking the first 10 rows; otherwise the "top" rows are arbitrary.
    subset = df_output.sort_values(by="cos_sim_sum", ascending=False).head(10)
    return subset, sentence_output


iface = gr.Interface(
    fn=generate_results,
    inputs=gr.inputs.Textbox(label="What kind of mentor are you looking for?"),
    outputs=[gr.outputs.Dataframe(type="pandas"), gr.outputs.Dataframe(type="pandas")],
    title="SharpestMinds Mentor Recommender Semantic Search App",
    description="Converts a string query into an embedding, and then compares the aggregate cosine similarity by mentor.",
)

# NOTE(security): the login used to be hard-coded only. Set GRADIO_AUTH_USER
# and GRADIO_AUTH_PASSWORD in the environment to override it; the previous
# values remain as a fallback so existing deployments keep working, but they
# should be rotated and removed from source control.
_auth_user = os.environ.get("GRADIO_AUTH_USER", "admin")
_auth_password = os.environ.get("GRADIO_AUTH_PASSWORD", "russell2023")
iface.launch(auth=(_auth_user, _auth_password))