# NOTE: this span originally contained Hugging Face Spaces page chrome captured
# by scraping (runtime-error banner, file size, commit hashes, line-number
# gutter). It is not part of the module source and has been neutralized into a
# comment so the file parses.
import pandas as pd
from rank_bm25 import BM25Okapi
import numpy as np
from transformers import AutoTokenizer
from rank_bm25 import BM25Okapi
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import pandas as pd
# Build the training corpus: gold-labelled rows merged with high-confidence
# classifier-labelled rows, then shuffled.
dataset = pd.read_csv("filtered_133k_data_cleanlab.csv")

# Rows that already carry a human label.
gold = dataset[['text', 'label', "Chat_ID", "x", "y"]].dropna()

# Rows labelled by the classifier; keep only confident ones (> 0.9) and
# rename `classifier_label` so both frames share the same schema.
pseudo = dataset[["text", "classifier_label", "Chat_ID", "scores_proba_countvectr", "x", "y"]].dropna()
pseudo = pseudo[pseudo.scores_proba_countvectr > 0.9]
pseudo = pseudo[["text", "classifier_label", "Chat_ID", "x", "y"]]
pseudo.columns = ["text", "label", "Chat_ID", "x", "y"]

dataset = pd.concat((gold, pseudo)).reset_index(drop=True)
# frac=1 sample == full shuffle (no random_state, so order varies per run).
dataset = dataset.sample(frac=1).reset_index(drop=True)
class KeyWordSearch:
    """BM25 keyword search over the `text` column of a corpus DataFrame."""

    def __init__(self, corpus: pd.DataFrame, tokenizer=None):
        """
        corpus : DataFrame with a `text` column to index.
        tokenizer : optional tokenizer; accepted for interface compatibility
            but unused — documents and queries are whitespace-split.
        """
        self.corpus = corpus
        self.tokenizer = tokenizer  # if you want
        # Whitespace tokenization, mirroring how queries are split in get_top_10.
        self.tokenized_corpus = [doc.split(" ") for doc in self.corpus['text']]
        self.search_engine = BM25Okapi(self.tokenized_corpus)

    def get_top_10(self, query):
        """Return {document_text: bm25_score} for the 10 highest-scoring docs.

        Duplicate document texts collapse into a single dict key (last one
        wins), matching the original dict(zip(...)) behavior. Fewer than 10
        docs in the corpus simply yields a smaller dict.
        """
        tokenized_query = query.split(" ")
        scores = self.search_engine.get_scores(tokenized_query)
        # Top-10 indices in descending score order: argsort ascending,
        # reverse, slice — replaces the manual reversed-loop-with-break.
        top_indices = np.argsort(scores)[::-1][:10]
        # float(...) matches the original .tolist() conversion to Python floats.
        return {
            self.corpus['text'].iloc[i]: float(scores[i])
            for i in top_indices
        }
class VectorSearch:
    """Semantic search over a corpus via HuggingFace embeddings + FAISS."""

    def __init__(self, corpus):
        """
        corpus : list of text
        """
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        self.text_splitter = splitter
        self.docs = splitter.create_documents(corpus)
        # CPU-only encoder; embeddings are deliberately left unnormalized.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="omarelsayeed/bert_large_mnr",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': False},
        )
        self.db = FAISS.from_documents(self.docs, self.embeddings)
        self.retriever = self.db.as_retriever()

    def search_query(self, query):
        """Return (scored-results DataFrame, MMR results) for `query`."""
        scored_hits = self.db.similarity_search_with_score(query, k=10)
        hit_table = pd.DataFrame(
            [[doc.page_content, score] for doc, score in scored_hits]
        )
        # NOTE(review): `return_score` is not a documented kwarg of
        # max_marginal_relevance_search — confirm it is accepted by the
        # installed langchain version (likely source of a runtime error).
        mmr_hits = self.db.max_marginal_relevance_search(query, k=10, return_score=True)
        return (hit_table, mmr_hits)
import gradio as gr
import pandas as pd
# Re-read the raw CSV for the UI. Note this is the *unfiltered* file — the
# cleaned/merged `dataset` built above is not what the app searches over.
df = pd.read_csv('filtered_133k_data_cleanlab.csv')
class CurrentLabel:
    # Module-wide mutable state: remembers which label's vector store was
    # last built, so VCC.filter_corpus can skip rebuilding it on repeat
    # queries for the same label.
    current_label = None
class VCC:
    """Routes a query to BM25 or semantic search over the label-filtered corpus."""

    def __init__(self):
        self.vcc = None            # cached VectorSearch for the current label
        self.current_label = None  # unused; label tracking lives in CurrentLabel

    def filter_corpus(self, label, search_query, search_method):
        """Return {result_text: score} for `search_query` within rows of `df`
        whose label equals `label`, using the chosen `search_method`
        ("BM25" or "Semantic"); returns a message string for any other method.
        """
        corpus = df[df['label'] == label]
        if search_method == "BM25":
            # Build the BM25 index only on this branch — the original built it
            # unconditionally, paying the indexing cost even for Semantic calls.
            return KeyWordSearch(corpus).get_top_10(search_query)
        if search_method == "Semantic":
            # Building embeddings + a FAISS store is expensive, so rebuild
            # only when the selected label changes.
            if CurrentLabel.current_label != label:
                CurrentLabel.current_label = label
                self.vcc = VectorSearch(corpus.text.tolist())
            hits = self.vcc.db.similarity_search_with_score(search_query, k=10)
            # Duplicate texts collapse to one key (last wins), matching the
            # original dict(zip(...)) behavior.
            return {doc.page_content: float(score) for doc, score in hits}
        return "No results found."
v = VCC()

# Gradio UI wiring.
# NOTE: the original used the `gr.inputs` / `gr.outputs` namespaces, which
# were removed in modern Gradio releases and crash at import time (the likely
# "Runtime error" on the Space). The top-level component classes below are
# the supported replacements with identical arguments.
label_dropdown = gr.Dropdown(choices=list(df['label'].unique()), label="Select Label")
search_query_input = gr.Textbox(label="Search Query")
search_method_radio = gr.Radio(["BM25", "Semantic"], label="Search Method")

search_interface = gr.Interface(
    fn=v.filter_corpus,
    inputs=[label_dropdown, search_query_input, search_method_radio],
    outputs=gr.Label(label="Search Results"),
    title="Search and Filter Corpus"
)
search_interface.launch()