import gradio as gr
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
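
# Note: newer LangChain releases move the splitter to the separate
# `langchain_text_splitters` package; the import above matches older releases.

# Defaults for the chunking controls. chunk_size and chunk_overlap are counted
# in characters, because the splitter below is configured with length_function=len.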
DEFAULT_CHUNK_SIZE = 100
DEFAULT_CHUNK_OVERLAP = 0
DEFAULT_NUM_CHUNKS = 10
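
# all-MiniLM-L6-v2 is a compact SentenceTransformer model that maps text to
# 384-dimensional embedding vectors; a common default for lightweight demos.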
model = SentenceTransformer('all-MiniLM-L6-v2')


def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Splits the input text into chunks with the selected method and returns
    one dataframe row per chunk.
    """
    # Gradio Number inputs arrive as floats; the splitter expects ints.
    num_chunks = int(num_chunks)
    output = []

    if not text.strip():
        # Include the Embeddings column so the column selection in
        # update_output also works when there is nothing to process.
        return pd.DataFrame(columns=['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings'])

    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(chunk_size),
            chunk_overlap=int(chunk_overlap),
            length_function=len,
            is_separator_regex=False,
        )
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        for i, chunk in enumerate(tokenized_texts):
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                # Whitespace word count, used here as a rough token estimate.
                'Token Count': len(chunk.split())
            })

    df = pd.DataFrame(output)
    return df


def calculate_embeddings(df):
    """
    Calculates an embedding vector for each text chunk in the dataframe.
    """
    if df.empty:
        return df

    chunks = df['Text Chunk'].tolist()
    # model.encode returns a 2-D numpy array: one embedding row per chunk.
    embeddings = model.encode(chunks)
    df['Embeddings'] = embeddings.tolist()
    return df


def search_similar_chunks(query, df_with_embeddings):
    """
    Ranks chunks by cosine similarity between the query embedding and each
    chunk embedding, most similar first.
    """
    query_embedding = model.encode([query])[0]

    # Stack the per-row embedding lists back into a 2-D array for scikit-learn.
    chunk_embeddings = np.vstack(df_with_embeddings['Embeddings'])
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]

    # Place the scores next to the chunk index for easier reading.
    df_with_embeddings.insert(1, 'Similarity', similarity_scores)

    return df_with_embeddings.sort_values(by='Similarity', ascending=False)


def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the text and calculates embeddings for the resulting chunks.
    """
    df = tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    df_with_embeddings = calculate_embeddings(df)
    return df_with_embeddings


def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)
    # Only run the similarity search when there is a query and at least one chunk;
    # searching an empty dataframe would fail inside np.vstack.
    if query and not df_with_embeddings.empty:
        df_with_embeddings = search_similar_chunks(query, df_with_embeddings)
        return df_with_embeddings[['Chunk #', 'Similarity', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]
    return df_with_embeddings[['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]


iface = gr.Interface(
    fn=update_output,
    inputs=[
        # Set an explicit default so `method` is never None on first submit.
        gr.Dropdown(
            label="Select Tokenization Method",
            choices=["RecursiveCharacterTextSplitter"],
            value="RecursiveCharacterTextSplitter",
        ),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here.")
    ],
    outputs=gr.Dataframe(height=900),
    title="Text Tokenization and Embedding Tool",
    description="A tool for chunking text and calculating embeddings, with an optional similarity search over the chunks."
)
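

# Gradio serves on http://127.0.0.1:7860 by default; pass share=True to
# launch() if you need a temporary public link.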
if __name__ == "__main__":
    iface.launch()