# Spaces: Sleeping
import os | |
import pandas as pd | |
import PyPDF2 | |
import spacy | |
import faiss | |
from sentence_transformers import SentenceTransformer | |
import torch | |
import gradio as gr | |
# Load and preprocess PDF text
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        One string holding the extracted text of all pages in page order,
        with nothing inserted between pages. A page that yields no text
        contributes an empty string.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # The pages must be read while the file is still open, so the join
        # happens inside the ``with`` block. ``or ""`` guards against a page
        # whose extraction result is empty/None so concatenation cannot fail.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Path to your PDF file
pdf_path = 'FridayMaster/UBANTUMANUAL/Getting Started with Ubuntu 16.04.pdf'

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Number of whitespace-separated words stored per retrievable passage.
CHUNK_WORDS = 200

# Convert the text to a DataFrame with one passage per row.
# Storing the whole manual as a single row would leave the search index with
# exactly one vector, so every query would return the entire document.
_words = pdf_text.split()
_chunks = [
    ' '.join(_words[start:start + CHUNK_WORDS])
    for start in range(0, len(_words), CHUNK_WORDS)
]
# Fall back to one row with the raw text when no words were extracted, so the
# frame is never empty for the steps that follow.
df = pd.DataFrame({'text': _chunks or [pdf_text]})
# Load the custom embedding model
class CustomEmbeddingModel:
    """Thin wrapper around a ``SentenceTransformer`` model."""

    def __init__(self, model_name):
        """Create the wrapped ``SentenceTransformer`` from ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_text(self, text):
        """Encode ``text`` with the wrapped model, requesting a tensor result."""
        encoded = self.model.encode(text, convert_to_tensor=True)
        return encoded
# Model name handed to CustomEmbeddingModel.
EMBEDDING_MODEL_NAME = 'distilbert-base-uncased'  # Replace with your model name
embedding_model = CustomEmbeddingModel(EMBEDDING_MODEL_NAME)

# Load Spacy model for preprocessing
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)
def preprocess_text(text):
    """Reduce ``text`` to a space-separated string of lowercase lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``; only
    tokens whose ``is_alpha`` flag is set are kept.
    """
    lemmas = []
    for token in nlp(text):
        if not token.is_alpha:
            continue
        lemmas.append(token.lemma_.lower())
    return ' '.join(lemmas)
# Apply preprocessing and embedding
df['text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['text'].apply(embedding_model.embed_text)

# Create a FAISS index
# Stack the per-row embedding tensors into one tensor with a row per text.
embeddings = torch.stack(df['text_embeddings'].tolist())
# The index dimension is taken from the embeddings themselves rather than a
# hard-coded size, so it always matches whatever the embedding model emits.
faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
# Tensor.numpy() only works for CPU tensors that do not require grad, so
# detach and move to CPU before handing the vectors to FAISS.
faiss_index.add(embeddings.detach().cpu().numpy())
# Function to generate a response
def generate_response(prompt):
    """Return the indexed text that is nearest to ``prompt``.

    The prompt is embedded with ``embedding_model``, the single nearest
    neighbour is looked up in ``faiss_index``, and the ``text`` value of the
    matching row of ``df`` is returned.

    Args:
        prompt: Query text to search for.

    Returns:
        The ``text`` value of the closest row in ``df``.
    """
    # The search call takes a batch of queries, hence the added leading axis.
    query_embedding = embedding_model.embed_text(prompt).unsqueeze(0)
    # Tensor.numpy() fails for tensors that live off-CPU or require grad, so
    # detach and move to CPU before converting.
    query_vector = query_embedding.detach().cpu().numpy()
    # Only the neighbour's row position is needed; the distance is unused.
    _distances, indices = faiss_index.search(query_vector, k=1)
    return df.iloc[indices[0][0]]['text']
# Gradio interface
# Input and output widgets are built first so the Interface call stays short.
query_box = gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu...")
response_box = gr.Textbox(label="Response")

iface = gr.Interface(
    fn=generate_response,
    inputs=query_box,
    outputs=response_box,
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual.",
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()