File size: 6,035 Bytes
35d7369 0d0eac6 35d7369 0d0eac6 35d7369 0d0eac6 35d7369 0d0eac6 35d7369 0d0eac6 35d7369 0d0eac6 35d7369 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# Required Libraries
from sentence_transformers import SentenceTransformer
import fitz # PyMuPDF
import os
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from qdrant_client import QdrantClient # Import Qdrant client
import uuid # Add this import at the top of your file
import json
from huggingface_hub import login
# Load environment variables from .env file
load_dotenv()
# Authenticate with the Hugging Face Hub so the fine-tuned embedding model
# can be downloaded later.
# NOTE(review): runs at import time; behavior when HF_TOKEN is unset depends
# on huggingface_hub — confirm this should be mandatory.
login(token=os.getenv("HF_TOKEN"))

# Initialize the Qdrant client. The cluster URL can now be overridden via the
# QDRANT_URL environment variable; the previous hard-coded URL remains the
# default, so existing deployments are unaffected.
qdrant_api_key = os.getenv("QDRANT_API_KEY")  # Qdrant API key from environment
qdrant_client = QdrantClient(
    url=os.getenv(
        "QDRANT_URL",
        "https://9266da83-dbfe-48d6-b2d8-cdf101299284.europe-west3-0.gcp.cloud.qdrant.io",
    ),
    api_key=qdrant_api_key,
    timeout=300,  # 5 minutes: large batch upserts over the network can be slow
)
# New function to load processed documents
def load_processed_docs():
    """Return the set of already-processed PDF filenames.

    Reads 'processed_docs.json' from the working directory. A missing or
    corrupt tracking file is treated as "nothing processed yet" rather than
    crashing, so a damaged file only causes re-processing, never a failure.
    """
    try:
        with open('processed_docs.json', 'r') as f:
            return set(json.load(f))
    except (FileNotFoundError, json.JSONDecodeError):
        return set()
# New function to save processed documents
def save_processed_docs(processed_docs):
    """Persist the set of processed PDF filenames to 'processed_docs.json'.

    The names are written sorted so the file content is deterministic and
    diff-friendly (set iteration order is arbitrary).
    """
    with open('processed_docs.json', 'w') as f:
        json.dump(sorted(processed_docs), f)
# Modified function to create the Qdrant collection if it doesn't exist
def create_qdrant_collection(collection_name, vector_size):
    """Ensure a Qdrant collection with cosine distance exists.

    Args:
        collection_name: Name of the collection to create if missing.
        vector_size: Dimensionality of the vectors the collection will store.
    """
    try:
        # get_collection raises when the collection does not exist.
        qdrant_client.get_collection(collection_name)
        print(f"Collection '{collection_name}' already exists.")
    except Exception:
        # NOTE(review): any failure (including transient network errors) falls
        # through to creation; narrowing this to the client's "not found"
        # exception would be safer — confirm what qdrant_client raises here.
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config={
                "size": vector_size,
                "distance": "Cosine"
            }
        )
        print(f"Collection '{collection_name}' created successfully.")
# Modified function to store embeddings in Qdrant
def store_embeddings_in_qdrant(embedded_chunks, collection_name):
    """Upload theme-tagged embedding vectors to a Qdrant collection in batches.

    Args:
        embedded_chunks: Mapping of theme name -> sequence of embedding
            vectors (plain lists or numpy arrays).
        collection_name: Target Qdrant collection.
    """
    # NOTE(review): only the theme is stored in the payload — the original
    # chunk text is not, so it cannot be recovered from Qdrant at query time.
    # Confirm this is intentional.
    points = [
        {
            "id": str(uuid.uuid4()),  # unique id per point
            # tolist() converts numpy arrays (SentenceTransformer output) to
            # plain lists, which the Qdrant API expects; lists pass through.
            "vector": embedding.tolist() if hasattr(embedding, "tolist") else embedding,
            "payload": {"theme": theme},
        }
        for theme, embeddings in embedded_chunks.items()
        for embedding in embeddings
    ]
    # Upload in batches so each HTTP request stays small.
    batch_size = 100
    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]
        batch_no = start // batch_size + 1
        try:
            qdrant_client.upsert(
                collection_name=collection_name,
                points=batch
            )
            print(f"Uploaded batch {batch_no} to collection '{collection_name}'.")
        except Exception as e:
            print(f"Error uploading batch {batch_no}: {e}")
    print(f"Finished uploading {len(points)} points to collection '{collection_name}'.")
# Step 1: Extract Text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Uses the document as a context manager so the file handle is closed even
    if text extraction raises (the original leaked the open document).
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
# Step 2: Define Themes
# Classification buckets used by chunk_text(): a chunk is assigned to the
# first theme whose name appears in its text. "Unclassified" is the catch-all
# for chunks that mention none of the other theme names.
themes = [
    "Safe and Effective Systems",
    "Algorithmic Discrimination Protections",
    "Data Privacy",
    "Notice and Explanation",
    "Human Alternatives",
    "Risk Management",
    "Governance",
    "Trustworthiness",
    "Unclassified"
]
# Step 3: Chunk the Text
def chunk_text(text, themes):
    """Split *text* into overlapping chunks and bucket them by theme.

    A chunk goes into the first theme whose name appears (case-insensitively)
    in the chunk's text; chunks matching no theme go to "Unclassified".

    Args:
        text: Full document text.
        themes: Iterable of theme names to match against.

    Returns:
        dict mapping each theme name (plus "Unclassified") to its chunk list.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_text(text)
    thematic_chunks = {theme: [] for theme in themes}
    # Guarantee the catch-all bucket even if "Unclassified" is absent from
    # themes (the original unconditionally reset it, dropping nothing here
    # since buckets start empty — setdefault preserves that behavior safely).
    thematic_chunks.setdefault("Unclassified", [])
    for chunk in chunks:
        chunk_lower = chunk.lower()  # hoisted: lower once, not once per theme
        for theme in themes:
            if theme.lower() in chunk_lower:
                thematic_chunks[theme].append(chunk)
                break
        else:
            # No theme name appeared anywhere in the chunk.
            thematic_chunks["Unclassified"].append(chunk)
    return thematic_chunks
# Step 4: Embed the Chunks
def embed_chunks_openai(thematic_chunks):
    """Embed each theme's chunks with OpenAI's text-embedding-3-small model.

    Args:
        thematic_chunks: Mapping of theme name -> list of text chunks.

    Returns:
        dict with the same theme keys, each value the list of embedding
        vectors for that theme's chunks.
    """
    api_key = os.getenv("OPENAI_API_KEY")  # read the API key from the environment
    embedder = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=api_key)
    embedded_chunks = {}
    for theme, chunks in thematic_chunks.items():
        embedded_chunks[theme] = embedder.embed_documents(chunks)
    return embedded_chunks
def embed_chunks_fine_tuned(thematic_chunks):
    """Embed each theme's chunks with the fine-tuned SentenceTransformer.

    Args:
        thematic_chunks: Mapping of theme name -> list of text chunks.

    Returns:
        dict with the same theme keys, each value the encoded vectors for
        that theme's chunks.
    """
    encoder = SentenceTransformer("svb01/fine-tuned-embedding-model")
    embedded_chunks = {}
    for theme, chunks in thematic_chunks.items():
        embedded_chunks[theme] = encoder.encode(chunks)
    return embedded_chunks
# The rest of app.py remains the same
# Modified main execution block
def main():
    """Embed any new PDFs in ./resources and upsert them into Qdrant.

    Filenames already handled are tracked in processed_docs.json, so reruns
    only process documents added since the last run.
    """
    resources_folder = "resources"
    processed_docs = load_processed_docs()
    new_docs_processed = False
    collection_ensured = False  # ensure the collection exactly once per run
    collection_name = "ai_info_collection"

    for filename in os.listdir(resources_folder):
        if not filename.endswith(".pdf") or filename in processed_docs:
            continue
        pdf_path = os.path.join(resources_folder, filename)
        text = extract_text_from_pdf(pdf_path)
        thematic_chunks = chunk_text(text, themes)
        embedded_chunks = embed_chunks_fine_tuned(thematic_chunks)

        if not collection_ensured:
            # Bug fix: the original read the first theme bucket's first
            # vector, which raises IndexError whenever that bucket is empty.
            # Take the vector size from the first non-empty bucket instead.
            vector_size = next(
                (len(vectors[0]) for vectors in embedded_chunks.values() if len(vectors) > 0),
                None,
            )
            if vector_size is not None:
                create_qdrant_collection(collection_name, vector_size)
                collection_ensured = True

        store_embeddings_in_qdrant(embedded_chunks, collection_name)
        processed_docs.add(filename)
        new_docs_processed = True
        # Bug fix: the original f-string had no placeholder and printed the
        # literal text "(unknown)" instead of the document name.
        print(f"Processed and added embeddings for {filename}")

    if new_docs_processed:
        save_processed_docs(processed_docs)
        print("New documents processed and added to the collection.")
    else:
        print("No new documents to process.")

if __name__ == "__main__":
    main()