Spaces:
Sleeping
Sleeping
File size: 6,859 Bytes
8ad2c40 a670e13 8ad2c40 3a6a265 8ad2c40 3a6a265 8ad2c40 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
# Import libraries
import os
import requests
import re
from yt_dlp import YoutubeDL
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import arxiv
import numpy as np
# Read the Hugging Face access token from the "HF_Token" environment
# variable and authenticate this process with the Hub.
HF_TOKEN = os.environ.get("HF_Token")
login(token=HF_TOKEN)

# Sentence-embedding model used to score candidate titles against the query.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Local files that are always offered as candidates, keyed by module name.
file_paths = dict(
    video="./Machine Learning.mp4",  # Replace with actual paths
    paper="./DeepSeek_v3.pdf",
)

# Output folders for downloaded videos and papers; created up front so the
# download helpers can write into them immediately.
download_path, papers_path = "./downloads", "./papers"
os.makedirs(download_path, exist_ok=True)
os.makedirs(papers_path, exist_ok=True)
# Load the Llama 3.2 1B Instruct checkpoint and its tokenizer.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
# `use_auth_token` is deprecated in transformers; `token` is its replacement.
# The tokenizer load already requested authentication, so the same token is
# passed to the weights download for consistency.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",   # let the library decide where the weights live
    torch_dtype="auto",  # keep the dtype stored in the checkpoint
    token=HF_TOKEN,
)
# Define utility functions
def compute_similarity(query_embedding, content_embeddings):
    """Score every content embedding against the query embedding.

    Args:
        query_embedding: Embedding vector of the query.
        content_embeddings: Sequence of embedding vectors to compare with.

    Returns:
        A flat array with one cosine-similarity score per content embedding,
        in the same order as ``content_embeddings``.
    """
    # cosine_similarity works on 2-D inputs, so wrap the query as one row.
    query_row = [query_embedding]
    return cosine_similarity(query_row, content_embeddings).flatten()
def add_local_files(module):
    """Return metadata for the local file registered under ``module``.

    Args:
        module: Key into the module-level ``file_paths`` mapping
            (currently ``"video"`` or ``"paper"``).

    Returns:
        A one-element list holding a metadata dict with the keys ``title``
        (base name of the file), ``url`` (always ``None`` for local files),
        ``file_path`` and ``type`` (the module name). An empty list is
        returned when ``module`` has no entry in ``file_paths``.
    """
    if module not in file_paths:
        return []
    file_path = file_paths[module]
    # The entry's type is the module name itself, so one code path serves
    # every registered module and the function never falls through to an
    # implicit ``None`` that would break callers using ``list.extend``.
    return [
        {
            "title": os.path.basename(file_path),
            "url": None,
            "file_path": file_path,
            "type": module,
        }
    ]
def download_youtube_video(video_url, output_dir, title=None):
    """Download a YouTube video using yt_dlp and report where it was saved.

    Args:
        video_url: URL of the video to download.
        output_dir: Directory the file is written into.
        title: Optional base name for the saved file. Characters that are
            not allowed in file names are replaced with ``_``. When omitted,
            yt_dlp names the file after the video's own title.

    Returns:
        The path yt_dlp computed for the downloaded file, or ``None`` when
        the download failed (the error is printed).
    """
    if title:
        sanitized_title = re.sub(r'[\\/*?:"<>|]', '_', title)
        # '%' introduces a field in yt_dlp output templates, so a literal
        # percent sign in the title has to be doubled.
        name_template = sanitized_title.replace('%', '%%') + ".%(ext)s"
    else:
        name_template = "%(title)s.%(ext)s"
    ydl_opts = {
        'quiet': True,
        'outtmpl': f"{output_dir}/{name_template}",
        'format': 'best',
    }
    try:
        with YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=True)
            # Ask yt_dlp for the real file name instead of assuming
            # "<title>.mp4": the container is not always mp4, and without
            # a title the old code returned the bogus path "None.mp4".
            return ydl.prepare_filename(info)
    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(f"Failed to download video {video_url}. Error: {e}")
        return None
def fetch_and_download_youtube_video(query, output_dir="./downloads"):
    """Search YouTube for ``query`` and download the first hit.

    Args:
        query: Free-text search string.
        output_dir: Directory the video is downloaded into.

    Returns:
        A one-element list with the metadata dict of the first search result
        (``title``, ``url``, ``file_path``, ``type``), or an empty list when
        the search or the metadata lookup raised (the error is printed).
    """
    # Search only: the download itself is delegated to
    # download_youtube_video below.
    search_opts = dict(
        quiet=True,
        noplaylist=True,
        default_search='ytsearch',
        max_downloads=1,
        skip_download=True,
    )
    try:
        with YoutubeDL(search_opts) as ydl:
            top_hit = ydl.extract_info(query, download=False)['entries'][0]
            video_title, video_url = top_hit['title'], top_hit['webpage_url']
            saved_to = download_youtube_video(video_url, output_dir, title=video_title)
            entry = {
                "title": video_title,
                "url": video_url,
                "file_path": saved_to,
                "type": "video",
            }
            return [entry]
    except Exception as e:
        print(f"Error fetching YouTube video for query '{query}': {e}")
        return []
def fetch_from_arxiv(query="machine learning", max_results=2, output_dir="./papers", timeout=30):
    """Fetch papers from arXiv and download their PDFs.

    Args:
        query: Search string sent to arXiv.
        max_results: Maximum number of search results to request.
        output_dir: Directory the PDFs are written into.
        timeout: Seconds to wait for each PDF request before giving up.

    Returns:
        A list of metadata dicts (``title``, ``url``, ``file_path``,
        ``type``), one per PDF that was downloaded successfully. Papers
        whose download fails are reported on stdout and skipped.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    # The query becomes part of the file name, so strip characters that are
    # not valid in paths (same character set as download_youtube_video).
    safe_query = re.sub(r'[\\/*?:"<>|]', '_', query).replace(' ', '_')
    metadata = []
    for i, result in enumerate(search.results()):
        pdf_url = result.pdf_url
        local_path = os.path.join(output_dir, f"{safe_query}_arxiv_{i}.pdf")
        try:
            # Without a timeout a stalled connection would hang the pipeline.
            response = requests.get(pdf_url, timeout=timeout)
            if response.status_code != 200:
                print(f"Skipping paper {pdf_url}: HTTP status {response.status_code}")
                continue
            with open(local_path, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as e:
            print(f"Error downloading paper: {e}")
            continue
        metadata.append(
            {"title": result.title, "url": pdf_url, "file_path": local_path, "type": "paper"}
        )
    return metadata
def generate_llama_response(query, context=None):
    """Generate a response with the module-level Llama model.

    Args:
        query: The user's question.
        context: Optional extra text placed in the prompt after the query.

    Returns:
        The decoded output sequence with special tokens removed. The whole
        sequence is decoded, so the text starts with the prompt itself.
    """
    input_text = f"Query: {query}\n"
    if context:
        input_text += f"Context: {context}\n"
    input_text += "Answer:"
    # The model is placed with device_map="auto", so follow it to whatever
    # device it landed on instead of assuming a GPU is present.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    # Unpack the full encoding so the attention mask is passed along with
    # the input ids. max_length counts prompt tokens as well as new ones.
    outputs = model.generate(**inputs, max_length=500, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
def hybrid_rag_system_with_llama(query):
    """Retrieve the best video and paper for ``query`` and ask the model to answer.

    For each module ("video", then "paper") the local file and the freshly
    fetched items are collected, each item's "title (type)" label is embedded
    and compared with the query embedding, and the highest-scoring item is
    kept as that module's best match.

    Args:
        query: Free-text user query.

    Returns:
        A ``(final_results, final_response)`` tuple. ``final_results`` maps
        each module name to a dict with ``best_match``, ``similarity`` and
        ``all_metadata``; ``final_response`` is the generated text.
    """
    query_vector = embedding_model.encode(query)
    final_results = {}

    for module in ("video", "paper"):
        # Local files first, then whatever the online source returns.
        candidates = [*add_local_files(module)]
        if module == "video":
            candidates += fetch_and_download_youtube_video(query, output_dir=download_path)
        elif module == "paper":
            candidates += fetch_from_arxiv(query, max_results=2, output_dir=papers_path)

        if not candidates:
            final_results[module] = {"best_match": None, "similarity": None, "all_metadata": []}
            continue

        labels = [f"{entry['title']} ({entry['type']})" for entry in candidates]
        label_vectors = [embedding_model.encode(label) for label in labels]
        scores = compute_similarity(query_vector, label_vectors)

        # Record each candidate's score on its own metadata entry.
        for entry, score in zip(candidates, scores):
            entry["similarity"] = score

        top = np.argmax(scores)
        final_results[module] = {
            "best_match": candidates[top],
            "similarity": scores[top],
            "all_metadata": candidates,
        }

    best_video = final_results['video']['best_match']
    best_paper = final_results['paper']['best_match']
    if best_video:
        video_context = f"Best Video: {best_video['title']}"
    else:
        video_context = "No relevant video found."
    if best_paper:
        paper_context = f"Best Paper: {best_paper['title']}"
    else:
        paper_context = "No relevant paper found."

    final_response = generate_llama_response(query, f"{video_context}\n{paper_context}")
    return final_results, final_response
# Example run: push one query through the whole pipeline and show the answer.
query = "short easy machine learning"
results, final_response = hybrid_rag_system_with_llama(query)
print("\nFinal Response Generated by Llama 3:", final_response, sep="\n")
|