File size: 6,859 Bytes
8ad2c40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a670e13
8ad2c40
 
 
 
 
 
 
3a6a265
8ad2c40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a6a265
8ad2c40
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Import libraries
import os
import requests
import re
from yt_dlp import YoutubeDL
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import arxiv
import numpy as np

# Access the Hugging Face token from the environment variable.
# NOTE(review): the env var name uses unusual casing ("HF_Token") — confirm
# it matches how the token is exported in the deployment environment.
HF_TOKEN = os.getenv("HF_Token")
login(token=HF_TOKEN)

# Initialize the sentence-embedding model used to rank retrieved content.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Paths to pre-downloaded local content, plus output directories for
# freshly fetched videos and papers (created if missing).
file_paths = {
    "video": "./Machine Learning.mp4",  # Replace with actual paths
    "paper": "./DeepSeek_v3.pdf",
}
download_path = "./downloads"
papers_path = "./papers"
os.makedirs(download_path, exist_ok=True)
os.makedirs(papers_path, exist_ok=True)

# Load the instruction-tuned Llama 3.2 model and tokenizer.
# NOTE(review): `use_auth_token` is deprecated in recent transformers
# releases in favour of `token=` — verify against the pinned version.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

# Define utility functions
def compute_similarity(query_embedding, content_embeddings):
    """Compute cosine similarity between a query embedding and each content embedding.

    Args:
        query_embedding: 1-D array-like of shape (dim,).
        content_embeddings: 2-D array-like (or list of 1-D vectors) of shape (n, dim).

    Returns:
        numpy.ndarray of shape (n,): cosine similarity of the query against
        each content vector.
    """
    query = np.asarray(query_embedding, dtype=float)
    contents = np.asarray(content_embeddings, dtype=float)
    q_norm = np.linalg.norm(query)
    c_norms = np.linalg.norm(contents, axis=1)
    # Guard zero vectors: treat their norm as 1 so they score 0.0 instead of
    # producing NaN (matches sklearn's cosine_similarity behaviour, which the
    # original delegated to — done here in plain numpy to skip the 1-vs-N
    # list wrapping and the extra dependency round-trip).
    if q_norm == 0:
        q_norm = 1.0
    c_norms = np.where(c_norms == 0, 1.0, c_norms)
    return contents @ query / (c_norms * q_norm)

def add_local_files(module):
    """Return metadata entries for a module's pre-registered local file.

    Args:
        module: Content category key ("video" or "paper").

    Returns:
        List with one metadata dict (title/url/file_path/type), or an empty
        list when no local file is registered for the module.
    """
    if module not in file_paths:
        return []
    file_path = file_paths[module]
    # The original's two branches were identical except for the "type" tag,
    # which always equals the module key itself; collapsing them also
    # guarantees a list return for any registered module (the original fell
    # through to an implicit None for keys other than "video"/"paper").
    return [{
        "title": os.path.basename(file_path),
        "url": None,
        "file_path": file_path,
        "type": module,
    }]

def download_youtube_video(video_url, output_dir, title=None):
    """Download a YouTube video with yt_dlp and return its local file path.

    Args:
        video_url: Full URL of the video to download.
        output_dir: Directory the file is written into.
        title: Optional title used to name the file (sanitized for the
            filesystem); when omitted, yt_dlp's own title template is used.

    Returns:
        Path of the downloaded file, or None when the download fails.
    """
    sanitized_title = re.sub(r'[\\/*?:"<>|]', '_', title) if title else None
    ydl_opts = {
        'quiet': True,
        'outtmpl': f"{output_dir}/{sanitized_title}.%(ext)s" if sanitized_title else f"{output_dir}/%(title)s.%(ext)s",
        'format': 'best',
    }
    try:
        with YoutubeDL(ydl_opts) as ydl:
            # extract_info(download=True) downloads AND returns the metadata,
            # so prepare_filename() yields the real output path. The original
            # hard-coded "<title>.mp4", which was wrong whenever the selected
            # format was not mp4 and returned ".../None.mp4" when no title
            # was supplied.
            info = ydl.extract_info(video_url, download=True)
            return ydl.prepare_filename(info)
    except Exception as e:
        print(f"Failed to download video {video_url}. Error: {e}")
        return None

def fetch_and_download_youtube_video(query, output_dir="./downloads"):
    """Search YouTube for `query`, download the top hit, and return its metadata.

    Args:
        query: Free-text search string (run through ytsearch).
        output_dir: Directory the downloaded video is written into.

    Returns:
        One-element list with a metadata dict (title/url/file_path/type) on
        success; empty list when the search or download fails.
    """
    search_opts = {
        'quiet': True,
        'noplaylist': True,
        'default_search': 'ytsearch',
        'max_downloads': 1,
        'skip_download': True,
    }
    try:
        with YoutubeDL(search_opts) as searcher:
            info = searcher.extract_info(query, download=False)
        top_hit = info['entries'][0]  # best-ranked search result
        hit_title = top_hit['title']
        hit_url = top_hit['webpage_url']
        local_path = download_youtube_video(hit_url, output_dir, title=hit_title)
        return [{"title": hit_title, "url": hit_url, "file_path": local_path, "type": "video"}]
    except Exception as e:
        print(f"Error fetching YouTube video for query '{query}': {e}")
        return []

def fetch_from_arxiv(query="machine learning", max_results=2, output_dir="./papers"):
    """Fetch papers from arXiv and download their PDFs.

    Args:
        query: Search string passed to the arXiv API.
        max_results: Maximum number of papers to retrieve.
        output_dir: Directory the PDFs are written into (must already exist).

    Returns:
        List of metadata dicts (title/url/file_path/type) for every PDF that
        downloaded successfully; failed downloads are logged and skipped.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    metadata = []
    for i, result in enumerate(search.results()):
        pdf_url = result.pdf_url
        filename = f"{query.replace(' ', '_')}_arxiv_{i}.pdf"
        local_path = os.path.join(output_dir, filename)
        try:
            # A timeout keeps a stalled connection from hanging the whole
            # pipeline forever (the original request had no timeout at all).
            response = requests.get(pdf_url, timeout=60)
            if response.status_code == 200:
                with open(local_path, 'wb') as f:
                    f.write(response.content)
                metadata.append({"title": result.title, "url": pdf_url, "file_path": local_path, "type": "paper"})
        except Exception as e:
            print(f"Error downloading paper: {e}")
    return metadata

def generate_llama_response(query, context=None):
    """Generate an answer with the loaded Llama model.

    Args:
        query: User question to answer.
        context: Optional retrieval context prepended to the prompt.

    Returns:
        The decoded model output (prompt text included) as a string.
    """
    input_text = f"Query: {query}\n"
    if context:
        input_text += f"Context: {context}\n"
    input_text += "Answer:"
    # model.device follows device_map="auto", so this also works on CPU-only
    # hosts — the original hard-coded "cuda" and crashed without a GPU.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    # temperature only takes effect when sampling is enabled; without
    # do_sample=True transformers ignores it (greedy decoding) and warns.
    outputs = model.generate(inputs["input_ids"], max_length=500, temperature=0.7, do_sample=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def hybrid_rag_system_with_llama(query):
    """Retrieve the best video and paper for `query`, then ask the LLM for an answer.

    Args:
        query: Free-text user query.

    Returns:
        Tuple (final_results, final_response): final_results maps each module
        ("video", "paper") to its best match, similarity score and full
        candidate metadata; final_response is the generated answer.
    """
    modules = ["video", "paper"]
    final_results = {}
    query_embedding = embedding_model.encode(query)

    for module in modules:
        # Gather candidates: pre-registered local files plus fresh downloads.
        metadata = []
        metadata.extend(add_local_files(module))
        if module == "video":
            metadata.extend(fetch_and_download_youtube_video(query, output_dir=download_path))
        elif module == "paper":
            metadata.extend(fetch_from_arxiv(query, max_results=2, output_dir=papers_path))
        if metadata:
            descriptions = [f"{item['title']} ({item['type']})" for item in metadata]
            # Encode all descriptions in one batched call instead of one model
            # invocation per description — same vectors, much less overhead.
            description_embeddings = embedding_model.encode(descriptions)
            similarities = compute_similarity(query_embedding, description_embeddings)
            for item, score in zip(metadata, similarities):
                item["similarity"] = score
            best_match_idx = int(np.argmax(similarities))
            final_results[module] = {
                "best_match": metadata[best_match_idx],
                "similarity": similarities[best_match_idx],
                "all_metadata": metadata,
            }
        else:
            final_results[module] = {"best_match": None, "similarity": None, "all_metadata": []}

    # Summarize the top hits into a short context string for the LLM prompt.
    video_context = f"Best Video: {final_results['video']['best_match']['title']}" if final_results['video']['best_match'] else "No relevant video found."
    paper_context = f"Best Paper: {final_results['paper']['best_match']['title']}" if final_results['paper']['best_match'] else "No relevant paper found."
    context = f"{video_context}\n{paper_context}"
    final_response = generate_llama_response(query, context)
    return final_results, final_response

# Example query — guarded so that importing this module does not trigger a
# full retrieval + generation run; the heavy downloads and LLM call only
# fire when the file is executed as a script.
if __name__ == "__main__":
    query = "short easy machine learning"
    results, final_response = hybrid_rag_system_with_llama(query)
    print("\nFinal Response Generated by Llama 3:")
    print(final_response)