Omar Solano committed on
Commit
377744c
·
1 Parent(s): 3f59041

update scraping scripts

data/scraping/huggingface_docs/parse_hf_html.py DELETED
@@ -1,166 +0,0 @@
1
- import io
2
- import json
3
- import os
4
- from pathlib import Path
5
- from urllib.parse import urljoin
6
-
7
- import pandas as pd
8
- from bs4 import BeautifulSoup
9
- from tqdm import tqdm
10
-
11
-
12
- class HuggingfaceParser:
13
- def __init__(self, html, url):
14
- self.soup = BeautifulSoup(html, "html.parser")
15
- self.url = url
16
-
17
- def find_sections(self):
18
- sections = []
19
- main_content = self.soup.find("article", class_="md-content__inner")
20
- if not main_content:
21
- main_content = self.soup.find(
22
- "div", class_="main-container"
23
- ) # Look for main container
24
- if not main_content:
25
- main_content = self.soup.find(
26
- "body"
27
- ) # Fallback to body if nothing else found
28
-
29
- if not main_content:
30
- print(f"Error: No main content found for {self.url}")
31
- return sections
32
-
33
- # Try to find headers
34
- headers = main_content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
35
-
36
- if not headers:
37
- # If no headers, look for other structural elements
38
- headers = main_content.find_all(
39
- ["div", "p"], class_=["docstring", "section"]
40
- )
41
-
42
- if not headers:
43
- print(f"Warning: No headers or sections found in {self.url}")
44
- # If still no headers, treat the whole content as one section
45
- title = self.soup.title.string if self.soup.title else "Untitled"
46
- sections.append(
47
- {
48
- "name": title,
49
- "url": self.url,
50
- "content": main_content.get_text(strip=True),
51
- "level": 1,
52
- }
53
- )
54
- return sections
55
-
56
- for i, header in enumerate(headers):
57
- name = header.text.strip()
58
- header_id = header.get("id", "")
59
- if header_id:
60
- section_url = f"{self.url}#{header_id}"
61
- else:
62
- section_url = self.url
63
-
64
- content = self.extract_content(
65
- header, headers[i + 1] if i + 1 < len(headers) else None
66
- )
67
- sections.append(
68
- {
69
- "name": name,
70
- "url": section_url,
71
- "content": content,
72
- "level": self.get_header_level(header),
73
- }
74
- )
75
-
76
- return sections
77
-
78
- def extract_content(self, start_tag, end_tag):
79
- content = []
80
- current = start_tag.next_sibling
81
- while current and current != end_tag:
82
- if isinstance(current, str):
83
- content.append(current.strip())
84
- elif current.name == "table":
85
- table_html = io.StringIO(str(current))
86
- content.append(
87
- pd.read_html(table_html)[0].to_markdown(
88
- index=False, tablefmt="github"
89
- )
90
- )
91
- elif current.name not in ["script", "style"]:
92
- content.append(current.get_text(strip=True, separator=" "))
93
- current = current.next_sibling
94
- return "\n".join(filter(None, content))
95
-
96
- def get_header_level(self, tag):
97
- if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
98
- return int(tag.name[1])
99
- elif "class" in tag.attrs:
100
- if "docstring" in tag["class"]:
101
- return 1
102
- elif "section" in tag["class"]:
103
- return 2
104
- return 1 # Default level
105
-
106
-
107
- def is_likely_html_file(file_path):
108
- excluded_extensions = {".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg"}
109
- return file_path.suffix == "" or file_path.suffix.lower() not in excluded_extensions
110
-
111
-
112
- def parse_saved_html_files(html_dir, base_url):
113
- all_sections = []
114
- html_files = [
115
- f for f in Path(html_dir).rglob("*") if f.is_file() and is_likely_html_file(f)
116
- ]
117
- print(f"Found {len(html_files)} HTML files")
118
-
119
- for html_file in tqdm(html_files, desc="Parsing HTML files"):
120
- try:
121
- with open(html_file, "r", encoding="utf-8") as file:
122
- html_content = file.read()
123
-
124
- relative_path = html_file.relative_to(html_dir)
125
- url = urljoin(base_url, str(relative_path).replace(os.path.sep, "/"))
126
-
127
- parser = HuggingfaceParser(html_content, url)
128
- sections = parser.find_sections()
129
-
130
- if not sections:
131
- print(f"Warning: No sections found in {html_file}")
132
- # exit(0)
133
- # break
134
- all_sections.extend(sections)
135
- except Exception as e:
136
- print(f"Error parsing {html_file}: {str(e)}")
137
- # exit(0)
138
-
139
- return all_sections
140
-
141
-
142
- def save_to_jsonl(data, output_file):
143
- with open(output_file, "w", encoding="utf-8") as f:
144
- for item in data:
145
- json.dump(item, f, ensure_ascii=False)
146
- f.write("\n")
147
-
148
-
149
- def main():
150
- # html_dir = "transformers_docs_v4.42.0" # Directory where HTML files are saved
151
- # base_url = "https://huggingface.co/docs/transformers/"
152
-
153
- html_dir = "peft_docs_v0.11.0" # Directory where HTML files are saved
154
- base_url = "https://huggingface.co/docs/peft/"
155
-
156
- output_file = "hf_peft_v0_11_0.jsonl"
157
-
158
- all_sections = parse_saved_html_files(html_dir, base_url)
159
- save_to_jsonl(all_sections, output_file)
160
-
161
- print(f"Parsed content saved to {output_file}")
162
- print(f"Total sections parsed: {len(all_sections)}")
163
-
164
-
165
- if __name__ == "__main__":
166
- main()
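For reference, the deleted parser was driven entirely from main(); a minimal sketch of the equivalent call, assuming the module is importable as parse_hf_html and using the directory and base URL hardcoded above:

# Minimal sketch (assumed module name; paths taken from main() above)
from parse_hf_html import parse_saved_html_files, save_to_jsonl

sections = parse_saved_html_files("peft_docs_v0.11.0", "https://huggingface.co/docs/peft/")
save_to_jsonl(sections, "hf_peft_v0_11_0.jsonl")
# Each JSONL line holds one section: {"name", "url", "content", "level"}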
data/scraping/huggingface_docs/scrape_hf_docs_from_repo.py DELETED
@@ -1,57 +0,0 @@
1
- import os
2
-
3
- import requests
4
-
5
- # GitHub repository information
6
- owner = "huggingface"
7
-
8
- # repo = "peft"
9
- # path = "docs/source"
10
-
11
- repo = "transformers"
12
- path = "docs/source/en"
13
-
14
- # GitHub API endpoint for the repository contents
15
- api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
16
-
17
-
18
- def get_files_in_directory(api_url):
19
- response = requests.get(api_url)
20
- if response.status_code == 200:
21
- return response.json()
22
- else:
23
- print(f"Failed to fetch directory contents: {response.status_code}")
24
- return []
25
-
26
-
27
- def download_file(file_url, file_path):
28
- response = requests.get(file_url)
29
- if response.status_code == 200:
30
- with open(file_path, "wb") as file:
31
- file.write(response.content)
32
- else:
33
- print(f"Failed to download file: {response.status_code}")
34
-
35
-
36
- def fetch_md_files(api_url, local_dir):
37
- files = get_files_in_directory(api_url)
38
- for file in files:
39
- if file["type"] == "file" and file["name"].endswith(".md"):
40
- file_url = file["download_url"]
41
- file_path = os.path.join(local_dir, file["name"])
42
- print(f'Downloading {file["name"]}...')
43
- download_file(file_url, file_path)
44
- elif file["type"] == "dir":
45
- subdir = os.path.join(local_dir, file["name"])
46
- os.makedirs(subdir, exist_ok=True)
47
- fetch_md_files(file["url"], subdir)
48
-
49
-
50
- # Local directory to save the files
51
- local_dir = f"data/{repo}_docs"
52
- os.makedirs(local_dir, exist_ok=True)
53
-
54
- # Start fetching files
55
- fetch_md_files(api_url, local_dir)
56
-
57
- print("All files have been downloaded.")
data/scraping/huggingface_docs/scrape_hf_docs_from_web.py DELETED
@@ -1,134 +0,0 @@
1
- import logging
2
- from pathlib import Path
3
- from urllib.parse import unquote, urljoin, urlparse
4
-
5
- import scrapy
6
- from scrapy.crawler import CrawlerProcess
7
- from tqdm import tqdm
8
-
9
- logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
10
-
11
-
12
- def is_valid_url(url, domain, base_path):
13
- parsed = urlparse(url)
14
- return (
15
- parsed.scheme in ["http", "https"]
16
- and parsed.netloc == domain
17
- and parsed.path.startswith(base_path)
18
- and "#" not in url
19
- ) # Exclude URLs with fragments
20
-
21
-
22
- def clean_url(url):
23
- # Replace &amp; with &, and &num; with #
24
- url = url.replace("&amp;", "&").replace("&num;", "#")
25
- # Decode URL-encoded characters
26
- return unquote(url)
27
-
28
-
29
- class DocsSpider(scrapy.Spider):
30
- name = "docs"
31
-
32
- def __init__(
33
- self,
34
- homepage_url: str,
35
- domain: str,
36
- base_path: str,
37
- save_dir="outputs/",
38
- target_version=None,
39
- *args,
40
- **kwargs,
41
- ):
42
- super(DocsSpider, self).__init__(*args, **kwargs)
43
- self.homepage_url = homepage_url
44
- self.domain = domain
45
- self.base_path = base_path
46
- self.allowed_domains = [domain]
47
- self.start_urls = [self.homepage_url]
48
- self.base_dir = Path(save_dir)
49
- self.target_version = target_version
50
- self.pages = []
51
- self.progress_bar = None
52
-
53
- def start_requests(self):
54
- self.progress_bar = tqdm(desc="Crawling pages", unit="page")
55
- yield scrapy.Request(self.homepage_url, self.parse)
56
-
57
- def parse(self, response):
58
- if not is_valid_url(response.url, self.domain, self.base_path):
59
- return
60
-
61
- parsed_uri = urlparse(response.url)
62
- relative_path = parsed_uri.path.removeprefix(self.base_path).strip("/")
63
- if relative_path:
64
- filepath = self.base_dir / relative_path
65
- else:
66
- filepath = self.base_dir / "index.html"
67
-
68
- filepath.parent.mkdir(parents=True, exist_ok=True)
69
- with open(filepath, "wb") as f:
70
- f.write(response.body)
71
-
72
- self.pages.append({"url": response.url, "html": response.body})
73
- # if self.progress_bar:
74
- self.progress_bar.update(1)
75
-
76
- for href in response.css("a::attr(href)").getall():
77
- full_url = response.urljoin(clean_url(href))
78
- if is_valid_url(full_url, self.domain, self.base_path):
79
- if self.target_version:
80
- if self.target_version in full_url:
81
- yield response.follow(full_url, self.parse)
82
- else:
83
- yield response.follow(full_url, self.parse)
84
-
85
- def closed(self, reason):
86
- if self.progress_bar:
87
- self.progress_bar.close()
88
-
89
-
90
- def crawl_docs(start_url, domain, base_path, save_dir="outputs/", target_version=None):
91
- process = CrawlerProcess(
92
- settings={
93
- "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
94
- "DOWNLOAD_DELAY": 2,
95
- "RANDOMIZE_DOWNLOAD_DELAY": True,
96
- "CONCURRENT_REQUESTS": 1,
97
- "RETRY_TIMES": 5,
98
- "RETRY_HTTP_CODES": [429, 500, 502, 503, 504, 522, 524, 408, 400],
99
- "HTTPERROR_ALLOWED_CODES": [404], # Allow 404 errors to be logged
100
- }
101
- )
102
-
103
- process.crawl(
104
- DocsSpider,
105
- homepage_url=start_url,
106
- domain=domain,
107
- base_path=base_path,
108
- save_dir=save_dir,
109
- target_version=target_version,
110
- )
111
- process.start()
112
-
113
- spider = next(s for s in process.crawlers if s.spider.name == "docs").spider
114
-
115
- print(f"Total pages crawled and parsed: {len(spider.pages)}")
116
-
117
-
118
- if __name__ == "__main__":
119
- # https://huggingface.co/docs/peft/v0.11.0/en/index
120
- # Customizable parameters
121
- domain = "huggingface.co"
122
- version = "v0.11.0"
123
- library = "peft"
124
- language = "en"
125
-
126
- # Construct URL and paths
127
- base_path = f"/docs/{library}/{version}/{language}"
128
- start_url = f"https://{domain}{base_path}/index"
129
- save_dir = f"{library}_docs_{version}"
130
-
131
- # Optional: Set target_version to None if you want to crawl all versions
132
- target_version = None
133
-
134
- crawl_docs(start_url, domain, base_path, save_dir, target_version)
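The crawl is parameterized only by the block under __main__; a minimal sketch of pointing it at another library and version (values illustrative, matching the Transformers snapshot referenced in parse_hf_html.py):

# Hedged sketch: crawl the Transformers docs instead of PEFT
domain = "huggingface.co"
base_path = "/docs/transformers/v4.42.0/en"
start_url = f"https://{domain}{base_path}/index"
crawl_docs(start_url, domain, base_path, save_dir="transformers_docs_v4.42.0", target_version=None)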
data/scraping/huggingface_docs/validate_jsonl.py DELETED
@@ -1,51 +0,0 @@
1
- import json
2
- from typing import Any, Dict, List
3
-
4
-
5
- def load_and_validate_jsonl(file_path: str) -> Dict[int, Any]:
6
- """
7
- Load a .jsonl file into a dictionary and validate each line.
8
-
9
- Args:
10
- file_path (str): Path to the .jsonl file
11
-
12
- Returns:
13
- Dict[int, Any]: A dictionary where keys are line numbers (1-indexed) and values are the parsed JSON objects
14
-
15
- Raises:
16
- ValueError: If any line in the file is not valid JSON
17
- """
18
- result = {}
19
- with open(file_path, "r") as file:
20
- for line_number, line in enumerate(file, 1):
21
- try:
22
- # Strip whitespace and check if the line is empty
23
- stripped_line = line.strip()
24
- if not stripped_line:
25
- print(f"Warning: Line {line_number} is empty.")
26
- continue
27
-
28
- # Attempt to parse the JSON
29
- parsed_json = json.loads(stripped_line)
30
- result[line_number] = parsed_json
31
- except json.JSONDecodeError as e:
32
- raise ValueError(f"Invalid JSON on line {line_number}: {e}")
33
-
34
- return result
35
-
36
-
37
- if __name__ == "__main__":
38
- file_path = "hf_transformers_v4_42_0.jsonl"
39
- try:
40
- loaded_data = load_and_validate_jsonl(file_path)
41
- print(f"Successfully loaded {len(loaded_data)} valid JSON objects.")
42
-
43
- # Optional: Print the first few items
44
- print("\nFirst few items:")
45
- for line_number, data in list(loaded_data.items())[:5]:
46
- print(f"Line {line_number}: {data}")
47
-
48
- except ValueError as e:
49
- print(f"Error: {e}")
50
- except FileNotFoundError:
51
- print(f"Error: File '{file_path}' not found.")
data/scraping_scripts/create_db.ipynb ADDED
@@ -0,0 +1,389 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Create HF vector database\n"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/plain": [
18
+ "True"
19
+ ]
20
+ },
21
+ "execution_count": 1,
22
+ "metadata": {},
23
+ "output_type": "execute_result"
24
+ }
25
+ ],
26
+ "source": [
27
+ "from dotenv import load_dotenv\n",
28
+ "\n",
29
+ "load_dotenv(\"../../.env\")"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 2,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "import nest_asyncio\n",
39
+ "\n",
40
+ "nest_asyncio.apply()"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "markdown",
45
+ "metadata": {},
46
+ "source": [
47
+ "### Create a set of Llama-index Documents with each section in the jsonl file\n"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 3,
53
+ "metadata": {},
54
+ "outputs": [
55
+ {
56
+ "name": "stdout",
57
+ "output_type": "stream",
58
+ "text": [
59
+ "Doc ID: 682dbc3b-96ff-4ca4-a556-44d3cd8ffa8a\n",
60
+ "Text: # Command Line Interfaces (CLIs) You can use TRL to fine-tune\n",
61
+ "your Language Model with Supervised Fine-Tuning (SFT) or Direct Policy\n",
62
+ "Optimization (DPO) or even chat with your model using the TRL CLIs.\n",
63
+ "Currently supported CLIs are: - `trl sft`: fine-tune a LLM on a\n",
64
+ "text/instruction dataset - `trl dpo`: fine-tune a LLM with DPO on a\n",
65
+ "preference ...\n",
66
+ "{'url': 'https://huggingface.co/docs/trl/clis/', 'title': 'Command Line Interfaces (CLIs)', 'tokens': 1209, 'retrieve_doc': True, 'source': 'TRL'}\n"
67
+ ]
68
+ }
69
+ ],
70
+ "source": [
71
+ "from llama_index.core import Document\n",
72
+ "from llama_index.core.schema import MetadataMode\n",
73
+ "import json\n",
74
+ "import pickle\n",
75
+ "\n",
76
+ "\n",
77
+ "def create_docs(input_file):\n",
78
+ " with open(input_file, \"r\") as f:\n",
79
+ " documents = []\n",
80
+ " for i, line in enumerate(f):\n",
81
+ " data = json.loads(line)\n",
82
+ " documents.append(\n",
83
+ " Document(\n",
84
+ " doc_id=data[\"doc_id\"],\n",
85
+ " text=data[\"content\"],\n",
86
+ " metadata={\n",
87
+ " \"url\": data[\"url\"],\n",
88
+ " \"title\": data[\"name\"],\n",
89
+ " \"tokens\": data[\"tokens\"],\n",
90
+ " \"retrieve_doc\": data[\"retrieve_doc\"],\n",
91
+ " \"source\": data[\"source\"],\n",
92
+ " },\n",
93
+ " # LLM will see the 'url' of each chunk\n",
94
+ " excluded_llm_metadata_keys=[\n",
95
+ " # \"url\",\n",
96
+ " \"title\",\n",
97
+ " \"tokens\",\n",
98
+ " \"retrieve_doc\",\n",
99
+ " \"source\",\n",
100
+ " ],\n",
101
+ " # Embedding model will embed the 'title' of each chunk\n",
102
+ " excluded_embed_metadata_keys=[\n",
103
+ " \"url\",\n",
104
+ " # \"title\",\n",
105
+ " \"tokens\",\n",
106
+ " \"retrieve_doc\",\n",
107
+ " \"source\",\n",
108
+ " ],\n",
109
+ " )\n",
110
+ " )\n",
111
+ " return documents\n",
112
+ "\n",
113
+ "\n",
114
+ "# documents = create_docs(\"../transformers_data.jsonl\")\n",
115
+ "# documents = create_docs(\"../peft_data.jsonl\")\n",
116
+ "documents = create_docs(\"../trl_data.jsonl\")\n",
117
+ "# documents = create_docs(\"../llama_index_data.jsonl\")\n",
118
+ "print(documents[0])\n",
119
+ "print(documents[0].metadata)"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "# print(\n",
129
+ "# \"The LLM sees this: \\n\",\n",
130
+ "# documents[0].get_content(metadata_mode=MetadataMode.LLM),\n",
131
+ "# )\n",
132
+ "print(\n",
133
+ " \"The Embedding model sees this: \\n\",\n",
134
+ " documents[0].get_content(metadata_mode=MetadataMode.EMBED),\n",
135
+ ")"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": 4,
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "import chromadb\n",
145
+ "\n",
146
+ "# create client and a new collection\n",
147
+ "DB_COLLECTION = \"chroma-db-trl\"\n",
148
+ "chroma_client = chromadb.PersistentClient(path=f\"../{DB_COLLECTION}\")\n",
149
+ "chroma_collection = chroma_client.create_collection(DB_COLLECTION)\n",
150
+ "\n",
151
+ "\n",
152
+ "from llama_index.vector_stores.chroma import ChromaVectorStore\n",
153
+ "from llama_index.core import StorageContext\n",
154
+ "\n",
155
+ "# Define a storage context object using the created vector database.\n",
156
+ "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
157
+ "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
158
+ "\n",
159
+ "document_dict = {doc.doc_id: doc for doc in documents}\n",
160
+ "DOCUMENT_NAME = f\"../{DB_COLLECTION}/document_dict_trl.pkl\"\n",
161
+ "\n",
162
+ "with open(DOCUMENT_NAME, \"wb\") as f:\n",
163
+ " pickle.dump(document_dict, f)\n",
164
+ "\n",
165
+ "# with open(DOCUMENT_NAME, \"rb\") as f:\n",
166
+ "# document_dict = pickle.load(f)"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 5,
172
+ "metadata": {},
173
+ "outputs": [
174
+ {
175
+ "name": "stderr",
176
+ "output_type": "stream",
177
+ "text": [
178
+ "/Users/omar/Documents/ai_repos/ai-tutor-rag-system/env/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
179
+ " from .autonotebook import tqdm as notebook_tqdm\n",
180
+ "Parsing nodes: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 33/33 [00:00<00:00, 290.40it/s]\n",
181
+ "Generating embeddings: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:01<00:00, 1.13it/s]\n"
182
+ ]
183
+ }
184
+ ],
185
+ "source": [
186
+ "from llama_index.core import VectorStoreIndex\n",
187
+ "from llama_index.core.node_parser import SentenceSplitter\n",
188
+ "from llama_index.embeddings.openai import OpenAIEmbedding\n",
189
+ "\n",
190
+ "index = VectorStoreIndex.from_documents(\n",
191
+ " documents,\n",
192
+ " embed_model=OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"similarity\"),\n",
193
+ " transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=400)],\n",
194
+ " show_progress=True,\n",
195
+ " use_async=True,\n",
196
+ " storage_context=storage_context,\n",
197
+ ")"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "markdown",
202
+ "metadata": {},
203
+ "source": [
204
+ "### Test the DB"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "retriever = index.as_retriever(\n",
214
+ " similarity_top_k=10,\n",
215
+ " use_async=True,\n",
216
+ " embed_model=OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"similarity\"),\n",
217
+ ")"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": null,
223
+ "metadata": {},
224
+ "outputs": [],
225
+ "source": [
226
+ "from llama_index.core.data_structs import Node\n",
227
+ "from llama_index.core.schema import NodeWithScore, BaseNode, TextNode\n",
228
+ "\n",
229
+ "\n",
230
+ "# query = \"fine-tune a pretrained model\"\n",
231
+ "# query = \"fine-tune an llm\"\n",
232
+ "query = \"how to fine-tune an llm?\"\n",
233
+ "\n",
234
+ "nodes_context = []\n",
235
+ "nodes = retriever.retrieve(query)\n",
236
+ "\n",
237
+ "\n",
238
+ "# Filter nodes with the same ref_doc_id\n",
239
+ "def filter_nodes_by_unique_doc_id(nodes):\n",
240
+ " unique_nodes = {}\n",
241
+ " for node in nodes:\n",
242
+ " doc_id = node.node.ref_doc_id\n",
243
+ " if doc_id is not None and doc_id not in unique_nodes:\n",
244
+ " unique_nodes[doc_id] = node\n",
245
+ " return list(unique_nodes.values())\n",
246
+ "\n",
247
+ "\n",
248
+ "nodes = filter_nodes_by_unique_doc_id(nodes)\n",
249
+ "print(len(nodes))\n",
250
+ "\n",
251
+ "for node in nodes:\n",
252
+ " print(\"Node ID\\t\", node.node_id)\n",
253
+ " print(\"Title\\t\", node.metadata[\"title\"])\n",
254
+ " print(\"Text\\t\", node.text)\n",
255
+ " print(\"Score\\t\", node.score)\n",
256
+ " print(\"Metadata\\t\", node.metadata)\n",
257
+ " print(\"-_\" * 20)\n",
258
+ " if node.metadata[\"retrieve_doc\"] == True:\n",
259
+ " print(\"This node will be replaced by the document\")\n",
260
+ " doc = document_dict[node.node.ref_doc_id]\n",
261
+ " # print(doc.text)\n",
262
+ " new_node = NodeWithScore(\n",
263
+ " node=TextNode(text=doc.text, metadata=node.metadata), score=node.score\n",
264
+ " )\n",
265
+ " print(new_node.text)\n",
266
+ " nodes_context.append(new_node)\n",
267
+ " else:\n",
268
+ " nodes_context.append(node)\n",
269
+ "\n",
270
+ "print(len(nodes_context))"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": null,
276
+ "metadata": {},
277
+ "outputs": [],
278
+ "source": [
279
+ "from llama_index.core import ChatPromptTemplate\n",
280
+ "from llama_index.core.llms import ChatMessage, MessageRole\n",
281
+ "from pydantic import BaseModel, Field\n",
282
+ "\n",
283
+ "system_prompt = (\n",
284
+ " \"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine-tuning models, giving 'memory' to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, Llama-Index, LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context.\"\n",
285
+ " \"You are provided information found in Hugging Face's documentation and the RAG course. \"\n",
286
+ " \"Only some information might be relevant to the question, so ignore the irrelevant part and use the relevant part to answer the question.\"\n",
287
+ " \"Only respond with information given to you documentation. DO NOT use additional information, even if you know the answer. \"\n",
288
+ " \"If the answer is somewhere in the documentation, answer the question (depending on the questions and the variety of relevant information in the documentation, give complete and helpful answers.\"\n",
289
+ " \"Here is the information you can use, the order is not important: \\n\\n\"\n",
290
+ " \"---------------------\\n\"\n",
291
+ " \"{context_str}\\n\"\n",
292
+ " \"---------------------\\n\\n\"\n",
293
+ " \"REMEMBER:\\n\"\n",
294
+ " \"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context.\"\n",
295
+ " \"You are provided information found in Hugging Face's documentation and the RAG course. \"\n",
296
+ " \"Here are the rules you must follow:\\n\"\n",
297
+ " \"* Only respond with information inside the documentation. DO NOT provide additional information, even if you know the answer. \"\n",
298
+ " \"* If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation. Your answer needs to be pertinent and not redundant giving a clear explanation as if you were a teacher. \"\n",
299
+ " \"* Only use information summarized from the documentation, do not respond otherwise. \"\n",
300
+ " \"* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. \"\n",
301
+ " \"* Do not reference any links, urls or hyperlinks in your answers.\\n\"\n",
302
+ " \"* Make sure to format your answers in Markdown format, including code block and snippets.\\n\"\n",
303
+ " \"Now answer the following question: \\n\"\n",
304
+ ")\n",
305
+ "\n",
306
+ "chat_text_qa_msgs: list[ChatMessage] = [\n",
307
+ " ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),\n",
308
+ " ChatMessage(\n",
309
+ " role=MessageRole.USER,\n",
310
+ " content=\"{query_str}\",\n",
311
+ " ),\n",
312
+ "]\n",
313
+ "\n",
314
+ "TEXT_QA_TEMPLATE = ChatPromptTemplate(chat_text_qa_msgs)"
315
+ ]
316
+ },
317
+ {
318
+ "cell_type": "code",
319
+ "execution_count": null,
320
+ "metadata": {},
321
+ "outputs": [],
322
+ "source": [
323
+ "from IPython.display import Markdown\n",
324
+ "from llama_index.core.data_structs import Node\n",
325
+ "from llama_index.core.schema import NodeWithScore\n",
326
+ "from llama_index.core import get_response_synthesizer\n",
327
+ "from llama_index.llms.gemini import Gemini\n",
328
+ "from llama_index.llms.openai import OpenAI\n",
329
+ "\n",
330
+ "# llm = Gemini(model=\"models/gemini-1.5-flash\", temperature=1, max_tokens=None)\n",
331
+ "# llm = Gemini(model=\"models/gemini-1.5-pro\", temperature=1, max_tokens=None)\n",
332
+ "# llm = OpenAI(temperature=1, model=\"gpt-3.5-turbo\", max_tokens=None)\n",
333
+ "llm = OpenAI(temperature=1, model=\"gpt-4o-mini\", max_tokens=None)\n",
334
+ "\n",
335
+ "response_synthesizer = get_response_synthesizer(\n",
336
+ " llm=llm, response_mode=\"simple_summarize\", text_qa_template=TEXT_QA_TEMPLATE\n",
337
+ ")\n",
338
+ "\n",
339
+ "response = response_synthesizer.synthesize(query, nodes=nodes_context)\n",
340
+ "# print(response.response)\n",
341
+ "display(Markdown(response.response))\n",
342
+ "\n",
343
+ "# for src in response.source_nodes:\n",
344
+ "# print(src.node.ref_doc_id)\n",
345
+ "# print(\"Node ID\\t\", src.node_id)\n",
346
+ "# print(\"Title\\t\", src.metadata[\"title\"])\n",
347
+ "# print(\"Text\\t\", src.text)\n",
348
+ "# print(\"Score\\t\", src.score)\n",
349
+ "# print(\"Metadata\\t\", src.metadata)\n",
350
+ "# print(\"-_\" * 20)"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": []
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": null,
363
+ "metadata": {},
364
+ "outputs": [],
365
+ "source": []
366
+ }
367
+ ],
368
+ "metadata": {
369
+ "kernelspec": {
370
+ "display_name": "env",
371
+ "language": "python",
372
+ "name": "python3"
373
+ },
374
+ "language_info": {
375
+ "codemirror_mode": {
376
+ "name": "ipython",
377
+ "version": 3
378
+ },
379
+ "file_extension": ".py",
380
+ "mimetype": "text/x-python",
381
+ "name": "python",
382
+ "nbconvert_exporter": "python",
383
+ "pygments_lexer": "ipython3",
384
+ "version": "3.12.4"
385
+ }
386
+ },
387
+ "nbformat": 4,
388
+ "nbformat_minor": 2
389
+ }
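The notebook persists both the Chroma collection and the pickled document_dict, so a later session can rebuild the index without re-embedding; a minimal sketch, assuming the same package versions and the TRL paths used above:

# Hedged sketch: reopen the persisted TRL collection built in the notebook above
import pickle

import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

chroma_client = chromadb.PersistentClient(path="../chroma-db-trl")
chroma_collection = chroma_client.get_collection("chroma-db-trl")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
)

with open("../chroma-db-trl/document_dict_trl.pkl", "rb") as f:
    document_dict = pickle.load(f)

retriever = index.as_retriever(similarity_top_k=10)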
data/scraping_scripts/create_jsonl_file_hf.py ADDED
@@ -0,0 +1,154 @@
1
+ import json
2
+ import os
3
+ import re
4
+ import uuid
5
+
6
+ import tiktoken
7
+
8
+ BASE_URL = "https://huggingface.co/docs/transformers/"
9
+ # BASE_URL = "https://huggingface.co/docs/peft/"
10
+ # BASE_URL = "https://huggingface.co/docs/trl/"
11
+
12
+ # List of directories to include (relative to the main input directory)
13
+ INCLUDED_DIRS = [
14
+ # Add more directories here as needed
15
+ ]
16
+
17
+ # List of directories to exclude (relative to the main input directory)
18
+ EXCLUDED_DIRS = [
19
+ # "some_directory_to_exclude",
20
+ # Add more directories here as needed
21
+ "internal",
22
+ "main_classes",
23
+ ]
24
+
25
+ # List of specific files to exclude from the root directory
26
+ EXCLUDED_ROOT_FILES = [
27
+ # "some_file_to_exclude.md",
28
+ # Add more files here as needed
29
+ ]
30
+
31
+ # Set this to True to use the INCLUDED_DIRS list, or False to use the EXCLUDED_DIRS list
32
+ USE_INCLUDE_LIST = False
33
+
34
+
35
+ def extract_title(content):
36
+ # Try to find a Markdown title (# Title)
37
+ title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
38
+ if title_match:
39
+ return title_match.group(1).strip()
40
+
41
+ # If no Markdown title, use the first non-empty line
42
+ lines = content.split("\n")
43
+ for line in lines:
44
+ if line.strip():
45
+ return line.strip()
46
+
47
+ # If file is empty, return None
48
+ return None
49
+
50
+
51
+ def generate_url(file_path):
52
+ # Remove the file extension
53
+ path_without_extension = os.path.splitext(file_path)[0]
54
+
55
+ # Replace backslashes with forward slashes for Windows compatibility
56
+ path_with_forward_slashes = path_without_extension.replace("\\", "/")
57
+
58
+ # Combine with base URL
59
+ return BASE_URL + path_with_forward_slashes + "/"
60
+
61
+
62
+ def should_include_file(file_path):
63
+ # Check if the file is directly in the root
64
+ if os.path.dirname(file_path) == "":
65
+ return os.path.basename(file_path) not in EXCLUDED_ROOT_FILES
66
+
67
+ if USE_INCLUDE_LIST:
68
+ # Check if the file is in one of the included directories
69
+ return any(file_path.startswith(dir) for dir in INCLUDED_DIRS)
70
+ else:
71
+ # Check if the file is not in any of the excluded directories
72
+ return not any(file_path.startswith(dir) for dir in EXCLUDED_DIRS)
73
+
74
+
75
+ def num_tokens_from_string(string: str, encoding_name: str) -> int:
76
+ """Returns the number of tokens in a text string."""
77
+ encoding = tiktoken.get_encoding(encoding_name)
78
+ num_tokens = len(
79
+ encoding.encode(
80
+ string, disallowed_special=(encoding.special_tokens_set - {"<|endoftext|>"})
81
+ )
82
+ )
83
+ return num_tokens
84
+
85
+
86
+ def remove_copyright_header(content):
87
+ # Pattern to match the copyright header
88
+ header_pattern = re.compile(r"<!--Copyright.*?-->\s*", re.DOTALL)
89
+
90
+ # Remove the header
91
+ cleaned_content = header_pattern.sub("", content, count=1)
92
+
93
+ return cleaned_content.strip()
94
+
95
+
96
+ def process_md_files(directory):
97
+ jsonl_data = []
98
+
99
+ for root, _, files in os.walk(directory):
100
+ for file in files:
101
+ if file.endswith(".md") or file.endswith(".mdx"):
102
+ file_path = os.path.join(root, file)
103
+ relative_path = os.path.relpath(file_path, directory)
104
+
105
+ # Only process the file if it should be included
106
+ if should_include_file(relative_path):
107
+ with open(file_path, "r", encoding="utf-8") as f:
108
+ content = f.read()
109
+
110
+ title = extract_title(content)
111
+ token_count = num_tokens_from_string(content, "cl100k_base")
112
+ if token_count < 100:
113
+ continue
114
+ cleaned_content = remove_copyright_header(content)
115
+
116
+ json_object = {
117
+ "tokens": token_count,
118
+ "doc_id": str(uuid.uuid4()),
119
+ "name": (title if title else file),
120
+ "url": generate_url(relative_path),
121
+ "retrieve_doc": (True if token_count <= 8000 else False),
122
+ # "source": "TRL",
123
+ # "source": "PEFT",
124
+ "source": "HF_Transformers",
125
+ "content": cleaned_content,
126
+ }
127
+
128
+ jsonl_data.append(json_object)
129
+
130
+ return jsonl_data
131
+
132
+
133
+ def save_jsonl(data, output_file):
134
+ with open(output_file, "w", encoding="utf-8") as f:
135
+ for item in data:
136
+ json.dump(item, f, ensure_ascii=False)
137
+ f.write("\n")
138
+
139
+
140
+ # Directory where the .md files are located
141
+ input_directory = "data/transformers_md_files"
142
+ # input_directory = "data/peft_md_files"
143
+ # input_directory = "data/trl_md_files"
144
+
145
+ # Output .jsonl file
146
+ output_file = "data/transformers_data.jsonl"
147
+ # output_file = "data/peft_data.jsonl"
148
+ # output_file = "data/trl_data.jsonl"
149
+
150
+ # Process the files and save to JSONL
151
+ jsonl_data = process_md_files(input_directory)
152
+ save_jsonl(jsonl_data, output_file)
153
+
154
+ print(f"Processed {len(jsonl_data)} files and saved to {output_file}")
data/scraping_scripts/create_jsonl_file_llama.py ADDED
@@ -0,0 +1,123 @@
1
+ import json
2
+ import os
3
+ import re
4
+ import uuid
5
+
6
+ import tiktoken
7
+
8
+ BASE_URL = "https://docs.llamaindex.ai/en/stable/"
9
+
10
+ # List of directories to include (relative to the main input directory)
11
+ INCLUDED_DIRS = [
12
+ "getting_started",
13
+ "understanding",
14
+ "use_cases",
15
+ "examples",
16
+ "module_guides",
17
+ "optimizing",
18
+ ]
19
+
20
+ # List of specific files to include from the root directory
21
+ INCLUDED_ROOT_FILES = [
22
+ "index.md",
23
+ # Add more files here as needed
24
+ ]
25
+
26
+
27
+ def extract_title(content):
28
+ # Try to find a Markdown title (# Title)
29
+ title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
30
+ if title_match:
31
+ return title_match.group(1).strip()
32
+
33
+ # If no Markdown title, use the first non-empty line
34
+ lines = content.split("\n")
35
+ for line in lines:
36
+ if line.strip():
37
+ return line.strip()
38
+
39
+ # If file is empty, return None
40
+ return None
41
+
42
+
43
+ def generate_url(file_path):
44
+ # Remove the file extension
45
+ path_without_extension = os.path.splitext(file_path)[0]
46
+
47
+ # Replace backslashes with forward slashes for Windows compatibility
48
+ path_with_forward_slashes = path_without_extension.replace("\\", "/")
49
+
50
+ # Combine with base URL
51
+ return BASE_URL + path_with_forward_slashes + "/"
52
+
53
+
54
+ def should_include_file(file_path):
55
+ # Check if the file is directly in the root and in the INCLUDED_ROOT_FILES list
56
+ if os.path.dirname(file_path) == "":
57
+ return os.path.basename(file_path) in INCLUDED_ROOT_FILES
58
+
59
+ # Check if the file is in one of the included directories
60
+ return any(file_path.startswith(dir) for dir in INCLUDED_DIRS)
61
+
62
+
63
+ def num_tokens_from_string(string: str, encoding_name: str) -> int:
64
+ """Returns the number of tokens in a text string."""
65
+ encoding = tiktoken.get_encoding(encoding_name)
66
+ num_tokens = len(
67
+ encoding.encode(
68
+ string, disallowed_special=(encoding.special_tokens_set - {"<|endoftext|>"})
69
+ )
70
+ )
71
+ return num_tokens
72
+
73
+
74
+ def process_md_files(directory):
75
+ jsonl_data = []
76
+
77
+ for root, _, files in os.walk(directory):
78
+ for file in files:
79
+ if file.endswith(".md") or file.endswith(".mdx"):
80
+ file_path = os.path.join(root, file)
81
+ relative_path = os.path.relpath(file_path, directory)
82
+
83
+ # Only process the file if it should be included
84
+ if should_include_file(relative_path):
85
+ with open(file_path, "r", encoding="utf-8") as f:
86
+ content = f.read()
87
+
88
+ title = extract_title(content)
89
+ token_count = num_tokens_from_string(content, "cl100k_base")
90
+
91
+ json_object = {
92
+ "tokens": token_count,
93
+ "doc_id": str(uuid.uuid4()),
94
+ "name": (title if title else file),
95
+ "url": generate_url(relative_path),
96
+ "retrieve_doc": (True if token_count <= 8000 else False),
97
+ "source": "LlamaIndex",
98
+ "content": content,
99
+ }
100
+
101
+ jsonl_data.append(json_object)
102
+
103
+ return jsonl_data
104
+
105
+
106
+ def save_jsonl(data, output_file):
107
+ with open(output_file, "w", encoding="utf-8") as f:
108
+ for item in data:
109
+ json.dump(item, f, ensure_ascii=False)
110
+ f.write("\n")
111
+
112
+
113
+ # Directory where the .md files are located
114
+ input_directory = "data/llama_index_md_files"
115
+
116
+ # Output .jsonl file
117
+ output_file = "data/llama_index_data.jsonl"
118
+
119
+ # Process the files and save to JSONL
120
+ jsonl_data = process_md_files(input_directory)
121
+ save_jsonl(jsonl_data, output_file)
122
+
123
+ print(f"Processed {len(jsonl_data)} files and saved to {output_file}")
data/scraping_scripts/get_md_files_from_repo.py ADDED
@@ -0,0 +1,137 @@
1
+ import json
2
+ import os
3
+ import random
4
+ import time
5
+
6
+ import nbformat
7
+ import requests
8
+ from nbconvert import MarkdownExporter
9
+
10
+ # GitHub repository information
11
+ owner = "huggingface"
12
+ repo = "transformers"
13
+ path = "docs/source/en"
14
+
15
+ # owner = "huggingface"
16
+ # repo = "peft"
17
+ # path = "docs/source"
18
+
19
+ # owner = "huggingface"
20
+ # repo = "trl"
21
+ # path = "docs/source"
22
+
23
+ # GitHub repository information
24
+ # owner = "run-llama"
25
+ # repo = "llama_index"
26
+ # path = "docs/docs"
27
+
28
+ # GitHub API endpoint for the repository contents
29
+ api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
30
+
31
+ # GitHub Personal Access Token (read from the environment; never commit a real token)
32
+ github_token = os.getenv("GITHUB_TOKEN")  # assumed env var name
33
+
34
+ # Headers for authenticated requests
35
+ headers = {
36
+ "Authorization": f"token {github_token}",
37
+ "Accept": "application/vnd.github.v3+json",
38
+ }
39
+
40
+ # Maximum number of retries
41
+ MAX_RETRIES = 5
42
+
43
+
44
+ def check_rate_limit():
45
+ rate_limit_url = "https://api.github.com/rate_limit"
46
+ response = requests.get(rate_limit_url, headers=headers)
47
+ data = response.json()
48
+ remaining = data["resources"]["core"]["remaining"]
49
+ reset_time = data["resources"]["core"]["reset"]
50
+
51
+ if remaining < 10: # Adjust this threshold as needed
52
+ wait_time = reset_time - time.time()
53
+ print(f"Rate limit nearly exceeded. Waiting for {wait_time:.2f} seconds.")
54
+ time.sleep(wait_time + 1) # Add 1 second buffer
55
+
56
+
57
+ def get_files_in_directory(api_url, retries=0):
58
+ try:
59
+ check_rate_limit()
60
+ response = requests.get(api_url, headers=headers)
61
+ response.raise_for_status()
62
+ return response.json()
63
+ except requests.exceptions.RequestException as e:
64
+ if retries < MAX_RETRIES:
65
+ wait_time = (2**retries) + random.random()
66
+ print(
67
+ f"Error fetching directory contents: {e}. Retrying in {wait_time:.2f} seconds..."
68
+ )
69
+ time.sleep(wait_time)
70
+ return get_files_in_directory(api_url, retries + 1)
71
+ else:
72
+ print(
73
+ f"Failed to fetch directory contents after {MAX_RETRIES} retries: {e}"
74
+ )
75
+ return []
76
+
77
+
78
+ def download_file(file_url, file_path, retries=0):
79
+ try:
80
+ check_rate_limit()
81
+ response = requests.get(file_url, headers=headers)
82
+ response.raise_for_status()
83
+ with open(file_path, "wb") as file:
84
+ file.write(response.content)
85
+ except requests.exceptions.RequestException as e:
86
+ if retries < MAX_RETRIES:
87
+ wait_time = (2**retries) + random.random()
88
+ print(
89
+ f"Error downloading file: {e}. Retrying in {wait_time:.2f} seconds..."
90
+ )
91
+ time.sleep(wait_time)
92
+ download_file(file_url, file_path, retries + 1)
93
+ else:
94
+ print(f"Failed to download file after {MAX_RETRIES} retries: {e}")
95
+
96
+
97
+ def convert_ipynb_to_md(ipynb_path, md_path):
98
+ with open(ipynb_path, "r", encoding="utf-8") as f:
99
+ notebook = nbformat.read(f, as_version=4)
100
+
101
+ exporter = MarkdownExporter()
102
+ markdown, _ = exporter.from_notebook_node(notebook)
103
+
104
+ with open(md_path, "w", encoding="utf-8") as f:
105
+ f.write(markdown)
106
+
107
+
108
+ def fetch_files(api_url, local_dir):
109
+ files = get_files_in_directory(api_url)
110
+ for file in files:
111
+ if file["type"] == "file" and file["name"].endswith((".md", ".mdx", ".ipynb")):
112
+ file_url = file["download_url"]
113
+ file_name = file["name"]
114
+ file_path = os.path.join(local_dir, file_name)
115
+ print(f"Downloading {file_name}...")
116
+ download_file(file_url, file_path)
117
+
118
+ if file_name.endswith(".ipynb"):
119
+ md_file_name = file_name.replace(".ipynb", ".md")
120
+ md_file_path = os.path.join(local_dir, md_file_name)
121
+ print(f"Converting {file_name} to markdown...")
122
+ convert_ipynb_to_md(file_path, md_file_path)
123
+ os.remove(file_path) # Remove the .ipynb file after conversion
124
+ elif file["type"] == "dir":
125
+ subdir = os.path.join(local_dir, file["name"])
126
+ os.makedirs(subdir, exist_ok=True)
127
+ fetch_files(file["url"], subdir)
128
+
129
+
130
+ # Local directory to save the files
131
+ local_dir = f"data/{repo}_md_files"
132
+ os.makedirs(local_dir, exist_ok=True)
133
+
134
+ # Start fetching files
135
+ fetch_files(api_url, local_dir)
136
+
137
+ print("All files have been downloaded and converted.")
scripts/call_openai.py DELETED
@@ -1,79 +0,0 @@
1
- import os
2
- import logging
3
-
4
- import instructor
5
- import openai
6
- from openai import OpenAI, AsyncOpenAI
7
- from dotenv import load_dotenv
8
-
9
- logger = logging.getLogger(__name__)
10
- logging.basicConfig(level=logging.INFO)
11
-
12
- load_dotenv(".env")
13
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
14
-
15
-
16
- def api_function_call(
17
- system_message,
18
- query: str,
19
- model: str = "gpt-4o",
20
- response_model=None,
21
- max_retries: int = 0,
22
- stream: bool = False,
23
- ):
24
-
25
- client = instructor.patch(OpenAI())
26
- try:
27
- message_data = {
28
- "model": model,
29
- "messages": [
30
- {"role": "system", "content": system_message},
31
- {"role": "user", "content": query},
32
- ],
33
- "max_retries": max_retries,
34
- "stream": stream,
35
- }
36
- if response_model is not None:
37
- message_data["response_model"] = response_model
38
-
39
- response = client.chat.completions.create(**message_data)
40
- error = False
41
-
42
- except openai.BadRequestError:
43
- error = True
44
- logger.exception("Invalid request to OpenAI API. See traceback:")
45
- error_message = (
46
- "Something went wrong while connecting with OpenAI, try again soon!"
47
- )
48
- return error_message, error
49
-
50
- except openai.RateLimitError:
51
- error = True
52
- logger.exception("RateLimit error from OpenAI. See traceback:")
53
- error_message = "OpenAI servers seem to be overloaded, try again later!"
54
- return error_message, error
55
-
56
- except Exception as e:
57
- error = True
58
- logger.exception(
59
- "Some kind of error happened trying to generate the response. See traceback:"
60
- )
61
- error_message = (
62
- "Something went wrong with connecting with OpenAI, try again soon!"
63
- )
64
- return error_message, error
65
-
66
- if stream is True and response_model is None:
67
-
68
- def answer_generator():
69
- for chunk in response:
70
- token = chunk.choices[0].delta.content
71
-
72
- token = "" if token is None else token
73
-
74
- yield token
75
-
76
- return answer_generator(), error
77
-
78
- else:
79
- return response, error
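Because the client is patched with instructor, callers of this deleted helper could pass a Pydantic class as response_model and receive a parsed object back; a minimal sketch, assuming the signature above:

from pydantic import BaseModel


class Answer(BaseModel):
    answer: str


response, error = api_function_call(
    system_message="You are a helpful teaching assistant.",
    query="In one sentence, what is LoRA?",
    response_model=Answer,
)
if not error:
    print(response.answer)  # instructor returns an Answer instance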
scripts/create_db.ipynb DELETED
The diff for this file is too large to render. See raw diff