from setup import *
import tempfile
import requests

from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
from urllib.parse import urlparse
from langchain.docstore.document import Document



def extract_urls(agentstate_result):
    """Extracts the URL and content fields from the agent state's link list."""
    urls = []
    content = []
    for item in agentstate_result['link_list']:
        urls.append(item['url'])
        content.append(item['content'])

    return urls, content
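
# Illustrative input shape (an assumption: the full AgentState schema is defined
# elsewhere in the project; only a 'link_list' of {'url', 'content'} dicts is
# relied on here):
#
#   state = {'link_list': [{'url': 'https://example.com/a.pdf', 'content': 'summary'}]}
#   urls, contents = extract_urls(state)
#   # urls     -> ['https://example.com/a.pdf']
#   # contents -> ['summary']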



# Function to classify URL based on file extension
def classify_url_by_extension(url):
    """

    Classifies a URL based on its file extension.

    Focuses only on pdf and html, classifying others as unknown.

    """

    if not isinstance(url, str):
        raise ValueError(f"Expected a string, but got {type(url)}")

    # Extract the file extension from the URL
    try:
        file_extension = urlparse(url).path.split('.')[-1].lower()
        if file_extension == 'pdf':
            return 'pdf'
        elif file_extension in ['html', 'htm']:
            return 'html'
        else:
            return 'unknown'
    except Exception as e:
        print(f"Error while parsing URL: {url} - {e}")
        return 'unknown'


# Function to classify based on HTTP Content-Type header (optional, for extra accuracy)
def classify_url_by_header(url):
    """

    Classifies a URL based on the HTTP Content-Type header.

    Focuses only on pdf and html, classifying others as unknown.

    """
    try:
        # HEAD request; follow redirects so the final resource's headers are inspected
        response = requests.head(url, timeout=5, allow_redirects=True)
        content_type = response.headers.get('Content-Type', '').lower()
        
        if 'pdf' in content_type:
            return 'pdf'
        elif 'html' in content_type:
            return 'html'
        else:
            return 'unknown'
    except requests.RequestException as e:
        print(f"Error while making HEAD request: {url} - {e}")
        return 'unknown'


# Function to classify a list of URLs
def urls_classify_list(urls: list):
    """
    Classifies a list of URLs into pdf and html groups.

    URLs whose type cannot be determined from either the extension or the
    Content-Type header are treated as html so they are still processed.

    Returns two separate lists: one for pdf URLs and one for html URLs.
    """
    if not isinstance(urls, list):
        raise ValueError("Expected a list of URLs")

    pdf_urls = []
    html_urls = []

    # Classify each URL
    for url in urls:
        file_type = classify_url_by_extension(url)  # First, try classifying by extension
        if file_type == 'unknown':
            # If extension-based classification failed, fall back to HTTP header classification
            file_type = classify_url_by_header(url)

        if file_type == 'pdf':
            pdf_urls.append(url)
        else:
            # 'html' and 'unknown' both go to the HTML list
            html_urls.append(url)

    return pdf_urls, html_urls
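
# Illustrative classification run (the URLs are placeholders; types that can't
# be resolved from the extension trigger a HEAD request and, failing that, are
# routed to the HTML list so the HTML loader still attempts them):
#
#   pdf_urls, html_urls = urls_classify_list([
#       'https://example.com/paper.pdf',   # -> pdf_urls (by extension)
#       'https://example.com/page.html',   # -> html_urls (by extension)
#       'https://example.com/article',     # no extension -> header check / html_urls
#   ])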



def clean_and_extract_html_data(html_urls, chunk_size=100, chunk_overlap=25):
    """

    Loads HTML content from URLs, cleans the data, and splits it into smaller chunks.



    Args:

        html_urls (list): List of HTML URLs to process.

        chunk_size (int): Maximum size of each chunk.

        chunk_overlap (int): Overlap between chunks.



    Returns:

        list: List of document chunks.

    """

    def clean_content(content):
        """

        Cleans the content by removing unwanted patterns and short lines.

        """
        cleaned_content = content.strip()  # Remove leading/trailing whitespace
        lines = cleaned_content.split('\n')  # Split by newlines
        meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 3]  # Keep meaningful lines
        return '\n'.join(meaningful_lines)

    def split_document(doc_content, chunk_size, chunk_overlap):
        """

        Splits a document into smaller chunks with overlap.

        """
        chunks = []
        start = 0
        while start < len(doc_content):
            end = start + chunk_size
            chunk = doc_content[start:end]
            chunks.append(chunk)
            start = end - chunk_overlap if end < len(doc_content) else len(doc_content)
        return chunks

    # Step 1: Load documents from URLs
    docs = []
    for url in html_urls:
        try:
            loader = WebBaseLoader(url)
            data = loader.load()
            docs.extend(data)
        except Exception as e:
            print(f"Error loading URL {url}: {e}")

    # Step 2: Clean the content to remove unwanted data
    cleaned_docs = []
    for doc in docs:
        cleaned_content = clean_content(doc.page_content)
        if cleaned_content:  # Exclude empty documents
            doc.page_content = cleaned_content
            cleaned_docs.append(doc)

    # Step 3: Split the cleaned documents into chunks
    doc_splits = []
    for doc in cleaned_docs:
        chunks = split_document(doc.page_content, chunk_size, chunk_overlap)
        for chunk in chunks:
            doc_splits.append(Document(page_content=chunk, metadata=doc.metadata))

    return doc_splits
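
# Chunking behaviour of the inner split_document above (illustrative): with
# chunk_size=10 and chunk_overlap=3, a 16-character string is cut into
# overlapping windows, the second window restarting 3 characters before the
# end of the first:
#
#   split_document("abcdefghijklmnop", 10, 3)
#   # -> ["abcdefghij", "hijklmnop"]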






# def extract_pdf_from_url(url):
#     """
#     Extract text from a PDF available at a URL.
    
#     Args:
#         url (str): The URL of the PDF file.
        
#     Returns:
#         str: Extracted text from the PDF.
#     """
#     # Step 1: Download the PDF from the URL
#     response = requests.get(url)
#     if response.status_code == 200:
#         pdf_content = response.content
#     else:
#         raise ValueError(f"Failed to fetch the PDF. HTTP Status Code: {response.status_code}")
    
#     # Step 2: Save PDF content to a temporary file
#     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
#         temp_pdf.write(pdf_content)
#         temp_pdf_path = temp_pdf.name  # Get the file path
    
#     # Step 3: Load the PDF using PyPDFLoader
#     loader = PyPDFLoader(temp_pdf_path)
#     documents = loader.load()
    
#     # Step 4: Extract text from all pages
#     extracted_text = "\n".join(doc.page_content for doc in documents)
    
#     return extracted_text


# def clean_and_split_pdf_text(pdf_text, chunk_size=100, chunk_overlap=25):
#     """
#     Cleans and splits the extracted PDF text into smaller chunks.
    
#     Args:
#         pdf_text (str): Extracted text from a PDF.
#         chunk_size (int): Maximum size of each chunk.
#         chunk_overlap (int): Overlap between chunks.
        
#     Returns:
#         list: List of document chunks.
#     """
#     def clean_content(content):
#         """
#         Cleans the text by removing unwanted patterns and short lines.
#         """
#         content = content.strip()  # Remove leading/trailing whitespace
#         lines = content.split('\n')  # Split into lines
#         meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 3]  # Exclude short lines
#         return '\n'.join(meaningful_lines)

#     def split_text(content, chunk_size, chunk_overlap):
#         """
#         Splits cleaned text into smaller chunks with overlap.
#         """
#         chunks = []
#         start = 0
#         while start < len(content):
#             end = start + chunk_size
#             chunks.append(content[start:end])
#             start = end - chunk_overlap if end < len(content) else len(content)
#         return chunks

#     # Step 1: Clean the text
#     cleaned_text = clean_content(pdf_text)

#     # Step 2: Split the cleaned text
#     return split_text(cleaned_text, chunk_size, chunk_overlap)


# def pdf_extraction(pdf_urls, chunk_size=100, chunk_overlap=25):
#     """
#     Extracts and processes text from a list of PDF URLs.
    
#     Args:
#         pdf_urls (list): List of PDF URLs.
#         chunk_size (int): Maximum size of each chunk.
#         chunk_overlap (int): Overlap between chunks.
        
#     Returns:
#         list: List of Document objects containing split text.
#     """
#     all_chunks = []
    
#     for pdf_url in pdf_urls:
#         try:
#             # Extract text from the PDF
#             extracted_text = extract_pdf_from_url(pdf_url)
            
#             # Clean and split the text
#             chunks = clean_and_split_pdf_text(extracted_text, chunk_size, chunk_overlap)
            
#             # Convert chunks into Document objects
#             for chunk in chunks:
#                 all_chunks.append(Document(page_content=chunk, metadata={"source": pdf_url}))
#         except Exception as e:
#             print(f"Error processing PDF URL {pdf_url}: {e}")
    
#     return all_chunks
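

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the pipeline): the agent
# state shape and URLs below are placeholder assumptions, the `setup` module
# must be importable, and the HTML step performs live HTTP requests via
# WebBaseLoader, so it only makes sense with network access.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_state = {
        'link_list': [
            {'url': 'https://example.com/report.pdf', 'content': 'PDF summary'},
            {'url': 'https://example.com/article.html', 'content': 'HTML summary'},
        ]
    }

    urls, _ = extract_urls(sample_state)
    pdf_urls, html_urls = urls_classify_list(urls)
    print(f"PDF URLs:  {pdf_urls}")
    print(f"HTML URLs: {html_urls}")

    if html_urls:
        doc_splits = clean_and_extract_html_data(html_urls, chunk_size=100, chunk_overlap=25)
        print(f"Produced {len(doc_splits)} chunks from {len(html_urls)} HTML page(s)")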