Spaces:
Running
Running
Didier Guillevic
committed on
Commit
·
1c18375
1
Parent(s):
3d4db34
Initial commit
Browse files- app.py +118 -0
- colbert_utils.py +44 -0
- dspy_utils.py +87 -0
- pdf_utils.py +180 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" app.py
|
2 |
+
|
3 |
+
Question / answer over a collection of PDF documents using late interaction
|
4 |
+
ColBERT model for retrieval and DSPy+Mistral for answer generation.
|
5 |
+
|
6 |
+
:author: Didier Guillevic
|
7 |
+
:date: 2024-12-22
|
8 |
+
"""
|
9 |
+
|
10 |
+
import gradio as gr
|
11 |
+
|
12 |
+
import logging
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
+
logging.basicConfig(level=logging.INFO)
|
15 |
+
|
16 |
+
import os
|
17 |
+
import pdf_utils # utilities for pdf processing
|
18 |
+
import colbert_utils # utilities for to build a ColBERT retrieval model
|
19 |
+
import dspy_utils # utilities for building a DSPy based retrieval generation model
|
20 |
+
|
21 |
+
from tqdm.notebook import tqdm
|
22 |
+
import warnings
|
23 |
+
warnings.filterwarnings('ignore')
|
24 |
+
|
25 |
+
|
26 |
+
def generate_response(question: str) -> tuple[str, str, str]:
    """Generate a response to a given question using the RAG model.

    Args:
        question: the user question to answer.

    Returns:
        (answer, references_html, snippets_html). If the RAG model has not
        been built yet, a placeholder message plus two empty strings, so the
        three Gradio output components always receive a value.
    """
    # The model is created lazily by the "build" button; use globals().get()
    # so an unbuilt (still undefined) model yields a friendly message
    # instead of a NameError.
    model = globals().get("dspy_rag_model")
    if model is None:
        return "RAG model not built. Please build the model first.", "", ""

    # k=5: number of passages retrieved to ground the answer.
    responses, references, snippets = model.generate_response(
        question=question, k=5, method='chain_of_thought')

    return responses, references, snippets
|
40 |
+
|
41 |
+
|
42 |
+
dspy_rag_model = None  # Global RAG model instance; set by build_rag_model().


def build_rag_model(file_paths: list[str]) -> str:
    """Build the retrieval (ColBERT) + generation (DSPy/Mistral) model.

    NOTE(review): the build button referenced `build_rag_model`, but no such
    function existed anywhere in the original file (NameError at startup).
    Reconstructed here from the pdf_utils / colbert_utils / dspy_utils
    interfaces — confirm against the intended behavior.

    Args:
        file_paths: paths of the uploaded PDF files to index.

    Returns:
        a human-readable build status message.
    """
    global dspy_rag_model

    if not file_paths:
        return "No PDF files provided."

    documents = []
    metadatas = []
    for file_path in file_paths:
        text = pdf_utils.get_text_from_pdf(file_path)
        if text is None:
            logger.warning("Skipping invalid PDF: %s", file_path)
            continue
        documents.append(text)
        metadatas.append(pdf_utils.get_metadata_info(file_path))

    if not documents:
        return "No valid PDF documents found."

    # Building the ColBERT index may take a few minutes.
    retrieval_model = colbert_utils.build_colbert_model(
        documents=documents, metadatas=metadatas)
    dspy_rag_model = dspy_utils.DSPyRagModel(retrieval_model)
    return f"Model built over {len(documents)} document(s)."


with gr.Blocks() as demo:
    gr.Markdown("""
    # Retrieval (ColBERT) + Generation (DSPy & Mistral)
    Note: building the retrieval model might take a few minutes.
    """)

    # Input files and build status
    with gr.Row():
        upload_files = gr.File(
            label="Upload PDF files to index", file_count="multiple",
            value=["OECD_Engaging_with_HNW_individuals_tax_compliance_(2009).pdf",],
            scale=5)
        build_status = gr.Textbox(label="Build status", placeholder="", scale=2)

    # Button triggering the (slow) index/model build.
    build_button = gr.Button("Build retrieval generation model", variant='primary')

    # Question to answer
    question = gr.Textbox(
        label="Question to answer",
        placeholder="How do tax administrations address aggressive tax planning by HNWIs?"
    )
    response = gr.Textbox(
        label="Response",
        placeholder=""
    )
    with gr.Accordion("References & snippets", open=False):
        references = gr.HTML(label="References")
        snippets = gr.HTML(label="Snippets")

    # Button triggering answer generation.
    response_button = gr.Button("Submit", variant='primary')

    # Example questions given default provided PDF file
    with gr.Accordion("Sample questions", open=False):
        gr.Examples(
            [
                ["What are the tax risks associated with high net worth individuals (HNWIs)?",],
                ["How do tax administrations address aggressive tax planning by HNWIs?",],
                ["How can tax administrations engage with HNWIs to improve tax compliance?",],
                ["What are the benefits of establishing dedicated HNWI units within tax administrations?",],
                ["How can international cooperation help address offshore tax risks associated with HNWIs?",],
            ],
            inputs=[question,],
            outputs=[response, references, snippets],
            fn=generate_response,
            cache_examples=False,
            label="Sample questions"
        )

    # Documentation
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - What
            - Retrieval augmented generation (RAG) model based on ColBERT and DSPy.
            - Retrieval base model: 'antoinelouis/colbert-xm' (multilingual model)
            - Generation framework: DSPy and Mistral.
        - How
            - Upload PDF files to index.
            - Build the retrieval augmented model (might take a few minutes)
            - Ask a question to generate a response.
        """)

    # Click actions
    build_button.click(
        fn=build_rag_model,
        inputs=[upload_files],
        outputs=[build_status]
    )
    response_button.click(
        fn=generate_response,
        inputs=[question],
        outputs=[response, references, snippets]
    )


demo.launch(show_api=False)
|
colbert_utils.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" colbert_utils.py
|
2 |
+
|
3 |
+
Utilities for building (and using) a ColBERT (retrieval) model.
|
4 |
+
|
5 |
+
:author: Didier Guillevic
|
6 |
+
:email: [email protected]
|
7 |
+
:creation: 2024-12-21
|
8 |
+
"""
|
9 |
+
|
10 |
+
import logging
|
11 |
+
logger = logging.getLogger(__name__)
|
12 |
+
logging.basicConfig(level=logging.INFO)
|
13 |
+
|
14 |
+
from ragatouille import RAGPretrainedModel
|
15 |
+
|
16 |
+
|
17 |
+
def build_colbert_model(
    documents: list[str],
    metadatas: list[dict[str, str]],
    pretrained_model: str='antoinelouis/colbert-xm',
    index_name: str='colbert_index'
) -> RAGPretrainedModel:
    """Create a ColBERT retrieval model and index the given documents.

    Args:
        documents: the texts to index.
        metadatas: one metadata dict per document.
        pretrained_model: name of the pretrained ColBERT checkpoint to load.
        index_name: name under which the index is stored.

    Returns:
        the RAGPretrainedModel with the given documents indexed.
    """
    rag_model = RAGPretrainedModel.from_pretrained(pretrained_model)

    indexing_options = dict(
        collection=documents,
        #document_ids=document_ids, # no unique IDs at the moment
        document_metadatas=metadatas,
        index_name=index_name,
        max_document_length=180,  # chunk size (tokens) when splitting
        split_documents=True,
        use_faiss=False  # cannot get it to work...
    )
    rag_model.index(**indexing_options)

    return rag_model
|
dspy_utils.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" dspy_utils.py
|
2 |
+
|
3 |
+
Utilities for building a DSPy based retrieval (augmented) generation model.
|
4 |
+
|
5 |
+
:author: Didier Guillevic
|
6 |
+
:email: [email protected]
|
7 |
+
:creation: 2024-12-21
|
8 |
+
"""
|
9 |
+
|
10 |
+
import os
|
11 |
+
import dspy
|
12 |
+
from ragatouille import RAGPretrainedModel
|
13 |
+
|
14 |
+
import logging
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
+
logging.basicConfig(level=logging.INFO)
|
17 |
+
|
18 |
+
|
19 |
+
class DSPyRagModel:
    """Retrieval-augmented generation over a ColBERT index.

    Wraps a RAGatouille retrieval model and a Mistral language model
    (driven through DSPy) to answer questions from retrieved passages.
    """

    def __init__(self, retrieval_model: RAGPretrainedModel):
        """Configure DSPy with the given retrieval model and a Mistral LM.

        Args:
            retrieval_model: an already-built RAGatouille retrieval model.

        Raises:
            KeyError: if the MISTRAL_API_KEY environment variable is unset.
        """
        # Init the retrieval and language model
        self.retrieval_model = retrieval_model
        self.language_model = dspy.LM(model="mistral/mistral-large-latest", api_key=os.environ["MISTRAL_API_KEY"])

        # Set dspy retrieval and language model.
        # NOTE(review): dspy.settings.configure mutates process-global DSPy
        # state, so the most recently constructed instance wins — confirm
        # this is intended if several instances may coexist.
        dspy.settings.configure(
            lm=self.language_model,
            rm=self.retrieval_model
        )

        # Set dspy prediction functions.
        # The Signature docstring and field descriptions below are consumed
        # by DSPy as part of the LM prompt — do not edit them casually.
        class BasicQA(dspy.Signature):
            """Answer the question given the context provided"""
            context = dspy.InputField(desc="may contain relevant facts")
            question = dspy.InputField()
            answer = dspy.OutputField(desc="Answer the given question.")

        # Very low temperature for near-deterministic answers.
        self.predict = dspy.Predict(BasicQA, temperature=0.01)
        self.predict_chain_of_thought = dspy.ChainOfThought(BasicQA)

    def generate_response(
        self,
        question: str,
        k: int=3,
        method: str = 'chain_of_thought'
    ) -> tuple[str, str, str]:
        """Generate a response to a given question using the specified method.

        Args:
            question: the question to answer
            k: number of passages to retrieve
            method: method for generating the response: ['simple', 'chain_of_thought']

        Returns:
            - the generated answer
            - (html string): the references (origin of the snippets of text used to generate the answer)
            - (html string): the snippets of text used to generate the answer

        Raises:
            ValueError: if method is not one of the supported values.
        """
        # Retrieval: each result is a dict exposing 'content' (passage text)
        # and 'document_metadata' entries.
        retrieval_results = self.retrieval_model.search(query=question, k=k)
        passages = [res.get('content') for res in retrieval_results]
        metadatas = [res.get('document_metadata') for res in retrieval_results]

        # Generate response given retrieved passages
        if method == 'simple':
            response = self.predict(context=passages, question=question).answer
        elif method == 'chain_of_thought':
            response = self.predict_chain_of_thought(context=passages, question=question).answer
        else:
            raise ValueError(f"Unknown method: {method}. Expected ['simple', 'chain_of_thought']")

        # Create HTML strings with the references and the text snippets.
        references = "<h4>References</h4>\n" + create_bulleted_list(metadatas)
        snippets = "<h4>Snippets</h4>\n" + create_bulleted_list(passages)

        return response, references, snippets
|
78 |
+
|
79 |
+
|
80 |
+
def create_bulleted_list(texts: list[str]) -> str:
    """Render a list of items as an HTML unordered list.

    Args:
        texts: items to render; non-string items (e.g. metadata dicts) are
            converted with str().

    Returns:
        an HTML string: one <li> per item inside a <ul>.
    """
    import html  # local import: stdlib HTML escaping

    # Escape each item: text extracted from PDFs may contain '<', '&', etc.
    # that would otherwise break — or inject markup into — the gr.HTML
    # components this string is rendered in.
    html_items = [f"<li>{html.escape(str(item))}</li>" for item in texts]
    return "<ul>" + "".join(html_items) + "</ul>"
|
pdf_utils.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" pdf_utils.py
|
2 |
+
|
3 |
+
Utilities for working with PDFs
|
4 |
+
|
5 |
+
:author: Didier Guillevic
|
6 |
+
:email: [email protected]
|
7 |
+
:creation: 2024-12-21
|
8 |
+
"""
|
9 |
+
|
10 |
+
import pypdf
|
11 |
+
import os
|
12 |
+
import datetime
|
13 |
+
import pytz
|
14 |
+
|
15 |
+
import logging
|
16 |
+
logger = logging.getLogger(__name__)
|
17 |
+
logging.basicConfig(level=logging.INFO)
|
18 |
+
|
19 |
+
|
20 |
+
def validate_pdf(file_path: str) -> bool:
    """Check that file_path points to an existing file with a .pdf extension.

    Args:
        file_path: path of the candidate PDF file.

    Returns:
        True when the file exists and its name ends in '.pdf'
        (case-insensitive); False otherwise (the reason is logged).
    """
    file_exists = os.path.exists(file_path)
    if not file_exists:
        logging.getLogger(__name__).error(f"File not found at path: {file_path}")
        return False

    has_pdf_extension = file_path.lower().endswith('.pdf')
    if not has_pdf_extension:
        logging.getLogger(__name__).error("File is not a PDF")
        return False

    return True
|
30 |
+
|
31 |
+
|
32 |
+
def get_text_from_pdf(
    file_path: str,
    max_chars: int = 100_000_000
) -> str:
    """Extract the text from a given PDF file.

    Args:
        file_path: path to the PDF file
        max_chars: maximum number of characters to extract from the file

    Returns:
        the extracted text, or None if the file is invalid or unreadable.
    """
    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            # Create PDF reader object
            pdf_reader = pypdf.PdfReader(file)

            num_pages = len(pdf_reader.pages)
            logger.info("Processing PDF with %d pages...", num_pages)

            extracted_text = []
            total_chars = 0

            # Iterate through all pages, accumulating text up to max_chars.
            for page_num in range(num_pages):
                text = pdf_reader.pages[page_num].extract_text()

                # Truncate and stop once the character budget is exhausted.
                if total_chars + len(text) > max_chars:
                    remaining_chars = max_chars - total_chars
                    extracted_text.append(text[:remaining_chars])
                    logger.info("Reached %d character limit at page %d",
                                max_chars, page_num + 1)
                    break

                extracted_text.append(text)
                total_chars += len(text)
                logger.debug("Processed page %d/%d", page_num + 1, num_pages)

            final_text = '\n'.join(extracted_text)
            logger.info("Extraction complete! Total characters: %d", len(final_text))
            return final_text

    except pypdf.errors.PdfReadError:
        # BUG FIX: PdfReadError lives in pypdf.errors, not at the package top
        # level; the original `except pypdf.PdfReadError` would itself raise
        # AttributeError while a read error was being handled.
        logger.error("Error: Invalid or corrupted PDF file")
        return None
    except Exception as e:
        # logger.exception records the traceback (was a bare print()).
        logger.exception("An unexpected error occurred: %s", e)
        return None
|
88 |
+
|
89 |
+
|
90 |
+
def get_pdf_metadata(file_path: str) -> dict:
    """Get the metadata of a given PDF file.

    Args:
        file_path: path to a PDF file

    Returns:
        dictionary with 'num_pages' and 'metadata' (the raw document info —
        pypdf may report it as None), or None on error.
    """
    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            metadata = {
                'num_pages': len(pdf_reader.pages),
                'metadata': pdf_reader.metadata
            }
            return metadata
    except Exception as e:
        # logger.exception records the traceback (was a bare print()),
        # consistent with the module-level logging setup.
        logger.exception("Error extracting metadata: %s", e)
        return None
|
113 |
+
|
114 |
+
|
115 |
+
def get_datetime_from_pdf_metadata(metadata: dict, key: str) -> str:
    """Extract a formatted datetime string from PDF metadata.

    Converts a PDF date string, e.g. "D:20210714143000+02'00'", into
    "2021-07-14 14:30:00" (the document's local wall-clock time).

    Args:
        metadata: dictionary with the metadata information (may be None:
            pypdf reports no document-info dictionary for some files)
        key: key to extract the datetime from (e.g. '/CreationDate')

    Returns:
        the formatted datetime string, or None if the key is missing or the
        value cannot be parsed.
    """
    # Guard: metadata can legitimately be None (see get_pdf_metadata).
    if not metadata or key not in metadata:
        return None

    # PDF date strings look like "D:YYYYMMDDHHmmSS" optionally followed by
    # a UTC offset such as "+02'00'", "-05'00'" or "Z".
    pdf_date_string = str(metadata[key])

    # Remove the 'D:' prefix (when present) and the quotes in the offset.
    date_string = pdf_date_string[2:] if pdf_date_string.startswith("D:") else pdf_date_string
    date_string = date_string.replace("'", "")

    # Parse the date and time components
    date_part = date_string[:8]
    time_part = date_string[8:14]
    offset_part = date_string[14:]

    try:
        dt = datetime.datetime.strptime(date_part + time_part, "%Y%m%d%H%M%S")
    except ValueError:
        logger.error("Unparseable PDF date string: %r", pdf_date_string)
        return None

    # Attach the timezone when an offset is present. BUG FIXES vs original:
    # - the sign was inverted ("+02'00'" became UTC-02:00);
    # - a missing offset or a bare 'Z' crashed with IndexError/ValueError;
    # - stdlib datetime.timezone replaces the deprecated pytz.localize idiom.
    # The wall-clock time — and hence the returned string — is unchanged.
    if len(offset_part) >= 5 and offset_part[0] in '+-':
        delta = datetime.timedelta(
            hours=int(offset_part[1:3]), minutes=int(offset_part[3:5]))
        if offset_part[0] == '-':
            delta = -delta
        dt = dt.replace(tzinfo=datetime.timezone(delta))
    elif offset_part[:1] in ('Z', 'z'):
        dt = dt.replace(tzinfo=datetime.timezone.utc)

    return dt.strftime("%Y-%m-%d %H:%M:%S")
|
158 |
+
|
159 |
+
|
160 |
+
def get_metadata_info(pdf_path: str) -> dict:
    """Build a dictionary with basic and additional information about a PDF file.

    Args:
        pdf_path: path to the PDF file

    Returns:
        dictionary with the metadata information; always contains
        'file_name', plus 'num_pages', 'creation_date' and
        'modification_date' when the PDF metadata could be read.
    """
    # basic information about the file
    metadata_info = {}
    metadata_info['file_name'] = os.path.basename(pdf_path)

    # additional information about the file
    pdf_metadata = get_pdf_metadata(pdf_path)
    if pdf_metadata:
        metadata_info['num_pages'] = pdf_metadata['num_pages']
        # Guard: pypdf can report the document-info dictionary as None, which
        # would make a `key in metadata` lookup raise TypeError downstream.
        document_metadata = pdf_metadata['metadata']
        if document_metadata:
            metadata_info['creation_date'] = get_datetime_from_pdf_metadata(document_metadata, '/CreationDate')
            metadata_info['modification_date'] = get_datetime_from_pdf_metadata(document_metadata, '/ModDate')

    return metadata_info
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
RAGatouille
|
3 |
+
dspy-ai
|
4 |
+
mistralai
|
5 |
+
litellm
|
6 |
+
pypdf
|
7 |
+
pytz
|