# LLM
# Ollama for local tests
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# Ref.: https://mistral.ai/news/mixtral-of-experts/#instructed-models
# Q5_K_M quantization flavor for best quality/recommended tradeoff (memory is no problem here)
# Ref.: https://huggingface.co./TheBloke/Mistral-7B-Instruct-v0.2-GGUF#provided-files
MISTRAL = "mistral:7b-instruct-v0.2-q5_K_M"
# Q4_K quantization flavor for best memory/quality/recommended tradeoff
# Ref.: https://huggingface.co./TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF#provided-files
# mixtral:8x7b-instruct-v0.1-q4_K_M was sadly still too big for my Mac
MIXTRAL = "mixtral:8x7b-instruct-v0.1-q3_K_L"
# Llama2 13B
# Ref.: https://huggingface.co./TheBloke/Llama-2-13B-GGUF
LLAMA2 = "llama2:13b-chat-q5_K_M"
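# NB: each model has to be pulled locally beforehand (e.g. `ollama pull mistral:7b-instruct-v0.2-q5_K_M`)
# so that the local Ollama server can serve it to the LangChain wrappers below.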
mistral = Ollama(
    model=MISTRAL,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    # Ref.: https://api.python.langchain.com/en/latest/llms/langchain_community.llms.ollama.Ollama.html#langchain_community.llms.ollama.Ollama.format
    # format="json"
)
mixtral = Ollama(
    model=MIXTRAL,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)
llama2 = Ollama(
    model=LLAMA2,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)
# LOAD
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
FILES = {
    'md': [
        # "Présentation modes dégradés-20230120_112423-Enregistrement de la réunion.md",
        "YouTube - Mode secours telephonie.md"
    ],
    'pdf': [
        # "SI-Samu_Fiche procédure_Mode dégradé_Perte de CRRA.pdf",
        # "[SI-Samu] Fiche mémo - Procédure Mode dégradé.pdf",
        "SI-Samu_Documentation_produit_SF4_J18HF2_20231219 - mode secours seul.pdf",
        # "SI-Samu_Documentation_produit_SF4_J18HF2_20231219.pdf"
    ]
}
def load_data(files):
    data = {'md': [], 'pdf': []}
    for pdf in files['pdf']:
        data['pdf'].extend(PyPDFLoader('resources/' + pdf).load())
    for md in files['md']:
        data['md'].extend(TextLoader('resources/' + md).load())
    return data
def to_full_data(data):
    return [
        *data['md'],
        *data['pdf']
    ]
# SPLIT
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
def split_MD_then_recursive(data):
    # - First use MarkDown title splitter on .MD and then RecursiveSplitter on all
    # MD splits
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
        ("#", "Titre 1"),
        ("##", "Titre 2"),
        ("###", "Titre 3"),
    ], strip_headers=False)
    md_header_splits = data['pdf'].copy()
    for md in data['md']:
        md_header_splits.extend(markdown_splitter.split_text(md.page_content))
    # Char-level splits
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50  # to improve results quality
    )
    # Split
    return text_splitter.split_documents(md_header_splits)
# EMBED
# Directly done in the different scripts
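# Illustrative sketch only: the calling scripts build their own LangChain `Embeddings` object
# and pass it to get_parent_ensemble_retriever below. The helper and the model name here are
# assumptions (not taken from those scripts); any embedding class exposing
# embed_documents / embed_query would work with the Chroma vectorstore.
from langchain_community.embeddings import OllamaEmbeddings
def get_embeddings(model="nomic-embed-text"):  # hypothetical helper, assumed model name
    # Computes embeddings through the local Ollama server
    return OllamaEmbeddings(model=model)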
# RETRIEVE
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever, BM25Retriever, EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# Ensemble is based on weight fusion (Reciprocal Rank Fusion) | Ref.: https://safjan.com/implementing-rank-fusion-in-python/
def get_parent_ensemble_retriever(embeddings, full_data, all_splits, k=4, parent_chunk_size=2000, child_chunk_size=400, collection_name="store"):
    # - ParentDocumentRetriever: embed small chunks but retrieve with bigger context
    # This text splitter is used to create the parent documents
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)
    # This text splitter is used to create the child documents
    # It should create documents smaller than the parent (don't go above 512 as most embeddings truncate after that)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)
    # The vectorstore to use to index the child chunks
    parent_vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings
    )
    # The storage layer for the parent documents
    parent_store = InMemoryStore()
    parent_retriever = ParentDocumentRetriever(
        vectorstore=parent_vectorstore,
        docstore=parent_store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
        search_kwargs={
            "k": k,
            # "score_threshold": 0.5
        },
        # search_type="mmr"
    )
    parent_retriever.add_documents(full_data)
    # - EnsembleRetriever
    # BM25 logic
    bm25_retriever = BM25Retriever.from_texts(
        list(map(lambda s: s.page_content, all_splits)),
        metadatas=list(map(lambda s: {"retriever": "BM25 sparse similarity", **s.metadata}, all_splits))
    )
    bm25_retriever.k = k
    # Ensemble of BM25 + vectorstore on parent retriever
    return EnsembleRetriever(
        retrievers=[parent_retriever, bm25_retriever], weights=[0.5, 0.5]
    )
# PROMPT
# Add more context to query + update system prompt to make it speak French
# Ref.: https://stackoverflow.com/questions/76554411/unable-to-pass-prompt-template-to-retrievalqa-in-langchain
# Ref.: https://community.openai.com/t/how-to-prevent-chatgpt-from-answering-questions-that-are-outside-the-scope-of-the-provided-context-in-the-system-role-message/112027/7
from langchain.prompts import PromptTemplate
template = """
System: You are helping a user of the "bandeau téléphonique SI-SAMU" (a CTI - Computer Telephony Integration - system) during a system failure, as they need to use the local backup phone.
Context information is below. Given the context information and not prior knowledge, answer the query.
Language: Answer in French and using "vous".
---
Context: {context}
---
Question: {question}
---
Réponse :
"""
PROMPT = PromptTemplate(template=template, input_variables=['question', 'context'])
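# Hedged sketch (not taken from the original scripts): one way to plug PROMPT into a chain,
# following the RetrievalQA + chain_type_kwargs approach from the StackOverflow ref above.
# The helper name and its defaults are illustrative.
from langchain.chains import RetrievalQA
def make_qa_chain(llm, retriever, prompt=PROMPT):  # hypothetical helper
    # "stuff" concatenates the retrieved documents into {context};
    # return_source_documents=True exposes them for parse_answer below.
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )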
# RESULTS
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
def parse_answer(answer):
    print(f">> {answer['query']}")
    print(f">> {answer['result']}")
    print(">> Sources :")
    for doc in answer['source_documents']:
        page = ''
        if 'page' in doc.metadata:
            page = f" (page {doc.metadata['page']})"
        source = ''
        if 'source' in doc.metadata:
            source = doc.metadata['source']
        titles = ['Titre 1', 'Titre 2', 'Titre 3']
        for title in titles:
            if title in doc.metadata:
                source += f" > {doc.metadata[title]}"
        retriever = "BM25" if 'retriever' in doc.metadata else "vectorstore"
        print(f">>> {color.BOLD}{source}{page} [{retriever}]{color.END}: {doc.page_content}\n---")
    print("--------\n\n")