# LLM
# Ollama for local tests
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Ref.: https://mistral.ai/news/mixtral-of-experts/#instructed-models
# Q5_K_M quantization flavor for the best quality/recommended tradeoff (memory is no problem here)
# Ref.: https://huggingface.co./TheBloke/Mistral-7B-Instruct-v0.2-GGUF#provided-files
MISTRAL = "mistral:7b-instruct-v0.2-q5_K_M"
# Q4_K quantization flavor for the best memory/quality/recommended tradeoff
# Ref.: https://huggingface.co./TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF#provided-files
# mixtral:8x7b-instruct-v0.1-q4_K_M was sadly still too big for my Mac
MIXTRAL = "mixtral:8x7b-instruct-v0.1-q3_K_L"
# Llama2 13B
# Ref.: https://huggingface.co./TheBloke/Llama-2-13B-GGUF
LLAMA2 = "llama2:13b-chat-q5_K_M"

mistral = Ollama(
    model=MISTRAL,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    # Ref.: https://api.python.langchain.com/en/latest/llms/langchain_community.llms.ollama.Ollama.html#langchain_community.llms.ollama.Ollama.format
    # format="json"
)
mixtral = Ollama(
    model=MIXTRAL,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)
llama2 = Ollama(
    model=LLAMA2,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)
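# Quick smoke test (a sketch; assumes an Ollama server is running locally and
# the model has been pulled, e.g. `ollama pull mistral:7b-instruct-v0.2-q5_K_M`):
# mistral.invoke("Bonjour !")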
# LOAD
# TextLoader keeps the raw Markdown (with its "#" headers) so that
# MarkdownHeaderTextSplitter can split on them later
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader

FILES = {
    'md': [
        # "Présentation modes dégradés-20230120_112423-Enregistrement de la réunion.md",
        "YouTube - Mode secours telephonie.md"
    ],
    'pdf': [
        # "SI-Samu_Fiche procédure_Mode dégradé_Perte de CRRA.pdf",
        # "[SI-Samu] Fiche mémo - Procédure Mode dégradé.pdf",
        "SI-Samu_Documentation_produit_SF4_J18HF2_20231219 - mode secours seul.pdf",
        # "SI-Samu_Documentation_produit_SF4_J18HF2_20231219.pdf"
    ]
}

def load_data(files):
    data = {'md': [], 'pdf': []}
    for pdf in files['pdf']:
        data['pdf'].extend(PyPDFLoader('resources/' + pdf).load())
    for md in files['md']:
        data['md'].extend(TextLoader('resources/' + md).load())
    return data

def to_full_data(data):
    return [
        *data['md'],
        *data['pdf']
    ]
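# Example usage (a minimal sketch; assumes the files listed in FILES are
# present under ./resources):
# data = load_data(FILES)
# full_data = to_full_data(data)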
# SPLIT
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter

def split_MD_then_recursive(data):
    # First split the .md files on Markdown headers, then apply the
    # RecursiveCharacterTextSplitter to all documents
    # MD splits
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
        ("#", "Titre 1"),
        ("##", "Titre 2"),
        ("###", "Titre 3"),
    ], strip_headers=False)
    md_header_splits = data['pdf'].copy()
    for md in data['md']:
        md_header_splits.extend(markdown_splitter.split_text(md.page_content))
    # Char-level splits
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50  # overlap improves results quality
    )
    # Split
    return text_splitter.split_documents(md_header_splits)
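# Example usage (sketch, continuing from load_data above):
# all_splits = split_MD_then_recursive(data)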
# EMBED
# Directly done in the different scripts
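# One possible setup, for reference (a sketch only; the actual embeddings are
# chosen in the calling scripts, and the model name below is an assumption):
# from langchain_community.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")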
# RETRIEVE
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever, BM25Retriever, EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Ensemble is based on weighted fusion (Reciprocal Rank Fusion)
# Ref.: https://safjan.com/implementing-rank-fusion-in-python/
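# For intuition, a minimal RRF sketch (an illustration only, not LangChain's
# internal implementation; assumes the standard rank constant k=60):
def rrf_fuse(rankings, k=60):
    # rankings: one ranked list of document ids per retriever
    scores = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            # each retriever contributes 1 / (k + rank) to a document's fused score
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    # best fused documents first
    return sorted(scores, key=scores.get, reverse=True)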
def get_parent_ensemble_retriever(embeddings, full_data, all_splits, k=4, parent_chunk_size=2000, child_chunk_size=400, collection_name="store"):
    # - ParentDocumentRetriever: embed small chunks but retrieve with bigger context
    # This text splitter is used to create the parent documents
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)
    # This text splitter is used to create the child documents
    # It should create documents smaller than the parent (don't go above 512 as most embeddings truncate beyond that)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)
    # The vectorstore to use to index the child chunks
    parent_vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings
    )
    # The storage layer for the parent documents
    parent_store = InMemoryStore()
    parent_retriever = ParentDocumentRetriever(
        vectorstore=parent_vectorstore,
        docstore=parent_store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
        search_kwargs={
            "k": k,
            # "score_threshold": 0.5
        },
        # search_type="mmr"
    )
    parent_retriever.add_documents(full_data)
    # - EnsembleRetriever
    # BM25 logic
    bm25_retriever = BM25Retriever.from_texts(
        list(map(lambda s: s.page_content, all_splits)),
        metadatas=list(map(lambda s: {"retriever": "BM25 sparse similarity", **s.metadata}, all_splits))
    )
    bm25_retriever.k = k
    # Ensemble of BM25 + vectorstore on parent retriever
    return EnsembleRetriever(
        retrievers=[parent_retriever, bm25_retriever], weights=[0.5, 0.5]
    )
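# Example usage (sketch; 'embeddings' comes from the calling script, as above):
# retriever = get_parent_ensemble_retriever(embeddings, full_data, all_splits)
# docs = retriever.get_relevant_documents("mode secours téléphonie")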
# PROMPT
# Add more context to the query + update the system prompt to make it speak French
# Ref.: https://stackoverflow.com/questions/76554411/unable-to-pass-prompt-template-to-retrievalqa-in-langchain
# Ref.: https://community.openai.com/t/how-to-prevent-chatgpt-from-answering-questions-that-are-outside-the-scope-of-the-provided-context-in-the-system-role-message/112027/7
from langchain import PromptTemplate

template = """
System: You are helping a user of the "bandeau téléphonique SI-SAMU" (a CTI - Computer Telephony Integration - system) during a system failure, as they need to use their local backup phone.
Context information is below. Given the context information and not prior knowledge, answer the query.
Language: Answer in French, using "vous".
---
Context: {context}
---
Question: {question}
---
Réponse :
"""
PROMPT = PromptTemplate(template=template, input_variables=['question', 'context'])
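# Example wiring (a sketch; RetrievalQA is the pattern from the StackOverflow
# ref above, and returns the 'query'/'result'/'source_documents' keys that
# parse_answer below expects):
# from langchain.chains import RetrievalQA
# qa = RetrievalQA.from_chain_type(
#     llm=mistral,
#     retriever=retriever,  # e.g. the ensemble retriever built above
#     return_source_documents=True,
#     chain_type_kwargs={"prompt": PROMPT},
# )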
# RESULTS
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

def parse_answer(answer):
    print(f">> {answer['query']}")
    print(f">> {answer['result']}")
    print(">> Sources :")
    for doc in answer['source_documents']:
        page = ''
        if 'page' in doc.metadata:
            page = f" (page {doc.metadata['page']})"
        source = ''
        if 'source' in doc.metadata:
            source = doc.metadata['source']
        titles = ['Titre 1', 'Titre 2', 'Titre 3']
        for title in titles:
            if title in doc.metadata:
                source += f" > {doc.metadata[title]}"
        # only the BM25 splits carry the "retriever" metadata key set above
        retriever = "BM25" if 'retriever' in doc.metadata else "vectorstore"
        print(f">>> {color.BOLD}{source}{page} [{retriever}]{color.END}: {doc.page_content}\n---")
    print("--------\n\n")