# LLM
# Ollama for local tests
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# Ref.: https://mistral.ai/news/mixtral-of-experts/#instructed-models
# Q5_K_M quantization flavor for best quality/recommended tradeoff (memory is no problem here)
# Ref.: https://huggingface.co./TheBloke/Mistral-7B-Instruct-v0.2-GGUF#provided-files
MISTRAL = "mistral:7b-instruct-v0.2-q5_K_M"
# Q4_K quantization flavor for best memory/quality/recommended tradeoff
# Ref.: https://huggingface.co./TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF#provided-files
# mixtral:8x7b-instruct-v0.1-q4_K_M was sadly still too big for my Mac
MIXTRAL = "mixtral:8x7b-instruct-v0.1-q3_K_L"
# Llama2 13B
# Ref.: https://huggingface.co./TheBloke/Llama-2-13B-GGUF
LLAMA2 = "llama2:13b-chat-q5_K_M"
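# NB: each model has to be pulled locally beforehand (e.g. `ollama pull mistral:7b-instruct-v0.2-q5_K_M`)
# so that the local Ollama server can serve it to the LangChain wrappers below.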
mistral = Ollama(
    model=MISTRAL,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    # Ref.: https://api.python.langchain.com/en/latest/llms/langchain_community.llms.ollama.Ollama.html#langchain_community.llms.ollama.Ollama.format
    # format="json"
)
mixtral = Ollama(
    model=MIXTRAL,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)
llama2 = Ollama(
    model=LLAMA2,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)
# LOAD
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
FILES = {
    'md': [
        # "Présentation modes dégradés-20230120_112423-Enregistrement de la réunion.md",
        "YouTube - Mode secours telephonie.md"
    ],
    'pdf': [
        # "SI-Samu_Fiche procédure_Mode dégradé_Perte de CRRA.pdf",
        # "[SI-Samu] Fiche mémo - Procédure Mode dégradé.pdf",
        "SI-Samu_Documentation_produit_SF4_J18HF2_20231219 - mode secours seul.pdf",
        # "SI-Samu_Documentation_produit_SF4_J18HF2_20231219.pdf"
    ]
}
def load_data(files):
    data = {'md': [], 'pdf': []}
    for pdf in files['pdf']:
        data['pdf'].extend(PyPDFLoader('resources/' + pdf).load())
    for md in files['md']:
        data['md'].extend(TextLoader('resources/' + md).load())
    return data
def to_full_data(data):
    return [
        *data['md'],
        *data['pdf']
    ]
# SPLIT
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
def split_MD_then_recursive(data):
    # - First use MarkDown title splitter on .MD and then RecursiveSplitter on all
    # MD splits
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
        ("#", "Titre 1"),
        ("##", "Titre 2"),
        ("###", "Titre 3"),
    ], strip_headers=False)
    md_header_splits = data['pdf'].copy()
    for md in data['md']:
        md_header_splits.extend(markdown_splitter.split_text(md.page_content))
    # Char-level splits
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50  # to improve results quality
    )
    # Split
    return text_splitter.split_documents(md_header_splits)
# EMBED
# Directly done in the different scripts
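# Illustrative sketch only: the calling scripts build their own LangChain `Embeddings` object
# and pass it to get_parent_ensemble_retriever below. The helper and the model name here are
# assumptions (not taken from those scripts); any embedding class exposing
# embed_documents / embed_query would work with the Chroma vectorstore.
from langchain_community.embeddings import OllamaEmbeddings
def get_embeddings(model="nomic-embed-text"):  # hypothetical helper, assumed model name
    # Computes embeddings through the local Ollama server
    return OllamaEmbeddings(model=model)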
# RETRIEVE
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever, BM25Retriever, EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# Ensemble is based on weight fusion (Reciprocal Rank Fusion) | Ref.: https://safjan.com/implementing-rank-fusion-in-python/
def get_parent_ensemble_retriever(embeddings, full_data, all_splits, k=4, parent_chunk_size=2000, child_chunk_size=400, collection_name="store"):
    # - ParentDocumentRetriever: embed small chunks but retrieve with bigger context
    # This text splitter is used to create the parent documents
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)
    # This text splitter is used to create the child documents
    # It should create documents smaller than the parent (don't go above 512 as most embeddings truncate after that)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)
    # The vectorstore to use to index the child chunks
    parent_vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings
    )
    # The storage layer for the parent documents
    parent_store = InMemoryStore()
    parent_retriever = ParentDocumentRetriever(
        vectorstore=parent_vectorstore,
        docstore=parent_store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
        search_kwargs={
            "k": k,
            # "score_threshold": 0.5
        },
        # search_type="mmr"
    )
    parent_retriever.add_documents(full_data)
    # - EnsembleRetriever
    # BM25 logic
    bm25_retriever = BM25Retriever.from_texts(
        list(map(lambda s: s.page_content, all_splits)),
        metadatas=list(map(lambda s: {"retriever": "BM25 sparse similarity", **s.metadata}, all_splits))
    )
    bm25_retriever.k = k
    # Ensemble of BM25 + vectorstore on parent retriever
    return EnsembleRetriever(
        retrievers=[parent_retriever, bm25_retriever], weights=[0.5, 0.5]
    )
# PROMPT
# Add more context to query + update system prompt to make it speak French
# Ref.: https://stackoverflow.com/questions/76554411/unable-to-pass-prompt-template-to-retrievalqa-in-langchain
# Ref.: https://community.openai.com/t/how-to-prevent-chatgpt-from-answering-questions-that-are-outside-the-scope-of-the-provided-context-in-the-system-role-message/112027/7
from langchain.prompts import PromptTemplate
template = """
System: You are helping a user of the "bandeau téléphonique SI-SAMU" (a CTI - Computer Telephony Integration - system) during a system failure, as they need to use the local backup phone.
Context information is below. Given the context information and not prior knowledge, answer the query.
Language: Answer in French and using "vous".
---
Context: {context}
---
Question: {question}
---
Réponse :
"""
PROMPT = PromptTemplate(template=template, input_variables=['question', 'context'])
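# Hedged sketch (not taken from the original scripts): one way to plug PROMPT into a chain,
# following the RetrievalQA + chain_type_kwargs approach from the StackOverflow ref above.
# The helper name and its defaults are illustrative.
from langchain.chains import RetrievalQA
def make_qa_chain(llm, retriever, prompt=PROMPT):  # hypothetical helper
    # "stuff" concatenates the retrieved documents into {context};
    # return_source_documents=True exposes them for parse_answer below.
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )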
# RESULTS
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
def parse_answer(answer):
    print(f">> {answer['query']}")
    print(f">> {answer['result']}")
    print(">> Sources :")
    for doc in answer['source_documents']:
        page = ''
        if 'page' in doc.metadata:
            page = f" (page {doc.metadata['page']})"
        source = ''
        if 'source' in doc.metadata:
            source = doc.metadata['source']
        titles = ['Titre 1', 'Titre 2', 'Titre 3']
        for title in titles:
            if title in doc.metadata:
                source += f" > {doc.metadata[title]}"
        retriever = "BM25" if 'retriever' in doc.metadata else "vectorstore"
        print(f">>> {color.BOLD}{source}{page} [{retriever}]{color.END}: {doc.page_content}\n---")
    print("--------\n\n")