from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type
from typing import Annotated, Sequence, TypedDict

import logging
import json
import os
from datetime import datetime
import hashlib
import csv
import requests
import re
import html
import markdown2
import torch
import sys
import gc
import time
import operator
import pprint

import gradio as gr
from pypinyin import lazy_pinyin
import tiktoken
import mdtex2html
from markdown import markdown
from pygments import highlight
from pygments.lexers import guess_lexer, get_lexer_by_name
from pygments.util import ClassNotFound  # ClassNotFound is defined in pygments.util, not pygments.lexers
from pygments.formatters import HtmlFormatter

from langchain.chains import LLMChain, RetrievalQA
from langgraph.graph import END, StateGraph
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, WebBaseLoader, UnstructuredWordDocumentLoader, DirectoryLoader
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.schema import AIMessage, HumanMessage, Document
from langchain.llms import HuggingFaceHub, HuggingFaceTextGenInference
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.tools import DuckDuckGoSearchRun
from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import hub
from langchain.output_parsers.openai_tools import PydanticToolsParser
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.vectorstores import Chroma
from langchain_core.messages import BaseMessage, FunctionMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnablePassthrough
from langchain_core.utils.function_calling import convert_to_openai_tool
from chromadb.errors import InvalidDimensionException
from openai import OpenAI  # used by create_picture below

import io
from PIL import Image, ImageDraw, ImageOps, ImageFont
import base64
from tempfile import NamedTemporaryFile

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from reportlab.lib.pagesizes import inch, A4
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm, mm
# reportlab's Image flowable is aliased so it does not shadow PIL.Image above
from reportlab.platypus import (SimpleDocTemplate, Frame, Spacer, Paragraph,
                                Image as RLImage, Table, ListFlowable, ListItem)


logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
)

# Phrases with which the LLM signals that it does not know an answer
ANTWORT_WEISS_NICHT = [
    "ich weiß nicht.",
    "ich weiß das nicht",
    "Ich habe dazu keine Antwort",
    "Ich bin nicht sicher",
    "Ich kann das nicht beantworten",
    "Es tut mir leid, aber ich kenne keinen",
    "Es tut mir leid, aber ich kann die Frage nicht beantworten.",
    "Es tut mir leid, aber ich kann die Frage nicht beantworten, da ich zu der Frage keine spezifischen Informationen habe",
]

template = """\Antworte in deutsch, wenn es nicht explizit anders gefordert wird. Wenn du die Antwort nicht kennst, antworte direkt, dass du es nicht weißt. |
|
Versuche nicht es zu umschreiben. Versuche nicht, die Antwort zu erfinden oder aufzumocken. Halte die Antwort kurz aber ausführlich genug und exakt.""" |
|
|
|
llm_template = "Beantworte die Frage am Ende. " + template + "Frage: {question} " |
|
|
|
llm_template2 = "Fasse folgenden Text als Überschrift mit maximal 3 Worten zusammen. Text: {question} " |
|
|
|
rag_template = "Nutze die folgenden Kontext (Beginnend mit dem Wort 'Kontext:') aus Teilen aus den angehängten Dokumenten, um die Frage (Beginnend mit dem Wort 'Frage: ') am Ende zu beantworten. Wenn du die Frage aus dem folgenden Kontext nicht beantworten kannst, dann versuche eine Beantwortung aus deinen eigenen trainierten Daten zu finden. Mache das kenntlich, ob du dich auf den hier angehängten Kontext beziehst oder ob du anhand deiner Daten antwortest." + template + "Kontext: {context} Frage: {question}" |
|
|
|
|
|
|
|
LLM_CHAIN_PROMPT = PromptTemplate(input_variables = ["question"], |
|
template = llm_template) |
|
|
|
LLM_CHAIN_PROMPT2 = PromptTemplate(input_variables = ["question"], |
|
template = llm_template2) |
|
|
|
RAG_CHAIN_PROMPT = PromptTemplate(input_variables = ["context", "question"], |
|
template = rag_template) |
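
# Illustrative sketch of how the templates above expand (context and question
# here are made-up placeholders):
#
#   beispiel = RAG_CHAIN_PROMPT.format(
#       context="Die Schule hat rund 900 Schülerinnen und Schüler.",
#       question="Wie viele Schülerinnen und Schüler hat die Schule?",
#   )
#   # -> one long instruction string with Kontext and Frage filled in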
|


# Working paths for the vector store, uploads and history
PATH_WORK = "."
CHROMA_DIR = "/chroma/kkg"
CHROMA_PDF = './chroma/kkg/pdf'
CHROMA_WORD = './chroma/kkg/word'
CHROMA_EXCEL = './chroma/kkg/excel'
YOUTUBE_DIR = "/youtube"
HISTORY_PFAD = "/data/history"

# Example sources
PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
WEB_URL = "https://openai.com/research/gpt-4"
YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"

# Web pages that are loaded into the vector store
urls = [
    "https://kkg.hamburg.de/unser-leitbild/",
    "https://kkg.hamburg.de/unsere-schulcharta/",
    "https://kkg.hamburg.de/koordination-unterrichtsentwicklung/",
    "https://kkg.hamburg.de/konzept-medien-und-it-am-kkg/",
]
|


def is_response_similar(response, threshold=0.7):
    # Only short answers are plausible "I don't know" replies
    if len(response) < 160:
        combined_responses = ANTWORT_WEISS_NICHT + [response]

        # TF-IDF vectors over the known phrases plus the new response
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(combined_responses)

        # cosine similarity of the response against each known phrase
        cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

        if np.max(cosine_similarities) > threshold:
            return True
    return False
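
# Usage sketch (illustrative):
#
#   is_response_similar("Ich bin mir da leider nicht sicher.")        # likely True
#   is_response_similar("Die Schule wurde im Jahr 1908 gegründet.")   # False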
|


def normalise_prompt(prompt):
    # lower-case the prompt
    prompt_klein = prompt.lower()
    # tokenise into words
    tokens = word_tokenize(prompt_klein)
    # keep alphanumeric tokens only
    tokens = [word for word in tokens if word.isalnum()]
    # strip any remaining non-word characters
    tokens = [re.sub(r'\W+', '', word) for word in tokens]
    # re-join into the normalised prompt string
    normalized_prompt = ' '.join(tokens)
    print("normalised prompt..................................")
    print(normalized_prompt)
    return normalized_prompt
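
# Usage sketch (illustrative):
#
#   normalise_prompt("Wie lautet das Leitbild der Schule?!")
#   # -> "wie lautet das leitbild der schule"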
|


def create_directory_loader(file_type, directory_path):
    # map file extensions to the matching LangChain loader class
    loaders = {
        '.pdf': PyPDFLoader,
        '.word': UnstructuredWordDocumentLoader,
    }
    return DirectoryLoader(
        path=directory_path,
        glob=f"**/*{file_type}",
        loader_cls=loaders[file_type],
    )
|


def document_loading_splitting():
    # loaders for the local PDF and Word directories
    pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
    word_loader = create_directory_loader('.word', CHROMA_WORD)

    pdf_documents = pdf_loader.load()
    word_documents = word_loader.load()

    # load the configured web pages and flatten the per-URL lists
    docs_web = [WebBaseLoader(url).load() for url in urls]
    docs_list = [item for sublist in docs_web for item in sublist]

    # append the local documents
    docs_list.extend(pdf_documents)
    docs_list.extend(word_documents)

    # split everything into overlapping chunks for the vector store
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap=250)
    doc_splits = text_splitter.split_documents(docs_list)

    return doc_splits
|


def document_storage_chroma(splits):
    # embed the chunks with OpenAI embeddings and persist them in Chroma
    vectorstore = Chroma.from_documents(
        documents=splits,
        collection_name="rag-chroma",
        embedding=OpenAIEmbeddings(disallowed_special=()),
        persist_directory=PATH_WORK + CHROMA_DIR,
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

    return vectorstore, retriever
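
# End-to-end usage sketch (assumes OPENAI_API_KEY is set and the source
# directories/URLs above are reachable):
#
#   splits = document_loading_splitting()
#   vectorstore, retriever = document_storage_chroma(splits)
#   docs = retriever.get_relevant_documents("Was steht in der Schulcharta?")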
|


"""
# Store the splits in MongoDB - vectorised...
def document_storage_mongodb(splits):
    MongoDBAtlasVectorSearch.from_documents(documents = splits,
                                            embedding = OpenAIEmbeddings(disallowed_special = ()),
                                            collection = MONGODB_COLLECTION,
                                            index_name = MONGODB_INDEX_NAME)

############################################
# Prepare the Chroma DB so documents can be stored in it vectorised
def document_retrieval_chroma(llm, prompt):
    # OpenAI embeddings -------------------------------
    embeddings = OpenAIEmbeddings()

    # HF embeddings -----------------------------------
    # Alternative embeddings for the vector store - the ...InstructEmbedding is computationally expensive
    # embeddings = HuggingFaceInstructEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
    # somewhat cheaper:
    # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})

    # Chroma DB to store the embeddings
    db = Chroma(embedding_function = embeddings, persist_directory = PATH_WORK + CHROMA_DIR)
    return db

############################################
# Prepare the Chroma DB so documents can be stored in it vectorised
# Second variant, matching rag_chain2 for generate_text_mit_bild - without having to fix the llm beforehand
def document_retrieval_chroma2():
    # OpenAI embeddings -------------------------------
    embeddings = OpenAIEmbeddings()

    # HF embeddings -----------------------------------
    # Alternative embeddings for the vector store - the ...InstructEmbedding is computationally expensive
    # embeddings = HuggingFaceInstructEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
    # somewhat cheaper:
    # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})
    # or simply without LangChain:
    # embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Chroma DB to store the embeddings
    db = Chroma(embedding_function = embeddings, persist_directory = PATH_WORK + CHROMA_DIR)
    print("Chroma DB bereit ...................")

    return db

###########################################
# Prepare the Mongo DB so documents can be stored in it vectorised
def document_retrieval_mongodb(llm, prompt):
    db = MongoDBAtlasVectorSearch.from_connection_string(MONGODB_URI,
                                                         MONGODB_DB_NAME + "." + MONGODB_COLLECTION_NAME,
                                                         OpenAIEmbeddings(disallowed_special = ()),
                                                         index_name = MONGODB_INDEX_NAME)
    return db
"""
|


def llm_chain(llm, prompt):
    # plain question answering with the default prompt
    llm_chain = LLMChain(llm=llm, prompt=LLM_CHAIN_PROMPT)
    result = llm_chain.run({"question": prompt})
    return result


def llm_chain2(llm, prompt):
    # headline generation: summarise the text in at most 3 words
    llm_chain = LLMChain(llm=llm, prompt=LLM_CHAIN_PROMPT2)
    result = llm_chain.run({"question": prompt})
    return result
|


def rag_chain(llm, prompt, vectorstore, retriever):
    # llm, vectorstore and retriever are accepted for API compatibility;
    # the graph nodes below work on module-level state
    workflow = StateGraph(GraphState)

    # nodes
    workflow.add_node("retrieve", retrieve)
    workflow.add_node("grade_documents", grade_documents)
    workflow.add_node("generate", generate)
    workflow.add_node("transform_query", transform_query)
    # marks the state as a second attempt before retrieving again
    workflow.add_node("retrieve_redirect", retrieve_redirect)

    # edges
    workflow.set_entry_point("retrieve")
    workflow.add_edge("retrieve", "grade_documents")
    workflow.add_conditional_edges(
        "grade_documents",
        decide_to_generate,
        {
            "transform_query": "transform_query",
            "generate": "generate",
        },
    )
    workflow.add_edge("transform_query", "retrieve_redirect")
    workflow.add_edge("retrieve_redirect", "retrieve")
    workflow.add_edge("generate", END)

    app = workflow.compile()

    inputs = {"keys": {"question": prompt}}
    for output in app.stream(inputs):
        for key, value in output.items():
            # log which node has just finished
            pprint.pprint(f"Node '{key}':")
        pprint.pprint("\n---\n")

    # the final state carries the generated answer
    return value['keys']['generation']
|


def rag_chain2(prompt, db, k=3):
    # build a plain prompt that inlines the k most similar chunks
    rag_template = "Nutze die folgenden Kontext-Teile am Ende, um die Frage zu beantworten. " + template + " Frage: " + prompt + " Kontext-Teile: "
    retrieved_chunks = db.similarity_search(prompt, k)

    neu_prompt = rag_template
    for i, chunk in enumerate(retrieved_chunks):
        neu_prompt += f"{i+1}. {chunk}\n"

    return neu_prompt
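
# Usage sketch (illustrative; `db` is a Chroma store as returned by
# document_storage_chroma):
#
#   erweiterter_prompt = rag_chain2("Wie lautet das Leitbild?", vectorstore, k=3)
#   # -> instruction text with the 3 most similar chunks appended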
|


def generate_prompt_with_history(text, history, max_length=4048):
    # flatten the (user, assistant) pairs into one prompt string,
    # most recent turn last
    prompt = ""
    history = ["\n{}\n{}".format(x[0], x[1]) for x in history]
    history.append("\n{}\n".format(text))
    history_text = ""
    flag = False
    for x in history[::-1]:
        history_text = x + history_text
        flag = True
    print("hist+prompt: ")
    print(history_text)
    if flag:
        return prompt + history_text
    else:
        return None
|


def generate_prompt_with_history_openai(prompt, history):
    # convert the (user, assistant) pairs into the OpenAI chat message format
    history_openai_format = []
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})

    history_openai_format.append({"role": "user", "content": prompt})
    print("openai history und prompt................")
    print(history_openai_format)
    return history_openai_format


def generate_prompt_with_history_hf(prompt, history):
    # convert the history into the <human>/<bot> format used by HF models
    history_transformer_format = history + [[prompt, ""]]

    messages = "".join(["".join(["\n<human>:" + item[0], "\n<bot>:" + item[1]])
                        for item in history_transformer_format])
    return messages


def generate_prompt_with_history_langchain(prompt, history):
    # convert the history into LangChain message objects
    history_langchain_format = []
    for human, ai in history:
        history_langchain_format.append(HumanMessage(content=human))
        history_langchain_format.append(AIMessage(content=ai))
    history_langchain_format.append(HumanMessage(content=prompt))

    return history_langchain_format
|


def process_image(image_path, prompt, model_image, oai_key):
    # base64-encode the image for the OpenAI vision API
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode('utf-8')

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {oai_key}"
    }
    payload = {
        "model": model_image,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": llm_template + prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_string}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }
    return headers, payload
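
# Usage sketch (illustrative): process_image only builds the request; the
# caller still has to POST it to the chat completions endpoint, e.g.:
#
#   headers, payload = process_image("bild.jpg", "Was ist zu sehen?",
#                                    "gpt-4-vision-preview", oai_key)
#   antwort = requests.post("https://api.openai.com/v1/chat/completions",
#                           headers=headers, json=payload)
#   print(antwort.json()["choices"][0]["message"]["content"])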
|


def process_chatverlauf(prompt, model, oai_key):
    # only the first 50 characters are needed for a headline
    if len(prompt) > 50:
        prompt = prompt[:50]

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {oai_key}"
    }
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": 'Gib folgendem Text eine Überschrift mit maximal 2 Worten: ' + prompt
                    },
                ]
            }
        ],
        "max_tokens": 100
    }
    return headers, payload


def process_chatverlauf_hf(history, llm):
    # let the HF model create a short headline for the conversation
    prompt_input = generate_prompt_with_history("Gib folgendem Text eine Überschrift mit maximal 3 Worten", history)
    result = llm_chain2(llm, prompt_input)
    return result
|


def save_and_download(chat_history):
    # write the chat history to a temp file and return its path for download
    with NamedTemporaryFile(delete=False, mode="w", suffix=".txt", dir="./temp") as tmp:
        temp_file_path = tmp.name
        tmp.write(chat_history)
    return temp_file_path


def cleanup(file_path):
    if os.path.exists(file_path):
        os.remove(file_path)
|


def markdown_to_html_with_syntax_highlight(md_str):
    def replacer(match):
        lang = match.group(1) or "text"
        code = match.group(2)
        lang = lang.strip()

        # no language tag given: try to guess one from the code itself
        if lang == "text":
            lexer = guess_lexer(code)
            lang = lexer.name

        try:
            lexer = get_lexer_by_name(lang, stripall=True)
        except ValueError:
            lexer = get_lexer_by_name("python", stripall=True)
        formatter = HtmlFormatter()

        highlighted_code = highlight(code, lexer, formatter)

        return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'

    code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```"
    md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE)

    html_str = markdown(md_str)
    return html_str
|


def normalize_markdown(md_text: str) -> str:
    lines = md_text.split("\n")
    normalized_lines = []
    inside_list = False

    for i, line in enumerate(lines):
        if re.match(r"^(\d+\.|-|\*|\+)\s", line.strip()):
            # ensure a blank line before a list starts
            if not inside_list and i > 0 and lines[i - 1].strip() != "":
                normalized_lines.append("")
            inside_list = True
            normalized_lines.append(line)
        elif inside_list and line.strip() == "":
            # drop blank lines inside a list unless the list ends here
            if i < len(lines) - 1 and not re.match(
                r"^(\d+\.|-|\*|\+)\s", lines[i + 1].strip()
            ):
                normalized_lines.append(line)
            continue
        else:
            inside_list = False
            normalized_lines.append(line)

    return "\n".join(normalized_lines)
|


# Marker appended to converted HTML so that already-converted messages are not
# converted twice. The value is an assumption: the constant is referenced
# below but was not defined anywhere in this file.
ALREADY_CONVERTED_MARK = "<!-- ALREADY CONVERTED BY PARSER. -->"


def convert_mdtext(md_text):
    code_block_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
    inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL)
    code_blocks = code_block_pattern.findall(md_text)
    non_code_parts = code_block_pattern.split(md_text)[::2]

    result = []
    for non_code, code in zip(non_code_parts, code_blocks + [""]):
        if non_code.strip():
            non_code = normalize_markdown(non_code)
            if inline_code_pattern.search(non_code):
                result.append(markdown(non_code, extensions=["tables"]))
            else:
                result.append(mdtex2html.convert(non_code, extensions=["tables"]))
        if code.strip():
            code = f"\n```{code}\n\n```"
            code = markdown_to_html_with_syntax_highlight(code)
            result.append(code)
    result = "".join(result)
    result += ALREADY_CONVERTED_MARK
    return result


def convert_asis(userinput):
    # escape the input and preserve whitespace exactly as typed
    return f"<p style=\"white-space:pre-wrap;\">{html.escape(userinput)}</p>" + ALREADY_CONVERTED_MARK


def detect_converted_mark(userinput):
    return userinput.endswith(ALREADY_CONVERTED_MARK)
|


def detect_language(code):
    # the first line of a fenced block optionally names the language
    if code.startswith("\n"):
        first_line = ""
    else:
        first_line = code.strip().split("\n", 1)[0]
    language = first_line.lower() if first_line else ""
    code_without_language = code[len(first_line):].lstrip() if first_line else code
    return language, code_without_language
|


def convert_to_markdown(text):
    # escape "$" as an HTML entity so the math renderer does not treat
    # ordinary chat text as LaTeX
    text = text.replace("$", "&#36;")

    def replace_leading_tabs_and_spaces(line):
        # leading tabs/spaces are emitted as HTML entities so that
        # indentation survives markdown rendering
        new_line = []

        for char in line:
            if char == "\t":
                new_line.append("&#9;")
            elif char == " ":
                new_line.append("&nbsp;")
            else:
                break
        return "".join(new_line) + line[len(new_line):]

    markdown_text = ""
    lines = text.split("\n")
    in_code_block = False

    for line in lines:
        if in_code_block is False and line.startswith("```"):
            in_code_block = True
            markdown_text += f"{line}\n"
        elif in_code_block is True and line.startswith("```"):
            in_code_block = False
            markdown_text += f"{line}\n"
        elif in_code_block:
            markdown_text += f"{line}\n"
        else:
            line = replace_leading_tabs_and_spaces(line)
            # escape a leading "#" so it is not rendered as a heading
            line = re.sub(r"^(#)", r"\\\1", line)
            markdown_text += f"{line}  \n"

    return markdown_text
|


def add_language_tag(text):
    def detect_lang(code_block):
        try:
            lexer = guess_lexer(code_block)
            return lexer.name.lower()
        except ClassNotFound:
            return ""

    code_block_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE)

    def replacement(match):
        code_block = match.group(2)
        if match.group(2).startswith("\n"):
            # no language tag present: guess one from the code
            language = detect_lang(code_block)
            if language:
                return f"```{language}{code_block}```"
            else:
                return f"```\n{code_block}```"
        else:
            return match.group(1) + code_block + "```"

    text2 = code_block_pattern.sub(replacement, text)
    return text2
|


def delete_last_conversation(chatbot, history):
    if len(chatbot) > 0:
        chatbot.pop()

    if len(history) > 0:
        history.pop()

    return (
        chatbot,
        history,
        "Delete Done",
    )


def reset_state():
    return [], [], "Reset Done"


def reset_textbox():
    return gr.update(value=""), ""


def cancel_outputing():
    return "Stop Done"
|


def analyze_file(file):
    # return the file extension, e.g. "pdf" or "docx"
    file_extension = file.name.split('.')[-1]
    return file_extension


def get_filename(file_pfad):
    parts = file_pfad.rsplit('/', 1)
    if len(parts) == 2:
        result = parts[1]
    else:
        result = "Ein Fehler im Filenamen ist aufgetreten..."
    return result
|


def submit_message(assistant_id, thread, client, user_message):
    # add the user message to the thread and start a run
    client.beta.threads.messages.create(
        thread_id=thread.id, role="user", content=user_message
    )
    return client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=assistant_id,
    )


def get_response(thread, client, assi_id):
    return client.beta.threads.messages.list(thread_id=thread.id, order="asc")


def create_thread_and_run(user_input, client, assi_id):
    thread = client.beta.threads.create()
    run = submit_message(assi_id, thread, client, user_input)
    return thread, run


def pretty_print(messages):
    print("# Messages")
    for m in messages:
        print(f"{m.role}: {m.content[0].text.value}")
    print()


def wait_on_run(run, thread, client):
    # poll until the run has left the queued/in_progress states
    while run.status == "queued" or run.status == "in_progress":
        run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id,
        )
        time.sleep(0.5)
    return run
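
# Usage sketch for the Assistants helpers above (assumes an OpenAI client and
# an existing assistant id):
#
#   thread, run = create_thread_and_run("Wie lautet das Leitbild?", client, assi_id)
#   run = wait_on_run(run, thread, client)
#   pretty_print(get_response(thread, client, assi_id))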
|


def tavily_search(tavily_client, query):
    # deep web search via the Tavily API, capped at 8000 tokens of context
    search_result = tavily_client.get_search_context(query, search_depth="advanced", max_tokens=8000)
    return search_result


def hugchat_search(chatbot, query):
    # web search via HuggingChat; returns the answer text and the source link
    search_result = chatbot.query(query, web_search=True)

    return search_result.text, search_result.link
|


def openai_assistant_suche(client):
    # assistant with a tavily_search function tool for questions the model
    # cannot answer on its own
    assistant = client.beta.assistants.create(
        instructions=template,
        model="gpt-4-1106-preview",
        tools=[{
            "type": "function",
            "function": {
                "name": "tavily_search",
                "description": "Get information on recent events from the web.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {"type": "string", "description": "Die Suchanfrage, die die KI nicht beantworten konnte, hier hinein"},
                    },
                    "required": ["query"]
                }
            }
        }]
    )
    return assistant
|


def create_picture(history, prompt):
    # generate an image with DALL-E 3 and return it as a PIL image
    client = OpenAI()
    response = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024", quality="standard", n=1)
    image_url = response.data[0].url

    # download the generated image
    response2 = requests.get(image_url)
    image = Image.open(io.BytesIO(response2.content))
    return image
|


"""
# Detect bullet lists in the history and render them as a real list
def erkennen_und_formatieren_von_aufzaehlungen_backup(text, styles):
    # recognise bullet characters or numbering
    aufzaehlungszeichen = ['-', '*', '•']
    nummerierung = [f'{i}.' for i in range(1, 11)]  # simple numbering detection
    nummerierung2 = [f'{i}. ' for i in range(1, 11)]
    nummerierung3 = [f' {i}. ' for i in range(1, 11)]

    zeilen = text.split('\n')
    list_items = []
    for zeile in zeilen:
        # check whether the line starts with a bullet character or numbering
        if any(zeile.lstrip().startswith(zeichen) for zeichen in aufzaehlungszeichen + nummerierung + nummerierung2 + nummerierung3):
            # strip the bullet/numbering for display
            for zeichen in aufzaehlungszeichen + nummerierung + nummerierung2 + nummerierung3:
                if zeile.lstrip().startswith(zeichen):
                    zeile = zeile.lstrip()[len(zeichen):].lstrip()
                    break
            list_items.append(ListItem(Paragraph(zeile, styles['BodyText'])))
        else:
            # stop as soon as a line is not part of a list
            break
    if list_items:
        # a list was detected
        return ListFlowable(list_items, bulletType='bullet', start='bulletchar', bulletFontName='Helvetica')
    else:
        # no list detected, return a normal paragraph
        return Paragraph(text, styles['BodyText'])


# Detect bullet lists in the history and render them as a real list
def erkennen_und_formatieren_von_aufzaehlungen(text, styles):
    # recognise bullet characters or numbering
    aufzaehlungszeichen = ['-', '*', '•']
    # regular expression for numbering (e.g. "1.", "2."), allowing optional leading whitespace
    nummerierung_regex = r"^\s*\d+\.\s*"
    zeilen = text.split('\n')
    list_items = []
    for zeile in zeilen:
        # check whether the line starts with a bullet character or numbering
        print("zeile:.............................")
        print(zeile)
        if any(zeile.lstrip().startswith(zeichen) for zeichen in aufzaehlungszeichen) or re.match(nummerierung_regex, zeile.lstrip()):
            # strip the bullet/numbering for display
            if re.match(nummerierung_regex, zeile.lstrip()):
                cleaned_line = re.sub(nummerierung_regex, '', zeile.lstrip(), 1).lstrip()
            else:
                for zeichen in aufzaehlungszeichen:
                    if zeile.lstrip().startswith(zeichen):
                        cleaned_line = zeile.lstrip()[len(zeichen):].lstrip()
                        break
            print(cleaned_line)
            list_items.append(ListItem(Paragraph(cleaned_line, styles['BodyText'])))
        else:
            # if the line is not part of a list, stop; treat the whole text
            # as a normal paragraph when no list was detected at all
            if not list_items:
                return Paragraph(text, styles['BodyText'])
            break
    if list_items:
        # a list was detected
        return ListFlowable(list_items, bulletType='bullet', start='bulletchar', bulletFontName='Helvetica')
    else:
        # no list detected, return a normal paragraph
        return Paragraph(text, styles['BodyText'])
"""
|


def verarbeite_text_und_aufzaehlungen(text, styles):
    # bullet characters to recognise
    aufzaehlungszeichen = ['-', '*', '•']
    # regular expression for numbering (e.g. "1.", "2."), allowing optional leading whitespace
    nummerierung_regex = r"^\s*\d+\.\s*"

    zeilen = text.split('\n')
    elements = []
    list_items = []
    paragraph_text = []

    for zeile in zeilen:
        if any(zeile.lstrip().startswith(zeichen) for zeichen in aufzaehlungszeichen) or re.match(nummerierung_regex, zeile.lstrip()):
            # flush any pending paragraph before the list starts
            if paragraph_text:
                elements.append(Paragraph(' '.join(paragraph_text), styles['BodyText']))
                paragraph_text = []

            # strip the bullet/numbering for display
            if re.match(nummerierung_regex, zeile.lstrip()):
                cleaned_line = re.sub(nummerierung_regex, '', zeile.lstrip(), 1).lstrip()
            else:
                for zeichen in aufzaehlungszeichen:
                    if zeile.lstrip().startswith(zeichen):
                        cleaned_line = zeile.lstrip()[len(zeichen):].lstrip()
                        break
            list_items.append(ListItem(Paragraph(cleaned_line, styles['BodyText'])))
        else:
            # flush any pending list before normal text continues
            if list_items:
                elements.append(ListFlowable(list_items, bulletType='bullet', start='bulletchar', bulletFontName='Helvetica'))
                list_items = []
            paragraph_text.append(zeile)

    # flush whatever is still pending at the end
    if paragraph_text:
        elements.append(Paragraph(' '.join(paragraph_text), styles['BodyText']))
    if list_items:
        elements.append(ListFlowable(list_items, bulletType='bullet', start='bulletchar', bulletFontName='Helvetica'))

    return elements
|


def on_each_page(canvas, doc):
    # stamp the current date into the top right corner of a page
    page_width, page_height = A4
    canvas.saveState()
    canvas.setFont('Times-Roman', 10)

    current_date = datetime.now().strftime("%Y-%m-%d")
    print(current_date)

    canvas.drawRightString(page_width - 72, page_height - 28, current_date)
    canvas.restoreState()
|


def erstellePdf(file_path_download, ueberschrift, dic_history):
    # flowables that make up the PDF
    elements = []

    paper_size = A4

    styles = getSampleStyleSheet()

    new_style = ParagraphStyle('NewStyle', fontName='Helvetica', fontSize=11)
    styles.add(new_style)

    # thin separator line between conversation turns
    line_style = ParagraphStyle('LineStyle', fontSize=4, leading=6, borderPadding=0,
                                spaceBefore=0, spaceAfter=0, textColor='black')
    list_style = getSampleStyleSheet()

    title = Paragraph(ueberschrift, styles['Title'])
    headline_nutzer = Paragraph('Nutzer:', styles['Heading3'])
    headline_assi = Paragraph('Assistent:', styles['Heading3'])

    elements.append(title)
    for nutzer, assi in dic_history.items():
        elements.append(headline_nutzer)
        p = Paragraph(nutzer, styles['NewStyle'])
        elements.append(p)

        elements.append(Spacer(1, 2*mm))
        elements.append(headline_assi)
        # assistant answers may contain bullet lists; render them properly
        element_check = verarbeite_text_und_aufzaehlungen(assi, list_style)

        for elem in element_check:
            if isinstance(elem, list):
                elements.extend(elem)
            else:
                elements.append(elem)

        elements.append(Spacer(1, 8*mm))
        # horizontal separator line
        elements.append(Paragraph('_' * 100, line_style))
        elements.append(Spacer(1, 8*mm))

    # CustomDocTemplate stamps the date via handle_pageBegin, so no onPage
    # callbacks need to be passed to build()
    doc = CustomDocTemplate(file_path_download, pagesize=paper_size)
    doc.build(elements)
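
# Usage sketch (illustrative):
#
#   erstellePdf("chatverlauf.pdf", "Mein Chatverlauf",
#               {"Wie lautet das Leitbild?": "Das Leitbild lautet ..."})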
|


def hash_input(input_string):
    # stable SHA-256 hex digest, e.g. for recognising repeated inputs
    return hashlib.sha256(input_string.encode()).hexdigest()
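
# Usage sketch: identical input always yields the identical 64-character key.
#
#   key = hash_input("Wie lautet das Leitbild?")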
|


def transfer_input(inputs):
    # hand the textbox content to the pipeline, clear the textbox and show the cancel button
    return (
        inputs,
        gr.update(value=""),
        gr.Button.update(visible=True),
    )
|


class State:
    # global flag used to interrupt streaming output
    interrupted = False

    def interrupt(self):
        self.interrupted = True

    def recover(self):
        self.interrupted = False


shared_state = State()
|


def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    # True if s ends with a stop word or with any prefix of one; the prefix
    # check lets streaming output stop before the stop word is complete
    for stop_word in stop_words:
        if s.endswith(stop_word):
            return True
        for i in range(1, len(stop_word)):
            if s.endswith(stop_word[:i]):
                return True
    return False
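
# Usage sketch:
#
#   is_stop_word_or_prefix("Antwort ... <hu", ["<human>:"])   # True (prefix)
#   is_stop_word_or_prefix("Antwort fertig.", ["<human>:"])   # False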
|


class CustomDocTemplate(SimpleDocTemplate):
    def handle_pageBegin(self):
        # default page setup first
        self._handle_pageBegin()

        # then stamp the current date into the top right corner
        self.canv.saveState()
        self.canv.setFont('Helvetica', 10)
        current_date = datetime.now().strftime("%Y-%m-%d")
        self.canv.drawRightString(550, 800, current_date)
        self.canv.restoreState()
|


class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        keys: A dictionary where each key is a string.
    """

    keys: Dict[str, Any]
|


def retrieve(state):
    """
    Retrieve documents

    Args:
        state (dict): The current graph state
    Returns:
        state (dict): New keys added to state: documents (the retrieved
        documents) and second_trial (whether this is the second attempt)
    """
    print("---RETRIEVE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    # NOTE: relies on a module-level `retriever` being initialised beforehand
    documents = retriever.get_relevant_documents(question)
    if 'second_trial' in state_dict:
        print("second time")
        second_trial = "ja"
    else:
        print("first time")
        second_trial = "nein"
    return {"keys": {"documents": documents, "second_trial": second_trial, "question": question}}
|


def retrieve_redirect(state):
    """
    Retrieve redirect (used when, after transform_query, retrieval should run again)

    Args:
        state (dict): The current graph state
    Returns:
        state (dict): New key added to state: second_trial
    """
    print("---RETRIEVE REDIRECT---")
    second_trial = "ja"
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    return {"keys": {"documents": documents, "second_trial": second_trial, "question": question}}
|


def generate(state):
    """
    Generate answer

    Args:
        state (dict): The current graph state
    Returns:
        state (dict): New key added to state, generation, that contains LLM generation
    """
    print("---GENERATE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    # standard RAG prompt from the LangChain hub
    prompt = hub.pull("rlm/rag-prompt")

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3, streaming=True)

    # prompt -> model -> plain string output
    rag_chain = prompt | llm | StrOutputParser()

    generation = rag_chain.invoke({"context": documents, "question": question})
    return {
        "keys": {"documents": documents, "question": question, "generation": generation}
    }
|


def generate_ohne(state):
    """
    Generate answer without retrieved documents

    Args:
        state (dict): The current graph state
    Returns:
        state (dict): New key added to state, generation, that contains LLM generation
    """
    print("---GENERATE OHNE---")
    state_dict = state["keys"]
    question = state_dict["question"]

    prompt = PromptTemplate(
        template="""Antworte in deutsch, wenn es nicht explizit anders gefordert wird. Wenn du die Antwort nicht kennst, antworte direkt, dass du es nicht weißt.
        Versuche nicht, sie zu umschreiben. Versuche nicht, die Antwort zu erfinden oder aufzumocken. Halte die Antwort kurz, aber ausführlich genug und exakt. \n\n
        Hier ist die Useranfrage: {question} """,
        input_variables=["question"])

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3, streaming=True)

    # prompt -> model -> plain string output
    llm_chain = prompt | llm | StrOutputParser()

    generation = llm_chain.invoke({"question": question})
    return {
        "keys": {"question": question, "generation": generation}
    }
|


def grade_documents(state):
    """
    Determines whether the retrieved documents are relevant to the question.

    Args:
        state (dict): The current graph state
    Returns:
        state (dict): Updates documents key with relevant documents
    """

    print("---CHECK RELEVANCE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    second_trial = state_dict["second_trial"]

    class grade(BaseModel):
        """Binary score for relevance check."""
        binary_score: str = Field(description="Relevanz Bewertung 'ja' oder 'nein'")

    model = ChatOpenAI(temperature=0.3, model="gpt-4-0125-preview", streaming=True)

    # convert the Pydantic model into an OpenAI tool definition
    grade_tool_oai = convert_to_openai_tool(grade)

    # force the model to call the grading tool
    llm_with_tool = model.bind(
        tools=[grade_tool_oai],
        tool_choice={"type": "function", "function": {"name": "grade"}},
    )

    parser_tool = PydanticToolsParser(tools=[grade])

    prompt = PromptTemplate(
        template="""Du bist ein Bewerter, der die Relevanz von einem erhaltenen Dokument zu einer Nutzeranfrage bewerten soll. \n
        Hier ist das erhaltene Dokument: \n\n {context} \n\n
        Hier ist die Nutzeranfrage: {question} \n
        Wenn das erhaltene Dokument Keywörter oder semantische Bedeutung in Bezug auf die Nutzeranfrage hat, bewerte es als relevant. \n
        Gib eine binäre Bewertung von 'ja' oder 'nein', um anzuzeigen, ob das Dokument relevant zur Nutzeranfrage ist oder nicht.""",
        input_variables=["context", "question"],
    )

    chain = prompt | llm_with_tool | parser_tool

    filtered_docs = []
    anzahl_relevant = 0
    search = "nein"
    for d in documents:
        score = chain.invoke({"question": question, "context": d.page_content})
        bewertung = score[0].binary_score
        if bewertung == "ja":
            print("---Bewertung: Dokument ist relevant---")
            anzahl_relevant = anzahl_relevant + 1
            filtered_docs.append(d)
        else:
            print("---Bewertung: Dokument irrelevant---")
            search = "ja"
            continue

    # if at least half of the documents are relevant, no second search is needed
    if anzahl_relevant >= len(documents) / 2:
        search = "nein"
    print("second trial grade_docs:.....................")
    print(second_trial)
    return {
        "keys": {
            "documents": filtered_docs,
            "question": question,
            "search_again": search,
            "second_trial": second_trial
        }
    }
|


def transform_query(state):
    """
    Transform the query to produce a better question.

    Args:
        state (dict): The current graph state
    Returns:
        state (dict): Updates question key with a re-phrased question
    """

    print("---TRANSFORM QUERY---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    prompt = PromptTemplate(
        template="""Du generierst Fragen, die optimiert sind für das Retrieval von Dokumenten. \n
        Schaue auf den Input und versuche, die zugrundeliegende Absicht / Bedeutung zu bewerten. \n
        Hier ist die ursprüngliche Frage:
        \n ------- \n
        {question}
        \n ------- \n
        Formuliere eine verbesserte Frage: """,
        input_variables=["question"],
    )

    model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True)

    chain = prompt | model | StrOutputParser()
    better_question = chain.invoke({"question": question})
    second_trial = "ja"

    return {"keys": {"documents": documents, "question": better_question, "second_trial": second_trial}}
|


def web_search(state):
    """
    Web search based on the re-phrased question using Tavily API.

    Args:
        state (dict): The current graph state
    Returns:
        state (dict): Updates documents key with appended web results
    """

    print("---WEB Suche---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    tool = TavilySearchResults()
    docs = tool.invoke({"query": question})
    web_results = "\n".join([d["content"] for d in docs])
    web_results = Document(page_content=web_results)
    documents.append(web_results)

    return {"keys": {"documents": documents, "question": question}}
|


def decide_to_generate(state):
    """
    Determines whether to generate an answer, re-phrase the question for a
    new retrieval attempt, or generate without attached documents.

    Args:
        state (dict): The current state of the agent, including all keys.
    Returns:
        str: Next node to call
    """

    print("---ENTSCHEIDE ZU GENERIEREN---")
    print("current state")
    print(state["keys"])
    print("-------------------------------")
    state_dict = state["keys"]
    question = state_dict["question"]
    filtered_documents = state_dict["documents"]
    search_again = state_dict["search_again"]
    second_trial = state_dict["second_trial"]

    if search_again == "ja":
        if not second_trial == "ja":
            # first failed attempt: re-phrase the question and retrieve again
            print("---ENTSCHEIDUNG: VERÄNDERE DIE FRAGE ---")
            return "transform_query"
        else:
            # the second attempt also failed: generate anyway
            print("---ENTSCHEIDUNG: Generiere ohne Dokumente---")
            return "generate"
    else:
        # enough relevant documents were found: generate the answer
        print("---ENTSCHEIDUNG: GENERIERE---")
        return "generate"
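
# Usage sketch for the whole graph (assumes an OpenAI key in the environment;
# `retriever` must exist at module level for the retrieve node, and rag_chain
# ignores its llm argument):
#
#   splits = document_loading_splitting()
#   vectorstore, retriever = document_storage_chroma(splits)
#   antwort = rag_chain(None, "Was steht im Leitbild der Schule?", vectorstore, retriever)
#   print(antwort)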