# Aleksandr Maiorov
# v 0.1
# c517f41
import logging
from fastapi import FastAPI
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer
from llama_index.core import set_global_tokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)
logger.info("Запускаемся... 🥳🥳🥳")
app = FastAPI()
model_url = "https://huggingface.co./Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"
SYSTEM_PROMPT = ''
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
set_global_tokenizer(tokenizer.encode)
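# The global tokenizer registered above lets llama-index count tokens with the
# actual Qwen tokenizer (used, for example, by ChatMemoryBuffer's token_limit)
# instead of its default tokenizer.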
# embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")
#
# documents = SimpleDirectoryReader("./data/").load_data()
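# If the two lines above are enabled, the local embedding model and the loaded
# documents would normally be passed to the index instead of the empty call
# further down, e.g. (illustrative sketch, assuming ./data/ contains files):
# index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)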
def messages_to_prompt(messages):
    # Convert llama-index ChatMessage objects into a prompt string using the
    # tokenizer's chat template.
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
def completion_to_prompt(completion):
    # Wrap a plain completion request in a system + user message pair and render
    # it with the chat template. The fallback system prompt is in Russian:
    # "The answer should be accurate, concise, and humorous."
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT or "Ответ должен быть точным, кратким и с юмором."},
        {"role": "user", "content": completion},
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
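# For reference, Qwen2.5's chat template renders prompts in ChatML style, roughly
# (illustrative only, not produced by this code):
# <|im_start|>system
# ...system prompt...<|im_end|>
# <|im_start|>user
# ...user message...<|im_end|>
# <|im_start|>assistant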
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # Qwen2.5 supports a much larger context window; it is kept small here to
    # limit memory use on constrained hardware
    context_window=2046,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); n_gpu_layers=-1 offloads all layers to the GPU
    # (num_return_sequences / no_repeat_ngram_size are transformers-style options
    # and are likely ignored by llama.cpp)
    model_kwargs={"n_gpu_layers": -1, "num_return_sequences": 1, "no_repeat_ngram_size": 2, "n_threads": 2},
    # transform inputs into the Qwen chat format via the tokenizer's chat template
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
# Note: the memory token_limit (3900) is larger than the LLM context_window (2046)
# configured above, so very long histories may still exceed the model's context.
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)
# The index is built empty here; enable the embed_model / documents lines above to
# actually index local data.
index = VectorStoreIndex.from_documents([])
chat_engine = index.as_chat_engine(
    chat_mode="context",
    memory=memory,
    llm=llm,
    context_prompt=(
        "Вы - чат-бот, способный нормально взаимодействовать.\n"
        "Используйте предыдущую историю чата или приведенный выше контекст, чтобы взаимодействовать с пользователем и помогать ему."
    )
)
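# In "context" chat mode each user message first retrieves nodes from the index;
# the retrieved text is injected into the system prompt together with the chat
# history kept in `memory`.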
def generate_response(completion_response):
    # Helper for the raw llm.complete() path (currently unused below).
    try:
        response_text = completion_response.text
        return response_text.strip() if response_text else "Пустой ответ"
    except Exception as e:
        logger.error(f"Error processing the response: {str(e)}")
        return "Ошибка генерации"
@app.get("/")
def greet_json():
    return {"Hello": "World!"}
@app.put("/system-prompt")
async def set_system_prompt(text: str):
    logger.info('put/system-prompt')
    global SYSTEM_PROMPT
    SYSTEM_PROMPT = text
    # echo the new prompt back to the caller
    return {"system_prompt": SYSTEM_PROMPT}
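# Example request (assuming the app runs locally on port 8000; `text` is a query
# parameter):
#   curl -X PUT "http://localhost:8000/system-prompt?text=Answer%20briefly"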
@app.post("/predict")
async def predict(text: str):
    # Generate a response with the model
    logger.info('post/predict')
    logger.info('REQUEST:')
    logger.info(text)
    # response = llm.complete(text)
    response = chat_engine.chat(text)
    logger.info('RESPONSE:')
    logger.info(response)
    # text_response = generate_response(response)
    # return {"response": text_response}
    # chat() returns an AgentChatResponse; return its text rather than the object itself
    return {"response": str(response)}