import logging

from fastapi import FastAPI
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, set_global_tokenizer
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)

logger.info("Starting up... 🥳🥳🥳")

app = FastAPI()

model_url = "https://huggingface.co./Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"

SYSTEM_PROMPT = ''

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
set_global_tokenizer(tokenizer.encode)
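# The global tokenizer above is what LlamaIndex uses to count tokens
# (e.g. for the ChatMemoryBuffer token limit below).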

# Local embedding model and source documents for the vector index;
# embedding locally avoids the default fallback to OpenAI embeddings.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")
documents = SimpleDirectoryReader("./data/").load_data()
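
# Convert LlamaIndex chat messages into a single prompt string using Qwen's chat template.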
def messages_to_prompt(messages):
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
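
# Wrap a bare completion string in the chat template, injecting SYSTEM_PROMPT
# (or a default prompt when it is empty).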
def completion_to_prompt(completion):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT or "The answer should be accurate, concise, and humorous."},
        {"role": "user", "content": completion},
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
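
# llama.cpp-backed LLM: the GGUF model is downloaded from model_url and cached on first start.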
llm = LlamaCPP(
    # you can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # the model supports a much larger context window, but we keep it small to save memory
    context_window=2046,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set n_gpu_layers to at least 1 to use the GPU (-1 offloads all layers)
    model_kwargs={"n_gpu_layers": -1, "num_return_sequences": 1, "no_repeat_ngram_size": 2, "n_threads": 2},
    # transform inputs into the model's chat format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
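
# Conversation memory: roughly the last 3900 tokens of chat history are kept,
# counted with the global tokenizer set above.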
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

# Build the index over the loaded documents, embedding them locally.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

chat_engine = index.as_chat_engine(
    chat_mode="context",
    memory=memory,
    llm=llm,
    context_prompt=(
        "You are a chatbot able to interact normally.\n"
        "Use the previous chat history, or the context above, to interact with and help the user."
    ),
)
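
# Pull plain text out of a raw completion response; used with the llm.complete()
# path that is currently commented out in predict().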
def generate_response(completion_response):
    try:
        response_text = completion_response.text
        return response_text.strip() if response_text else "Empty response"
    except Exception as e:
        logger.error(f"Error processing the response: {str(e)}")
        return "Generation error"


# NOTE: the route decorators are an assumption; the paths follow the handler
# names and the log messages inside the handlers.
@app.get("/")
def greet_json():
    return {"Hello": "World!"}


@app.post("/system-prompt")
async def set_system_prompt(text: str):
    logger.info('post/system-prompt')
    # global SYSTEM_PROMPT
    # SYSTEM_PROMPT = text


@app.post("/predict")
async def predict(text: str):
    # Generate a response with the model
    logger.info('post/predict')
    logger.info('REQUEST:')
    logger.info(text)
    # response = llm.complete(text)
    response = chat_engine.chat(text)
    logger.info('RESPONSE:')
    logger.info(response)
    # text_response = generate_response(response)
    # return {"response": text_response}
    # return the response text rather than the raw response object so FastAPI can serialize it
    return {"response": str(response)}