# findemo3.6 / app.py
from threading import Thread
from transformers import TextStreamer, TextIteratorStreamer
from unsloth import FastLanguageModel
import torch
import gradio as gr
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
model_name = "Danielrahmai1991/llama32_ganjoor_adapt_basic_model_16bit_v1"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    trust_remote_code = True,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
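# Switch the model into Unsloth's inference mode (Unsloth advertises a faster
# native generation path for models prepared this way).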
FastLanguageModel.for_inference(model)
print("model loaded")
import re
from deep_translator import (
    GoogleTranslator,
    PonsTranslator,
    LingueeTranslator,
    MyMemoryTranslator,
    YandexTranslator,
    DeeplTranslator,
    QcriTranslator,
    single_detection,
    batch_detection,
)
from pyaspeller import YandexSpeller
def error_correct_pyspeller(sample_text):
    """Grammar/spelling correction of the input text."""
    speller = YandexSpeller()
    fixed = speller.spelled(sample_text)
    return fixed
def postprocessing(inp_text: str):
    """Post-processing of the LLM response."""
    inp_text = re.sub('<[^>]+>', '', inp_text)  # strip any HTML/XML-style tags
    inp_text = inp_text.split('##', 1)[0]       # keep only the text before a '##' marker
    inp_text = error_correct_pyspeller(inp_text)
    return inp_text
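# llm_run: detect the prompt language, spell-correct English prompts, translate
# the prompt to English, stream the model's response, and translate each partial
# output back to the detected language before yielding it to the UI.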
def llm_run(prompt, max_length, top_p, temperature, top_k, messages):
    print("prompt, max_length, top_p, temperature, top_k, messages",
          prompt, max_length, top_p, temperature, top_k, messages)
    lang = single_detection(prompt, api_key='4ab77f25578d450f0902fb42c66d5e11')
    if lang == 'en':
        prompt = error_correct_pyspeller(prompt)
    en_translated = GoogleTranslator(source='auto', target='en').translate(prompt)
    messages.append({"role": "user", "content": en_translated})
    # messages.append({"role": "user", "content": prompt})
    print("messages", messages)
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True,
        return_tensors = "pt",
    )
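    # TextIteratorStreamer yields decoded text chunks as generate() produces them,
    # so partial output can be shown while generation is still running.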
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        max_length=int(max_length), top_p=float(top_p), do_sample=True,
        top_k=int(top_k), streamer=streamer, temperature=float(temperature),
        repetition_penalty=1.2,
    )
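    # Run generation on a worker thread; the loop below consumes the streamer and
    # yields the translated partial text back to Gradio.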
    t = Thread(target=model.generate, args=(input_ids,), kwargs=generate_kwargs)
    t.start()
    generated_text = []
    for text in streamer:
        generated_text.append(text)
        print('generated_text: ', generated_text)
        # yield "".join(generated_text)
        yield GoogleTranslator(source='auto', target=lang).translate("".join(generated_text))
    messages.append({"role": "assistant", "content": "".join(generated_text)})
def clear_memory(messages):
    messages.clear()
    return "Memory cleaned."
with gr.Blocks(theme=gr.themes.Default(primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.pink)) as demo:
    stored_message = gr.State([])
    with gr.Row():
        with gr.Column(scale=2):
            prompt_text = gr.Textbox(lines=7, label="Prompt", scale=2)
            with gr.Row():
                btn1 = gr.Button("Submit", scale=1)
                btn2 = gr.Button("Clear", scale=1)
                btn3 = gr.Button("Clean Memory", scale=2)
        with gr.Column(scale=2):
            out_text = gr.Text(lines=15, label="Output", scale=2)
    btn1.click(
        fn=llm_run,
        inputs=[
            prompt_text,
            gr.Textbox(label="Max-Length generation", value=500),
            gr.Slider(0.0, 1.0, label="Top-P value", value=0.90),
            gr.Slider(0.0, 1.0, label="Temperature value", value=0.65),
            gr.Textbox(label="Top-K", value=50),
            stored_message,
        ],
        outputs=out_text,
    )
    btn2.click(lambda: [None, None], outputs=[prompt_text, out_text])
    btn3.click(fn=clear_memory, inputs=[stored_message], outputs=[out_text])
# demo = gr.Interface(fn=llm_run, inputs=["text"], outputs="text")
demo.launch(debug=True, share=True)