import gc
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

def clear_memory():
    """Free Python garbage and any cached CUDA memory between generations."""
    gc.collect()
    torch.cuda.empty_cache()  # safe no-op when CUDA is unavailable

model_name = "GIGAParviz/Firooze_test"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, low_cpu_mem_usage=True, device_map="cpu"
)
model = model.to("cpu")  # redundant with device_map="cpu", but harmless
# Gradient checkpointing only matters for training; it has no effect on
# inference-only generation but is kept from the original code.
model.gradient_checkpointing_enable()

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=128)

def generate_response(prompt):
    clear_memory()
    # Wrap the user prompt in the Alpaca-style instruction template the model expects.
    instruction = f"### Instruction:\n{prompt}\n\n### Response:\n"
    result = pipe(instruction)
    # Strip the echoed prompt so only the model's completion is returned.
    return result[0]["generated_text"][len(instruction):]

with gr.Blocks() as demo:
    # The source file is truncated mid-way through this gr.Markdown() call;
    # the heading below is an assumed placeholder, not the original string.
    gr.Markdown("# Firooze_test demo")
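    # Minimal sketch of the rest of the interface, assuming the single
    # prompt-in / response-out layout implied by generate_response().
    # Component labels and the launch() call are assumptions, since the
    # original file is cut off at this point.
    prompt_box = gr.Textbox(label="Prompt", lines=3)
    generate_btn = gr.Button("Generate")
    response_box = gr.Textbox(label="Response")
    generate_btn.click(fn=generate_response, inputs=prompt_box, outputs=response_box)

demo.launch()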