import spaces
import gradio as gr

import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

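# Load the tokenizer and the LoRA fine-tuned Gemma-2B Hinglish checkpoint from the Hugging Face Hub
# (the adapter appears to be merged into the checkpoint, since it loads directly with AutoModelForCausalLM).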
tokenizer = AutoTokenizer.from_pretrained("kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0")
model = AutoModelForCausalLM.from_pretrained("kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0")

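# Run inference on the first GPU when one is available; otherwise fall back to CPU.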
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

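# Alpaca-style prompt template with instruction, input and response slots;
# the response slot is left empty so the model completes it at inference time.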
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


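# On ZeroGPU Spaces, @spaces.GPU allocates a GPU for the duration of each call
# (it should be a harmless no-op when the Space already runs on dedicated hardware).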
@spaces.GPU
def get_response(input_text: str) -> str:
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                "Please answer the following sentence as requested",
                input_text,
                "",
            )
        ],
        return_tensors="pt",
    ).to(device)

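    # Generate up to 256 new tokens (use_cache=True enables KV caching) and decode back to text.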
    outputs = model.generate(**inputs, max_new_tokens=256, use_cache=True)
    output = tokenizer.batch_decode(outputs)[0]

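    # The decoded text still contains the prompt, so keep only what the model wrote
    # between "### Response:" and the trailing <eos> token.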
    response_pattern = re.compile(r"### Response:\n(.*?)<eos>", re.DOTALL)
    response_match = response_pattern.search(output)

    if response_match:
        response = response_match.group(1).strip()
        return response
    else:
        return "Response not found"


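# Gradio UI: one input textbox (pre-filled with a Hinglish example) wired to get_response,
# with the extracted answer shown in a single output textbox.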
interface = gr.Interface(
    fn=get_response,
    inputs=[
        gr.Textbox(
            label="Enter your input text here",
            value="Germany ka capital city kya hai?",
            placeholder="Input to LLM",
            lines=5,
        )
    ],
    outputs=[gr.Textbox(label="LLM Output", lines=5)],
    title="Gemma Hinglish Model Inference",
    description="🤗 + 🦥 = 🔥 This model is based on google/gemma-2b and has been LoRA fine-tuned on English & Hindi language instruction datasets",
)
interface.launch()