import html

import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
# Hugging Face model id: NomBERT checkpoint that converts Han-Nom script
# to Quoc Ngu (romanized Vietnamese).
model_path = 'CjangCjengh/NomBert-hn2qn-v0.1'
# Prefer GPU when available; fall back to CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# trust_remote_code is required: `parse_nom_text` (used below) is supplied by
# the model repo's custom code, not by the generic AutoModel class.
# eval() disables dropout/batch-norm updates for inference.
model = AutoModel.from_pretrained(model_path, torch_dtype='auto', trust_remote_code=True).eval().to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
def _build_candidates_html(probs):
    """Render per-character candidate probabilities as an HTML fragment.

    *probs* is a list of dicts with keys ``'char'`` (the source Han-Nom
    character) and ``'candidates'`` (a list of ``(reading, probability)``
    pairs, probability in [0, 1]) — the per-sentence element produced by
    ``model.parse_nom_text``.  Returns a single HTML string: one card per
    character, the character on top and its candidate readings (with
    percentages) listed underneath.
    """
    parts = ['<div style="display: flex; flex-wrap: wrap; gap: 8px;">']
    for item in probs:
        # Escape user-derived text before embedding it in HTML so the
        # visualization cannot be broken (or scripted) by the input.
        char = html.escape(item['char'])
        candidates = item['candidates']
        parts.append(
            '<div style="border: 1px solid #ccc; border-radius: 4px; '
            'padding: 6px; margin: 2px; text-align: center;">'
        )
        parts.append(f'<div style="font-size: 1.5em;">{char}</div>')
        parts.append('<div style="font-size: 0.85em;">')
        for candidate, prob in candidates:
            prob_percent = prob * 100
            parts.append(f'<div>{html.escape(candidate)}: {prob_percent:.2f}%</div>')
        parts.append('</div></div>')  # close candidate list and card
    parts.append('</div>')  # close flex container
    return ''.join(parts)

def parse_text(input_text):
    """Convert Han-Nom *input_text* to Quoc Ngu.

    Returns a ``(text, html)`` pair: the converted Quoc Ngu string and an
    HTML visualization of each character's candidate readings.
    """
    # inference_mode disables autograd bookkeeping for a faster forward pass.
    with torch.inference_mode():
        # parse_nom_text is batched; we pass a single-element batch and
        # unwrap the first (only) result below.
        output_text, output_probs = model.parse_nom_text(tokenizer, [input_text])
    return output_text[0], _build_candidates_html(output_probs[0])
if __name__ == '__main__':
    # Demo UI layout: input textbox + button and the converted text on the
    # left; a scrollable per-character probability view on the right.
    # The #viz CSS rule caps the visualization height so long inputs scroll.
    with gr.Blocks(css='#viz {height: 500px; overflow-y: scroll;}') as app:
        gr.Markdown('## NomBERT - Hán Nôm to Quốc Ngữ Converter')
        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(label='Input Hán Nôm Text', lines=5, placeholder='Enter Hán Nôm text here...')
                parse_button = gr.Button('Parse')
                output_text = gr.Textbox(label='Output Quốc Ngữ Text', lines=5, interactive=False)
            with gr.Column(scale=2):
                # elem_id='viz' ties this component to the CSS rule above.
                visualization = gr.HTML(label='Candidates Probabilities', elem_id='viz')
        # Wire the button: parse_text returns (text, html) matching outputs.
        parse_button.click(
            fn=parse_text,
            inputs=input_text,
            outputs=[output_text, visualization],
        )
    app.launch()