# Gradio demo: Turkish diacritic restoration (deASCIIfication) with ByT5 + LoRA.
import gradio as gr
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForTokenClassification
def test_mask(model, sample): | |
""" | |
Masks the padded tokens in the input. | |
Args: | |
data (list): List of strings. | |
Returns: | |
dataset (list): List of dictionaries. | |
""" | |
tokens = dict() | |
input_tokens = [i + 3 for i in sample.encode('utf-8')] | |
input_tokens.append(0) # eos token | |
tokens['input_ids'] = torch.tensor([input_tokens], dtype=torch.int64, device=model.device) | |
# Create attention mask | |
tokens['attention_mask'] = torch.ones_like(tokens['input_ids'], dtype=torch.int64, device=model.device) | |
return tokens | |
def rewrite(model, data):
    """
    Restore Turkish diacritics in the input with the token-classification model.

    For every byte position the model predicts whether the character should
    carry a diacritic; flagged ASCII letters are replaced by their Turkish
    counterparts (U->Ü, I->İ, G->Ğ, S->Ş, C->Ç, O->Ö and lowercase).

    Args:
        model (torch.nn.Module): Token-classification model.
        data (dict): Dictionary containing 'input_ids' and 'attention_mask'.

    Returns:
        str: Rewritten text.
    """
    with torch.no_grad():
        pred = torch.argmax(model(**data).logits, dim=2).squeeze(0)
    # Map ASCII code point -> UTF-8 bytes of the corresponding Turkish character,
    # e.g. {85: [195, 156], ...} for 'U' -> 'Ü'.
    en2tr = {ord(en): list(tr.encode('utf-8'))
             for en, tr in zip('UIGSCOuigsco', 'ÜİĞŞÇÖüığşçö')}
    output = []  # UTF-8 bytes of the rewritten text
    # Undo the +3 byte shift applied in test_mask; the eos token (id 0) maps to -3.
    for inp, lab in zip((data['input_ids'].squeeze(0) - 3).tolist(), pred.tolist()):
        if lab and inp in en2tr:
            # Model predicts a diacritic: substitute the Turkish character's bytes.
            output.extend(en2tr[inp])
        elif inp >= 0:
            # Keep the original byte; negative values (special tokens) are dropped.
            output.append(inp)
    return bytes(output).decode('utf-8')
def try_it(text):
    """Gradio callback: diacritize *text* with the globally loaded model."""
    return rewrite(model, test_mask(model, text))
if __name__ == '__main__':
    # Load the ByT5 backbone and apply the deASCIIfy LoRA adapter on top.
    # (PeftModel.from_pretrained loads the adapter config itself, so the
    # previously separate PeftConfig.from_pretrained call was unused.)
    model = AutoModelForTokenClassification.from_pretrained("google/byt5-small")
    model = PeftModel.from_pretrained(model, "bite-the-byte/byt5-small-deASCIIfy-TR")
    # Plain text-in / text-out demo UI; share=True exposes a public link.
    diacritize_app = gr.Interface(fn=try_it, inputs="text", outputs="text")
    diacritize_app.launch(share=True)