Spaces:

emircanerol
/

diacritizeTR

Sleeping

App Files Files Community

emircanerol committed on May 10

Commit

a17b609

•

1 Parent(s): 1ca08a7

Add application file

Browse files

Files changed (1) hide show

app.py +62 -0

app.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import gradio as gr
+import torch
+from peft import PeftModel, PeftConfig
+from transformers import AutoModelForTokenClassification
+def test_mask(model, sample):
+    """
+    Masks the padded tokens in the input.
+    Args:
+        data (list): List of strings.
+    Returns:
+        dataset (list): List of dictionaries.
+    """
+    tokens = dict()
+    input_tokens = [i + 3 for i in sample.encode('utf-8')]
+    input_tokens.append(0) # eos token
+    tokens['input_ids'] = torch.tensor([input_tokens], dtype=torch.int64, device=model.device)
+    # Create attention mask
+    tokens['attention_mask'] = torch.ones_like(tokens['input_ids'], dtype=torch.int64, device=model.device)
+    return tokens
+def rewrite(model, data):
+    """
+    Rewrites the input text with the model.
+    Args:
+        model (torch.nn.Module): Model.
+        data (dict): Dictionary containing 'input_ids' and 'attention_mask'.
+    Returns:
+        output (str): Rewritten text.
+    """
+    with torch.no_grad():
+        pred = torch.argmax(model(**data).logits, dim=2).squeeze(0)
+    output = list() # save the indices of the characters as list of integers
+    # Conversion table for Turkish characters {100: [300, 350], ...}
+    en2tr = {en: tr for tr, en in zip(list(map(list, map(str.encode, list('ÜİĞŞÇÖüığşçö')))), list(map(ord, list('UIGSCOuigsco'))))}
+    for inp, lab in zip((data['input_ids'].squeeze(0) - 3).tolist(), pred.tolist()):
+        if lab and inp in en2tr:
+            # if the model predicts a diacritic, replace it with the corresponding Turkish character
+            output.extend(en2tr[inp])
+        elif inp >= 0: output.append(inp)
+    return bytes(output).decode()
+def try_it(text):
+    sample = test_mask(model, text)
+    return rewrite(model, sample)
+if __name__ == '__main__':
+    config = PeftConfig.from_pretrained("bite-the-byte/byt5-small-deASCIIfy-TR")
+    model = AutoModelForTokenClassification.from_pretrained("google/byt5-small")
+    model = PeftModel.from_pretrained(model, "bite-the-byte/byt5-small-deASCIIfy-TR")
+    diacritize_app = gr.Interface(fn=try_it, inputs="text", outputs="text")
+    diacritize_app.launch(share=True)