Jordan Myers
committed on
Commit
·
0101c12
1
Parent(s):
5d5e348
more updates
Browse files
- .gitignore +2 -0
- app.py +34 -73
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
.venv
|
2 |
+
__pycache__
|
app.py
CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
|
|
2 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
3 |
import torch
|
4 |
|
5 |
-
# this model was loaded from https://hf.co/models
|
6 |
model = AutoModelForSeq2SeqLM.from_pretrained("Jayyydyyy/m2m100_418m_tokipona")
|
7 |
tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
|
8 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
@@ -38,62 +37,11 @@ def translate(text, src_lang, tgt_lang, candidates:int):
|
|
38 |
outs = model.generate(**{**ins, **gen_args})
|
39 |
output = tokenizer.batch_decode(outs.sequences, skip_special_tokens=True)
|
40 |
|
41 |
-
return output
|
42 |
-
|
43 |
-
# app = gr.Interface(
|
44 |
-
# fn=translate,
|
45 |
-
# inputs=[
|
46 |
-
# gr.components.Textbox(label="Text"),
|
47 |
-
# gr.components.Dropdown(label="Source Language", choices=list(LANG_CODES.keys())),
|
48 |
-
# gr.components.Dropdown(label="Target Language", choices=list(LANG_CODES.keys())),
|
49 |
-
# gr.Slider(label="Number of return sequences", value=3, minimum=1, maximum=12, step=1)
|
50 |
-
# ],
|
51 |
-
# outputs=["text"],
|
52 |
-
# examples=[
|
53 |
-
# ["Welcome to my translation app.", "English", "toki pona", 3],
|
54 |
-
# ["Its not always perfect, but its pretty okay!", "English", "toki pona", 3],
|
55 |
-
# ["ilo pi ante toki ni li pona a!", "toki pona", "English", 3],
|
56 |
-
# ["kijetesantakalu li pona", "toki pona", "English", 3],
|
57 |
-
# ["mi li toki e toki pona", "toki pona", "toki pona", 3]
|
58 |
-
# ],
|
59 |
-
# cache_examples=False,
|
60 |
-
# article="""
|
61 |
-
# # A simple English / toki pona Neural Machine Translation App!
|
62 |
-
|
63 |
-
# ### toki a! 💬
|
64 |
-
|
65 |
-
# This is a simple english to toki pona / toki pona to english neural machine translation app.
|
66 |
-
|
67 |
-
# Input your text to translate, a source language and target language, and desired number of return sequences!
|
68 |
-
|
69 |
-
# ### Grammaticality / Regularization
|
70 |
-
# English -> English and/or toki pona -> toki pona will result in some form of regularization.
|
71 |
-
|
72 |
-
# This can approximate grammaticality, but it isn't always the best.
|
73 |
-
|
74 |
-
# For example, "mi li toki e toki pona" [src: toki pona, tgt: toki pona] will result in ['mi toki e toki pona.', 'mi toki pona.', 'mi toki e toki pona']
|
75 |
-
# (Thus, the ungrammatical "li" is dropped)
|
76 |
-
|
77 |
-
# ### Model and Data
|
78 |
-
# This app utilizes a fine-tuned version of Facebook/Meta AI's M2M100 418M param model.
|
79 |
-
|
80 |
-
# By leveraging the pretrained weights of the massively multilingual M2M100 model,
|
81 |
-
# we can jumpstart our transfer learning to accomplish machine translation for toki pona!
|
82 |
-
|
83 |
-
# The model was fine-tuned on the English/toki pona bitexts found at https://tatoeba.org/
|
84 |
-
|
85 |
-
# ### This app is a work in progress and obviously not all translations will be perfect.
|
86 |
-
# In addition to parameter quantity and the hyper-parameters used while training,
|
87 |
-
# the *quality of data* found on Tatoeba directly influences the perfomance of projects like this!
|
88 |
-
|
89 |
-
# If you wish to contribute, please simply add high quality and diverse translations to Tatoeba!
|
90 |
-
# """,
|
91 |
-
# title="English / toki pona Translation"
|
92 |
-
# )
|
93 |
|
94 |
with gr.Blocks() as app:
|
95 |
-
|
96 |
-
# A
|
97 |
|
98 |
### toki a! 💬
|
99 |
|
@@ -101,13 +49,15 @@ with gr.Blocks() as app:
|
|
101 |
|
102 |
Input your text to translate, a source language and target language, and desired number of return sequences!
|
103 |
|
104 |
-
###
|
105 |
-
|
|
|
106 |
|
107 |
-
|
108 |
|
109 |
-
For example, "mi li toki e toki pona"
|
110 |
-
|
|
|
111 |
|
112 |
### Model and Data
|
113 |
This app utilizes a fine-tuned version of Facebook/Meta AI's M2M100 418M param model.
|
@@ -121,19 +71,30 @@ with gr.Blocks() as app:
|
|
121 |
In addition to parameter quantity and the hyper-parameters used while training,
|
122 |
the *quality of data* found on Tatoeba directly influences the perfomance of projects like this!
|
123 |
|
124 |
-
If you wish to contribute, please
|
125 |
"""
|
126 |
-
|
127 |
-
|
128 |
-
gr.
|
129 |
-
gr.
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
app.launch()
|
|
|
2 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
3 |
import torch
|
4 |
|
|
|
5 |
model = AutoModelForSeq2SeqLM.from_pretrained("Jayyydyyy/m2m100_418m_tokipona")
|
6 |
tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
|
7 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
|
37 |
outs = model.generate(**{**ins, **gen_args})
|
38 |
output = tokenizer.batch_decode(outs.sequences, skip_special_tokens=True)
|
39 |
|
40 |
+
return '\n'.join(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
with gr.Blocks() as app:
|
43 |
+
markdown="""
|
44 |
+
# A Simple English / toki pona Neural Machine Translation App!
|
45 |
|
46 |
### toki a! 💬
|
47 |
|
|
|
49 |
|
50 |
Input your text to translate, a source language and target language, and desired number of return sequences!
|
51 |
|
52 |
+
### Grammar Regularization
|
53 |
+
An interesting quirk of training a many-to-many translation model is that pseudo-grammar correction
|
54 |
+
can be achieved by translating *from* **language A** *to* **language A**
|
55 |
|
56 |
+
Remember, this can ***approximate*** grammaticality, but it isn't always the best.
|
57 |
|
58 |
+
For example, "mi li toki e toki pona" (Source Language: toki pona & Target Language: toki pona) will result in:
|
59 |
+
- ['mi toki e toki pona.', 'mi toki pona.', 'mi toki e toki pona']
|
60 |
+
- (Thus, the ungrammatical "li" is dropped)
|
61 |
|
62 |
### Model and Data
|
63 |
This app utilizes a fine-tuned version of Facebook/Meta AI's M2M100 418M param model.
|
|
|
71 |
In addition to parameter quantity and the hyper-parameters used while training,
|
72 |
the *quality of data* found on Tatoeba directly influences the perfomance of projects like this!
|
73 |
|
74 |
+
If you wish to contribute, please add high quality and diverse translations to Tatoeba!
|
75 |
"""
|
76 |
+
|
77 |
+
with gr.Row():
|
78 |
+
gr.Markdown(markdown)
|
79 |
+
with gr.Column():
|
80 |
+
input_text = gr.components.Textbox(label="Input Text", value="Raccoons are fascinating creatures, but I prefer opossums.")
|
81 |
+
source_lang = gr.components.Dropdown(label="Source Language", value="English", choices=list(LANG_CODES.keys()))
|
82 |
+
target_lang = gr.components.Dropdown(label="Target Language", value="toki pona", choices=list(LANG_CODES.keys()))
|
83 |
+
return_seqs = gr.Slider(label="Number of return sequences", value=3, minimum=1, maximum=12, step=1)
|
84 |
+
|
85 |
+
inputs=[input_text, source_lang, target_lang, return_seqs]
|
86 |
+
outputs = gr.Textbox()
|
87 |
+
|
88 |
+
translate_btn = gr.Button("Translate! | o ante toki!")
|
89 |
+
translate_btn.click(translate, inputs=inputs, outputs=outputs)
|
90 |
+
|
91 |
+
gr.Examples(
|
92 |
+
[
|
93 |
+
["Hello! How are you?", "English", "toki pona", 3],
|
94 |
+
["toki a! ilo pi ante toki ni li pona!", "toki pona", "English", 3],
|
95 |
+
["mi toki e toki pona", "toki pona", "toki pona", 3],
|
96 |
+
],
|
97 |
+
inputs=inputs
|
98 |
+
)
|
99 |
|
100 |
app.launch()
|