tiedeman committed

Commit 9abc071
1 Parent(s): 78b2587

Initial commit
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,367 @@
+ ---
+ library_name: transformers
+ language:
+ - aai
+ - ace
+ - agn
+ - aia
+ - akl
+ - alj
+ - alp
+ - amk
+ - aoz
+ - apr
+ - atq
+ - aui
+ - ban
+ - bcl
+ - bep
+ - bhz
+ - bik
+ - bku
+ - blz
+ - bmk
+ - bnp
+ - bpr
+ - bps
+ - btd
+ - bth
+ - bto
+ - bts
+ - btx
+ - bug
+ - buk
+ - bvy
+ - bzh
+ - ceb
+ - cgc
+ - ch
+ - dad
+ - de
+ - dob
+ - dtp
+ - dww
+ - emi
+ - en
+ - es
+ - far
+ - fil
+ - fj
+ - fr
+ - frd
+ - gfk
+ - gil
+ - gor
+ - haw
+ - hil
+ - hla
+ - hnn
+ - hot
+ - hvn
+ - iba
+ - id
+ - ifa
+ - ifb
+ - ifk
+ - ifu
+ - ify
+ - ilo
+ - iry
+ - itv
+ - jv
+ - jvn
+ - kbm
+ - khz
+ - kje
+ - kne
+ - kpg
+ - kqe
+ - kqf
+ - kqw
+ - krj
+ - kud
+ - kwf
+ - kzf
+ - laa
+ - law
+ - lcm
+ - leu
+ - lew
+ - lex
+ - lid
+ - ljp
+ - lnd
+ - mad
+ - mak
+ - mbb
+ - mbf
+ - mbt
+ - mee
+ - mek
+ - mg
+ - mgm
+ - mh
+ - mhy
+ - mi
+ - mmo
+ - mmx
+ - mna
+ - mnb
+ - mog
+ - mox
+ - mpx
+ - mqj
+ - mrw
+ - ms
+ - msm
+ - mta
+ - mva
+ - mvp
+ - mvv
+ - mwc
+ - mwv
+ - myw
+ - mzz
+ - na
+ - nak
+ - nia
+ - nij
+ - niu
+ - npy
+ - nsn
+ - nss
+ - nwi
+ - obo
+ - pag
+ - pam
+ - pau
+ - plw
+ - pmf
+ - pmy
+ - pne
+ - ppk
+ - prf
+ - pt
+ - ptp
+ - ptu
+ - pwg
+ - rai
+ - rap
+ - rej
+ - rro
+ - rug
+ - sas
+ - sbl
+ - sda
+ - sgb
+ - sgz
+ - sm
+ - smk
+ - sml
+ - snc
+ - sps
+ - stn
+ - su
+ - swp
+ - sxn
+ - tbc
+ - tbl
+ - tbo
+ - tet
+ - tgo
+ - tgp
+ - tkl
+ - tl
+ - tlx
+ - to
+ - tpa
+ - tpz
+ - tte
+ - tuc
+ - tvl
+ - twb
+ - twu
+ - txa
+ - ty
+ - ubr
+ - uvl
+ - viv
+ - war
+ - wed
+ - wuv
+ - xsb
+ - xsi
+ - yml
+
+ tags:
+ - translation
+ - opus-mt-tc-bible
+
+ license: apache-2.0
+ model-index:
+ - name: opus-mt-tc-bible-big-poz-deu_eng_fra_por_spa
+   results:
+   - task:
+       name: Translation multi-multi
+       type: translation
+       args: multi-multi
+     dataset:
+       name: tatoeba-test-v2020-07-28-v2023-09-26
+       type: tatoeba_mt
+       args: multi-multi
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 30.8
+     - name: chr-F
+       type: chrf
+       value: 0.48052
+ ---
+ # opus-mt-tc-bible-big-poz-deu_eng_fra_por_spa
+
+ ## Table of Contents
+ - [Model Details](#model-details)
+ - [Uses](#uses)
+ - [Risks, Limitations and Biases](#risks-limitations-and-biases)
+ - [How to Get Started With the Model](#how-to-get-started-with-the-model)
+ - [Training](#training)
+ - [Evaluation](#evaluation)
+ - [Citation Information](#citation-information)
+ - [Acknowledgements](#acknowledgements)
+
+ ## Model Details
+
+ Neural machine translation model for translating from Malayo-Polynesian languages (poz) to German, English, French, Portuguese and Spanish (deu+eng+fra+por+spa).
+
+ This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages of the world. All models were originally trained with [Marian NMT](https://marian-nmt.github.io/), an efficient NMT implementation written in pure C++, and have been converted to PyTorch using the transformers library by Hugging Face. Training data is taken from [OPUS](https://opus.nlpl.eu/) and training pipelines follow the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/Opus-MT-train).
+
+ **Model Description:**
+ - **Developed by:** Language Technology Research Group at the University of Helsinki
+ - **Model Type:** Translation (transformer-big)
+ - **Release**: 2024-05-30
+ - **License:** Apache-2.0
+ - **Language(s):**
+   - Source Language(s): aai ace agn aia akl alj alp amk aoz apr atq aui ban bcl bep bhz bik bku blz bmk bnp bpr bps btd bth bto bts btx bug buk bvy bzh ceb cgc cha dad dob dtp dww emi far fij fil frd gfk gil gor haw hil hla hnn hot hvn iba ifa ifb ifk ifu ify ilo ind iry itv jak jav jvn kbm khz kje kne kpg kqe kqf kqw krj kud kwf kzf laa law lcm leu lew lex lid ljp lnd mad mah mak max mbb mbf mbt mee mek mgm mhy mlg mmo mmx mna mnb mog mox mpx mqj mri mrw msa msm mta mva mvp mvv mwc mwv myw mzz nak nau nia nij niu npy nsn nss nwi obo pag pam pau plt plw pmf pmy pne ppk prf ptp ptu pwg rai rap rej rro rug sas sbl sda sgb sgz smk sml smo snc sps stn sun swp sxn tah tbc tbl tbo tet tgl tgo tgp tkl tlx tmw ton tpa tpz tte tuc tvl twb twu txa ubr uvl viv war wed wuv xsb xsi yml zlm zsm
+   - Target Language(s): deu eng fra por spa
+   - Valid Target Language Labels: >>deu<< >>eng<< >>fra<< >>por<< >>spa<< >>xxx<<
+ - **Original Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/poz-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip)
+ - **Resources for more information:**
+   - [OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/poz-deu%2Beng%2Bfra%2Bpor%2Bspa/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-05-30)
+   - [OPUS-MT-train GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
+   - [More information about MarianNMT models in the transformers library](https://huggingface.co/docs/transformers/model_doc/marian)
+   - [Tatoeba Translation Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge/)
+   - [HPLT bilingual data v1 (as part of the Tatoeba Translation Challenge dataset)](https://hplt-project.org/datasets/v1)
+   - [A massively parallel Bible corpus](https://aclanthology.org/L14-1215/)
+
+ This is a multilingual translation model with multiple target languages. A sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID), e.g. `>>deu<<`; see the examples under [How to Get Started With the Model](#how-to-get-started-with-the-model).
+
+ ## Uses
+
+ This model can be used for translation and text-to-text generation.
+
+ ## Risks, Limitations and Biases
+
+ **CONTENT WARNING: Readers should be aware that the model is trained on various public data sets that may contain content that is disturbing, offensive, and can propagate historical and current stereotypes.**
+
+ Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)).
+
+ ## How to Get Started With the Model
+
+ A short code example:
+
+ ```python
+ from transformers import MarianMTModel, MarianTokenizer
+
+ src_text = [
+     ">>deu<< Replace this with text in an accepted source language.",
+     ">>spa<< This is the second sentence."
+ ]
+
+ # the converted checkpoint is published under the Helsinki-NLP organization
+ model_name = "Helsinki-NLP/opus-mt-tc-bible-big-poz-deu_eng_fra_por_spa"
+ tokenizer = MarianTokenizer.from_pretrained(model_name)
+ model = MarianMTModel.from_pretrained(model_name)
+ translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
+
+ for t in translated:
+     print(tokenizer.decode(t, skip_special_tokens=True))
+ ```
+
+ You can also use OPUS-MT models with the transformers pipelines, for example:
+
+ ```python
+ from transformers import pipeline
+ pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-bible-big-poz-deu_eng_fra_por_spa")
+ print(pipe(">>deu<< Replace this with text in an accepted source language."))
+ ```
+
+ ## Training
+
+ - **Data**: opusTCv20230926max50+bt+jhubc ([source](https://github.com/Helsinki-NLP/Tatoeba-Challenge))
+ - **Pre-processing**: SentencePiece (spm32k,spm32k); see the sketch after this list
+ - **Model Type:** transformer-big
+ - **Original MarianNMT Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/poz-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip)
+ - **Training Scripts**: [GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
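+
+ The `source.spm` and `target.spm` files in this repository are the trained SentencePiece models. A minimal sketch for inspecting how the source-side model segments text, assuming the `sentencepiece` package is installed and `source.spm` has been downloaded locally (both the file path and the sample sentence are illustrative):
+
+ ```python
+ import sentencepiece as spm
+
+ # load the source-side SentencePiece model shipped with this repository
+ sp = spm.SentencePieceProcessor(model_file="source.spm")
+
+ print(sp.get_piece_size())                       # subword vocabulary size
+ print(sp.encode("Selamat pagi!", out_type=str))  # subword pieces for a sample sentence
+ ```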
+
+ ## Evaluation
+
+ * [Model scores at the OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/poz-deu%2Beng%2Bfra%2Bpor%2Bspa/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-05-30)
+ * test set translations: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/poz-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.test.txt)
+ * test set scores: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/poz-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.eval.txt)
+ * benchmark results: [benchmark_results.txt](benchmark_results.txt)
+ * benchmark output: [benchmark_translations.zip](benchmark_translations.zip)
+
+ | langpair | testset | chr-F | BLEU | #sent | #words |
+ |----------|---------|-------|------|-------|--------|
+ | multi-multi | tatoeba-test-v2020-07-28-v2023-09-26 | 0.48052 | 30.8 | 10000 | 76785 |
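+
+ The scores above can be recomputed from the released test-set translations with [sacrebleu](https://github.com/mjpost/sacrebleu). A minimal sketch, assuming `hyps.txt` and `refs.txt` (hypothetical file names) hold the system output and the reference translations one segment per line:
+
+ ```python
+ import sacrebleu
+
+ with open("hyps.txt") as f:
+     hyps = [line.rstrip("\n") for line in f]
+ with open("refs.txt") as f:
+     refs = [line.rstrip("\n") for line in f]
+
+ print(sacrebleu.corpus_bleu(hyps, [refs]).score)  # BLEU, cf. 30.8 above
+ # sacrebleu reports chr-F on a 0-100 scale, so 0.48052 corresponds to ~48.05
+ print(sacrebleu.corpus_chrf(hyps, [refs]).score)
+ ```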
+
+ ## Citation Information
+
+ * Publications: [Democratizing neural machine translation with OPUS-MT](https://doi.org/10.1007/s10579-023-09704-w) and [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) and [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/) (please cite if you use this model)
+
+ ```bibtex
+ @article{tiedemann2023democratizing,
+   title={Democratizing neural machine translation with {OPUS-MT}},
+   author={Tiedemann, J{\"o}rg and Aulamo, Mikko and Bakshandaeva, Daria and Boggia, Michele and Gr{\"o}nroos, Stig-Arne and Nieminen, Tommi and Raganato, Alessandro and Scherrer, Yves and Vazquez, Raul and Virpioja, Sami},
+   journal={Language Resources and Evaluation},
+   number={58},
+   pages={713--755},
+   year={2023},
+   publisher={Springer Nature},
+   issn={1574-0218},
+   doi={10.1007/s10579-023-09704-w}
+ }
+
+ @inproceedings{tiedemann-thottingal-2020-opus,
+   title = "{OPUS}-{MT} {--} Building open translation services for the World",
+   author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh},
+   booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
+   month = nov,
+   year = "2020",
+   address = "Lisboa, Portugal",
+   publisher = "European Association for Machine Translation",
+   url = "https://aclanthology.org/2020.eamt-1.61",
+   pages = "479--480",
+ }
+
+ @inproceedings{tiedemann-2020-tatoeba,
+   title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
+   author = {Tiedemann, J{\"o}rg},
+   booktitle = "Proceedings of the Fifth Conference on Machine Translation",
+   month = nov,
+   year = "2020",
+   address = "Online",
+   publisher = "Association for Computational Linguistics",
+   url = "https://aclanthology.org/2020.wmt-1.139",
+   pages = "1174--1182",
+ }
+ ```
+
+ ## Acknowledgements
+
+ The work is supported by the [HPLT project](https://hplt-project.org/), funded by the European Union’s Horizon Europe research and innovation programme under grant agreement No 101070350. We are also grateful for the generous computational resources and IT infrastructure provided by [CSC -- IT Center for Science](https://www.csc.fi/), Finland, and the [EuroHPC supercomputer LUMI](https://www.lumi-supercomputer.eu/).
+
+ ## Model conversion info
+
+ * transformers version: 4.45.1
+ * OPUS-MT git hash: 0882077
+ * port time: Tue Oct 8 12:53:48 EEST 2024
+ * port machine: LM0-400-22516.local
benchmark_results.txt ADDED
@@ -0,0 +1 @@
+ multi-multi tatoeba-test-v2020-07-28-v2023-09-26 0.48052 30.8 10000 76785
benchmark_translations.zip ADDED
File without changes
config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "_name_or_path": "pytorch-models/opus-mt-tc-bible-big-poz-deu_eng_fra_por_spa",
+   "activation_dropout": 0.0,
+   "activation_function": "relu",
+   "architectures": [
+     "MarianMTModel"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "classifier_dropout": 0.0,
+   "d_model": 1024,
+   "decoder_attention_heads": 16,
+   "decoder_ffn_dim": 4096,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 60413,
+   "decoder_vocab_size": 60414,
+   "dropout": 0.1,
+   "encoder_attention_heads": 16,
+   "encoder_ffn_dim": 4096,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "eos_token_id": 269,
+   "forced_eos_token_id": null,
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "max_length": null,
+   "max_position_embeddings": 1024,
+   "model_type": "marian",
+   "normalize_embedding": false,
+   "num_beams": null,
+   "num_hidden_layers": 6,
+   "pad_token_id": 60413,
+   "scale_embedding": true,
+   "share_encoder_decoder_embeddings": true,
+   "static_position_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.45.1",
+   "use_cache": true,
+   "vocab_size": 60414
+ }
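
The configuration above describes a standard transformer-big Marian architecture: 6 encoder and 6 decoder layers, model dimension 1024, 16 attention heads, and a 60414-token vocabulary shared between encoder and decoder. A minimal sketch for loading and inspecting it (the Hub ID is taken from the pipeline example in the README):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Helsinki-NLP/opus-mt-tc-bible-big-poz-deu_eng_fra_por_spa")
assert cfg.model_type == "marian"
print(cfg.encoder_layers, cfg.decoder_layers)    # 6 6
print(cfg.d_model, cfg.encoder_attention_heads)  # 1024 16
print(cfg.vocab_size)                            # 60414, shared with the decoder
```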
generation_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "_from_model_config": true,
+   "bad_words_ids": [
+     [
+       60413
+     ]
+   ],
+   "bos_token_id": 0,
+   "decoder_start_token_id": 60413,
+   "eos_token_id": 269,
+   "forced_eos_token_id": 269,
+   "max_length": 512,
+   "num_beams": 4,
+   "pad_token_id": 60413,
+   "transformers_version": "4.45.1"
+ }
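
These defaults (beam search with 4 beams, a 512-token length limit, and the pad token 60413 excluded via `bad_words_ids`) are picked up automatically by `generate()` and can be overridden per call. A small sketch, reusing the `model` and `tokenizer` from the README example (the sample sentence is illustrative Indonesian, one of the accepted source languages):

```python
# greedy decoding with a tighter length budget, overriding the
# num_beams=4 / max_length=512 defaults from generation_config.json
inputs = tokenizer(">>eng<< Selamat pagi!", return_tensors="pt")
out = model.generate(**inputs, num_beams=1, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```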
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2ea56a7b4efbec25875e67b684d1c0bbf00a004adbc51f2f4dfdc57d320ba797
+ size 953156520
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bec846ad9846b763c81e69321b2529ae1283a9875c47ef587e29fb377dc1350b
+ size 953207749
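
Both weight files are stored as Git LFS pointers; the actual ~953 MB payloads are fetched separately. A minimal sketch for verifying a downloaded file against the sha256 `oid` recorded in its pointer (the local path is assumed):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Hash the file in chunks so large checkpoints need not fit in memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# expected digest from the model.safetensors LFS pointer above
expected = "2ea56a7b4efbec25875e67b684d1c0bbf00a004adbc51f2f4dfdc57d320ba797"
print(sha256_of("model.safetensors") == expected)
```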
source.spm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a3c59868980cb16fc1ae3237239299160f2a43ecd20da957276d4d1a33019e2
+ size 773320
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98706fed6fcb8c2973da9e827f8f5af778b107a15ed8394c47dc5ad2ffaf7386
+ size 814021
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"source_lang": "poz", "target_lang": "deu+eng+fra+por+spa", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "separate_vocabs": false, "special_tokens_map_file": null, "name_or_path": "marian-models/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30/poz-deu+eng+fra+por+spa", "tokenizer_class": "MarianTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff