tiedeman committed
Commit 9354b3f
1 Parent(s): 3c2340e

Initial commit
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,761 @@
---
library_name: transformers
language:
- br
- cy
- de
- en
- es
- fr
- ga
- gd
- gv
- kw
- pt

tags:
- translation
- opus-mt-tc-bible

license: apache-2.0
model-index:
- name: opus-mt-tc-bible-big-cel-deu_eng_fra_por_spa
  results:
  - task:
      name: Translation cym-deu
      type: translation
      args: cym-deu
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: cym-deu
    metrics:
    - name: BLEU
      type: bleu
      value: 22.6
    - name: chr-F
      type: chrf
      value: 0.52745
  - task:
      name: Translation cym-eng
      type: translation
      args: cym-eng
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: cym-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 55.5
    - name: chr-F
      type: chrf
      value: 0.75234
  - task:
      name: Translation cym-fra
      type: translation
      args: cym-fra
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: cym-fra
    metrics:
    - name: BLEU
      type: bleu
      value: 31.4
    - name: chr-F
      type: chrf
      value: 0.58339
  - task:
      name: Translation cym-por
      type: translation
      args: cym-por
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: cym-por
    metrics:
    - name: BLEU
      type: bleu
      value: 18.3
    - name: chr-F
      type: chrf
      value: 0.47566
  - task:
      name: Translation cym-spa
      type: translation
      args: cym-spa
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: cym-spa
    metrics:
    - name: BLEU
      type: bleu
      value: 19.9
    - name: chr-F
      type: chrf
      value: 0.48834
  - task:
      name: Translation gla-deu
      type: translation
      args: gla-deu
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: gla-deu
    metrics:
    - name: BLEU
      type: bleu
      value: 13.0
    - name: chr-F
      type: chrf
      value: 0.41962
  - task:
      name: Translation gla-eng
      type: translation
      args: gla-eng
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: gla-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 26.4
    - name: chr-F
      type: chrf
      value: 0.53374
  - task:
      name: Translation gla-fra
      type: translation
      args: gla-fra
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: gla-fra
    metrics:
    - name: BLEU
      type: bleu
      value: 16.6
    - name: chr-F
      type: chrf
      value: 0.44916
  - task:
      name: Translation gla-por
      type: translation
      args: gla-por
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: gla-por
    metrics:
    - name: BLEU
      type: bleu
      value: 12.1
    - name: chr-F
      type: chrf
      value: 0.39790
  - task:
      name: Translation gla-spa
      type: translation
      args: gla-spa
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: gla-spa
    metrics:
    - name: BLEU
      type: bleu
      value: 12.9
    - name: chr-F
      type: chrf
      value: 0.40375
  - task:
      name: Translation gle-deu
      type: translation
      args: gle-deu
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: gle-deu
    metrics:
    - name: BLEU
      type: bleu
      value: 19.2
    - name: chr-F
      type: chrf
      value: 0.49962
  - task:
      name: Translation gle-eng
      type: translation
      args: gle-eng
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: gle-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 38.9
    - name: chr-F
      type: chrf
      value: 0.64866
  - task:
      name: Translation gle-fra
      type: translation
      args: gle-fra
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: gle-fra
    metrics:
    - name: BLEU
      type: bleu
      value: 26.7
    - name: chr-F
      type: chrf
      value: 0.54564
  - task:
      name: Translation gle-por
      type: translation
      args: gle-por
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: gle-por
    metrics:
    - name: BLEU
      type: bleu
      value: 14.9
    - name: chr-F
      type: chrf
      value: 0.44768
  - task:
      name: Translation gle-spa
      type: translation
      args: gle-spa
    dataset:
      name: flores200-devtest
      type: flores200-devtest
      args: gle-spa
    metrics:
    - name: BLEU
      type: bleu
      value: 18.7
    - name: chr-F
      type: chrf
      value: 0.47347
  - task:
      name: Translation cym-deu
      type: translation
      args: cym-deu
    dataset:
      name: flores101-devtest
      type: flores_101
      args: cym deu devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 22.4
    - name: chr-F
      type: chrf
      value: 0.52672
  - task:
      name: Translation cym-fra
      type: translation
      args: cym-fra
    dataset:
      name: flores101-devtest
      type: flores_101
      args: cym fra devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 31.3
    - name: chr-F
      type: chrf
      value: 0.58299
  - task:
      name: Translation cym-por
      type: translation
      args: cym-por
    dataset:
      name: flores101-devtest
      type: flores_101
      args: cym por devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 18.4
    - name: chr-F
      type: chrf
      value: 0.47733
  - task:
      name: Translation gle-eng
      type: translation
      args: gle-eng
    dataset:
      name: flores101-devtest
      type: flores_101
      args: gle eng devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 38.6
    - name: chr-F
      type: chrf
      value: 0.64773
  - task:
      name: Translation gle-fra
      type: translation
      args: gle-fra
    dataset:
      name: flores101-devtest
      type: flores_101
      args: gle fra devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 26.5
    - name: chr-F
      type: chrf
      value: 0.54559
  - task:
      name: Translation cym-deu
      type: translation
      args: cym-deu
    dataset:
      name: ntrex128
      type: ntrex128
      args: cym-deu
    metrics:
    - name: BLEU
      type: bleu
      value: 16.3
    - name: chr-F
      type: chrf
      value: 0.46627
  - task:
      name: Translation cym-eng
      type: translation
      args: cym-eng
    dataset:
      name: ntrex128
      type: ntrex128
      args: cym-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 40.0
    - name: chr-F
      type: chrf
      value: 0.65343
  - task:
      name: Translation cym-fra
      type: translation
      args: cym-fra
    dataset:
      name: ntrex128
      type: ntrex128
      args: cym-fra
    metrics:
    - name: BLEU
      type: bleu
      value: 23.8
    - name: chr-F
      type: chrf
      value: 0.51183
  - task:
      name: Translation cym-por
      type: translation
      args: cym-por
    dataset:
      name: ntrex128
      type: ntrex128
      args: cym-por
    metrics:
    - name: BLEU
      type: bleu
      value: 14.4
    - name: chr-F
      type: chrf
      value: 0.42857
  - task:
      name: Translation cym-spa
      type: translation
      args: cym-spa
    dataset:
      name: ntrex128
      type: ntrex128
      args: cym-spa
    metrics:
    - name: BLEU
      type: bleu
      value: 25.0
    - name: chr-F
      type: chrf
      value: 0.51542
  - task:
      name: Translation gle-deu
      type: translation
      args: gle-deu
    dataset:
      name: ntrex128
      type: ntrex128
      args: gle-deu
    metrics:
    - name: BLEU
      type: bleu
      value: 15.5
    - name: chr-F
      type: chrf
      value: 0.46495
  - task:
      name: Translation gle-eng
      type: translation
      args: gle-eng
    dataset:
      name: ntrex128
      type: ntrex128
      args: gle-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 33.5
    - name: chr-F
      type: chrf
      value: 0.60913
  - task:
      name: Translation gle-fra
      type: translation
      args: gle-fra
    dataset:
      name: ntrex128
      type: ntrex128
      args: gle-fra
    metrics:
    - name: BLEU
      type: bleu
      value: 20.7
    - name: chr-F
      type: chrf
      value: 0.49513
  - task:
      name: Translation gle-por
      type: translation
      args: gle-por
    dataset:
      name: ntrex128
      type: ntrex128
      args: gle-por
    metrics:
    - name: BLEU
      type: bleu
      value: 13.2
    - name: chr-F
      type: chrf
      value: 0.41767
  - task:
      name: Translation gle-spa
      type: translation
      args: gle-spa
    dataset:
      name: ntrex128
      type: ntrex128
      args: gle-spa
    metrics:
    - name: BLEU
      type: bleu
      value: 23.6
    - name: chr-F
      type: chrf
      value: 0.50755
  - task:
      name: Translation bre-eng
      type: translation
      args: bre-eng
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: bre-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 35.0
    - name: chr-F
      type: chrf
      value: 0.53473
  - task:
      name: Translation bre-fra
      type: translation
      args: bre-fra
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: bre-fra
    metrics:
    - name: BLEU
      type: bleu
      value: 28.3
    - name: chr-F
      type: chrf
      value: 0.49013
  - task:
      name: Translation cym-eng
      type: translation
      args: cym-eng
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: cym-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 52.4
    - name: chr-F
      type: chrf
      value: 0.68892
  - task:
      name: Translation gla-eng
      type: translation
      args: gla-eng
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: gla-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 23.2
    - name: chr-F
      type: chrf
      value: 0.39607
  - task:
      name: Translation gla-spa
      type: translation
      args: gla-spa
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: gla-spa
    metrics:
    - name: BLEU
      type: bleu
      value: 26.1
    - name: chr-F
      type: chrf
      value: 0.51208
  - task:
      name: Translation gle-eng
      type: translation
      args: gle-eng
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: gle-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 50.7
    - name: chr-F
      type: chrf
      value: 0.64268
  - task:
      name: Translation multi-multi
      type: translation
      args: multi-multi
    dataset:
      name: tatoeba-test-v2020-07-28-v2023-09-26
      type: tatoeba_mt
      args: multi-multi
    metrics:
    - name: BLEU
      type: bleu
      value: 24.9
    - name: chr-F
      type: chrf
      value: 0.42670
---
# opus-mt-tc-bible-big-cel-deu_eng_fra_por_spa

## Table of Contents
- [Model Details](#model-details)
- [Uses](#uses)
- [Risks, Limitations and Biases](#risks-limitations-and-biases)
- [How to Get Started With the Model](#how-to-get-started-with-the-model)
- [Training](#training)
- [Evaluation](#evaluation)
- [Citation Information](#citation-information)
- [Acknowledgements](#acknowledgements)

## Model Details

Neural machine translation model for translating from Celtic languages (cel) to German, English, French, Portuguese and Spanish (deu+eng+fra+por+spa).

This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages in the world. All models were originally trained with [Marian NMT](https://marian-nmt.github.io/), an efficient NMT framework written in pure C++, and then converted to PyTorch using the transformers library by Hugging Face. Training data is taken from [OPUS](https://opus.nlpl.eu/) and the training pipelines follow the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/OPUS-MT-train).

**Model Description:**
- **Developed by:** Language Technology Research Group at the University of Helsinki
- **Model Type:** Translation (transformer-big)
- **Release:** 2024-05-30
- **License:** Apache-2.0
- **Language(s):**
  - Source Language(s): bre cor cym gla gle glv
  - Target Language(s): deu eng fra por spa
  - Valid Target Language Labels: >>deu<< >>eng<< >>fra<< >>por<< >>spa<< >>xxx<<
- **Original Model:** [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/cel-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip)
- **Resources for more information:**
  - [OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/cel-deu%2Beng%2Bfra%2Bpor%2Bspa/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-05-30)
  - [OPUS-MT-train GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
  - [More information about MarianNMT models in the transformers library](https://huggingface.co/docs/transformers/model_doc/marian)
  - [Tatoeba Translation Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge/)
  - [HPLT bilingual data v1 (as part of the Tatoeba Translation Challenge dataset)](https://hplt-project.org/datasets/v1)
  - [A massively parallel Bible corpus](https://aclanthology.org/L14-1215/)

This is a multilingual translation model with multiple target languages. A sentence-initial language token is required in the form of `>>id<<` (id = valid target language ID), e.g. `>>deu<<`.

## Uses

This model can be used for translation and text-to-text generation.

## Risks, Limitations and Biases

**CONTENT WARNING: Readers should be aware that the model is trained on various public data sets that may contain content that is disturbing, offensive, and can propagate historical and current stereotypes.**

Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)).

## How to Get Started With the Model

A short code example:

```python
from transformers import MarianMTModel, MarianTokenizer

src_text = [
    ">>deu<< Replace this with text in an accepted source language.",
    ">>spa<< This is the second sentence."
]

model_name = "Helsinki-NLP/opus-mt-tc-bible-big-cel-deu_eng_fra_por_spa"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))

for t in translated:
    print(tokenizer.decode(t, skip_special_tokens=True))
```

You can also use OPUS-MT models with the transformers pipelines, for example:

```python
from transformers import pipeline
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-bible-big-cel-deu_eng_fra_por_spa")
print(pipe(">>deu<< Replace this with text in an accepted source language."))
```
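
Because the target language is selected by the sentence-initial `>>id<<` token, a single pipeline call can mix target languages. A minimal sketch, reusing `pipe` from the example above (the Welsh input sentence is only an illustrative placeholder):

```python
# Hypothetical batch: one Welsh source sentence routed to two different targets
batch = [
    ">>fra<< Mae'r tywydd yn braf heddiw.",
    ">>por<< Mae'r tywydd yn braf heddiw.",
]
print(pipe(batch))  # one translation dict per input
```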

## Training

- **Data**: opusTCv20230926max50+bt+jhubc ([source](https://github.com/Helsinki-NLP/Tatoeba-Challenge))
- **Pre-processing**: SentencePiece (spm32k,spm32k); see the tokenization sketch after this list
- **Model Type:** transformer-big
- **Original MarianNMT Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/cel-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip)
- **Training Scripts**: [GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
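
The `source.spm` and `target.spm` files in this repository are the SentencePiece models referenced in the pre-processing step above. A minimal sketch for inspecting the source-side segmentation, assuming the `sentencepiece` package and a local copy of `source.spm` (the Welsh sentence is an illustrative placeholder):

```python
import sentencepiece as spm

# Load the source-side SentencePiece model shipped with this repository
sp = spm.SentencePieceProcessor(model_file="source.spm")

print(sp.get_piece_size())  # spm32k -> a vocabulary of roughly 32k pieces
print(sp.encode("Mae hi'n bwrw glaw heddiw.", out_type=str))  # subword pieces
```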

## Evaluation

* [Model scores at the OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/cel-deu%2Beng%2Bfra%2Bpor%2Bspa/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-05-30)
* test set translations: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/cel-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.test.txt)
* test set scores: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/cel-deu+eng+fra+por+spa/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.eval.txt)
* benchmark results: [benchmark_results.txt](benchmark_results.txt)
* benchmark output: [benchmark_translations.zip](benchmark_translations.zip)

| langpair | testset | chr-F | BLEU | #sent | #words |
|----------|---------|-------|------|-------|--------|
| bre-eng | tatoeba-test-v2021-08-07 | 0.53473 | 35.0 | 383 | 2065 |
| bre-fra | tatoeba-test-v2021-08-07 | 0.49013 | 28.3 | 2494 | 13324 |
| cym-eng | tatoeba-test-v2021-08-07 | 0.68892 | 52.4 | 818 | 5563 |
| gla-eng | tatoeba-test-v2021-08-07 | 0.39607 | 23.2 | 955 | 6611 |
| gla-spa | tatoeba-test-v2021-08-07 | 0.51208 | 26.1 | 289 | 1608 |
| gle-eng | tatoeba-test-v2021-08-07 | 0.64268 | 50.7 | 1913 | 11190 |
| cym-deu | flores101-devtest | 0.52672 | 22.4 | 1012 | 25094 |
| cym-fra | flores101-devtest | 0.58299 | 31.3 | 1012 | 28343 |
| cym-por | flores101-devtest | 0.47733 | 18.4 | 1012 | 26519 |
| gle-eng | flores101-devtest | 0.64773 | 38.6 | 1012 | 24721 |
| gle-fra | flores101-devtest | 0.54559 | 26.5 | 1012 | 28343 |
| cym-deu | flores200-devtest | 0.52745 | 22.6 | 1012 | 25094 |
| cym-eng | flores200-devtest | 0.75234 | 55.5 | 1012 | 24721 |
| cym-fra | flores200-devtest | 0.58339 | 31.4 | 1012 | 28343 |
| cym-por | flores200-devtest | 0.47566 | 18.3 | 1012 | 26519 |
| cym-spa | flores200-devtest | 0.48834 | 19.9 | 1012 | 29199 |
| gla-deu | flores200-devtest | 0.41962 | 13.0 | 1012 | 25094 |
| gla-eng | flores200-devtest | 0.53374 | 26.4 | 1012 | 24721 |
| gla-fra | flores200-devtest | 0.44916 | 16.6 | 1012 | 28343 |
| gla-spa | flores200-devtest | 0.40375 | 12.9 | 1012 | 29199 |
| gle-deu | flores200-devtest | 0.49962 | 19.2 | 1012 | 25094 |
| gle-eng | flores200-devtest | 0.64866 | 38.9 | 1012 | 24721 |
| gle-fra | flores200-devtest | 0.54564 | 26.7 | 1012 | 28343 |
| gle-por | flores200-devtest | 0.44768 | 14.9 | 1012 | 26519 |
| gle-spa | flores200-devtest | 0.47347 | 18.7 | 1012 | 29199 |
| cym-deu | ntrex128 | 0.46627 | 16.3 | 1997 | 48761 |
| cym-eng | ntrex128 | 0.65343 | 40.0 | 1997 | 47673 |
| cym-fra | ntrex128 | 0.51183 | 23.8 | 1997 | 53481 |
| cym-por | ntrex128 | 0.42857 | 14.4 | 1997 | 51631 |
| cym-spa | ntrex128 | 0.51542 | 25.0 | 1997 | 54107 |
| gle-deu | ntrex128 | 0.46495 | 15.5 | 1997 | 48761 |
| gle-eng | ntrex128 | 0.60913 | 33.5 | 1997 | 47673 |
| gle-fra | ntrex128 | 0.49513 | 20.7 | 1997 | 53481 |
| gle-por | ntrex128 | 0.41767 | 13.2 | 1997 | 51631 |
| gle-spa | ntrex128 | 0.50755 | 23.6 | 1997 | 54107 |
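
The chr-F and BLEU figures above are corpus-level scores. A hedged sketch of how scores of this kind can be recomputed with the `sacrebleu` package (the hypothesis and reference lists below are placeholders; the exact scoring configuration used for the table is recorded in the eval files linked above):

```python
from sacrebleu.metrics import BLEU, CHRF

hyps = ["Das Wetter ist heute schön."]    # system outputs, one per segment
refs = [["Das Wetter ist heute schön."]]  # one reference stream

print(BLEU().corpus_score(hyps, refs))
print(CHRF().corpus_score(hyps, refs))
```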

## Citation Information

* Publications: [Democratizing neural machine translation with OPUS-MT](https://doi.org/10.1007/s10579-023-09704-w), [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) and [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/) (please cite if you use this model)

```bibtex
@article{tiedemann2023democratizing,
  title={Democratizing neural machine translation with {OPUS-MT}},
  author={Tiedemann, J{\"o}rg and Aulamo, Mikko and Bakshandaeva, Daria and Boggia, Michele and Gr{\"o}nroos, Stig-Arne and Nieminen, Tommi and Raganato, Alessandro and Scherrer, Yves and Vazquez, Raul and Virpioja, Sami},
  journal={Language Resources and Evaluation},
  number={58},
  pages={713--755},
  year={2023},
  publisher={Springer Nature},
  issn={1574-0218},
  doi={10.1007/s10579-023-09704-w}
}

@inproceedings{tiedemann-thottingal-2020-opus,
  title = "{OPUS}-{MT} {--} Building open translation services for the World",
  author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh},
  booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
  month = nov,
  year = "2020",
  address = "Lisboa, Portugal",
  publisher = "European Association for Machine Translation",
  url = "https://aclanthology.org/2020.eamt-1.61",
  pages = "479--480",
}

@inproceedings{tiedemann-2020-tatoeba,
  title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
  author = {Tiedemann, J{\"o}rg},
  booktitle = "Proceedings of the Fifth Conference on Machine Translation",
  month = nov,
  year = "2020",
  address = "Online",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/2020.wmt-1.139",
  pages = "1174--1182",
}
```

## Acknowledgements

The work is supported by the [HPLT project](https://hplt-project.org/), funded by the European Union’s Horizon Europe research and innovation programme under grant agreement No 101070350. We are also grateful for the generous computational resources and IT infrastructure provided by [CSC -- IT Center for Science](https://www.csc.fi/), Finland, and the [EuroHPC supercomputer LUMI](https://www.lumi-supercomputer.eu/).

## Model conversion info

* transformers version: 4.45.1
* OPUS-MT git hash: a0ea3b3
* port time: Mon Oct 7 23:09:42 EEST 2024
* port machine: LM0-400-22516.local
benchmark_results.txt ADDED
@@ -0,0 +1,52 @@
multi-multi tatoeba-test-v2020-07-28-v2023-09-26 0.42670 24.9 10000 57351
cym-deu flores101-devtest 0.52672 22.4 1012 25094
cym-fra flores101-devtest 0.58299 31.3 1012 28343
cym-por flores101-devtest 0.47733 18.4 1012 26519
gle-eng flores101-devtest 0.64773 38.6 1012 24721
gle-fra flores101-devtest 0.54559 26.5 1012 28343
cym-deu flores200-devtest 0.52745 22.6 1012 25094
cym-eng flores200-devtest 0.75234 55.5 1012 24721
cym-fra flores200-devtest 0.58339 31.4 1012 28343
cym-por flores200-devtest 0.47566 18.3 1012 26519
cym-spa flores200-devtest 0.48834 19.9 1012 29199
gla-deu flores200-devtest 0.41962 13.0 1012 25094
gla-eng flores200-devtest 0.53374 26.4 1012 24721
gla-fra flores200-devtest 0.44916 16.6 1012 28343
gla-por flores200-devtest 0.39790 12.1 1012 26519
gla-spa flores200-devtest 0.40375 12.9 1012 29199
gle-deu flores200-devtest 0.49962 19.2 1012 25094
gle-eng flores200-devtest 0.64866 38.9 1012 24721
gle-fra flores200-devtest 0.54564 26.7 1012 28343
gle-por flores200-devtest 0.44768 14.9 1012 26519
gle-spa flores200-devtest 0.47347 18.7 1012 29199
cym-deu ntrex128 0.46627 16.3 1997 48761
cym-eng ntrex128 0.65343 40.0 1997 47673
cym-fra ntrex128 0.51183 23.8 1997 53481
cym-por ntrex128 0.42857 14.4 1997 51631
cym-spa ntrex128 0.51542 25.0 1997 54107
gle-deu ntrex128 0.46495 15.5 1997 48761
gle-eng ntrex128 0.60913 33.5 1997 47673
gle-fra ntrex128 0.49513 20.7 1997 53481
gle-por ntrex128 0.41767 13.2 1997 51631
gle-spa ntrex128 0.50755 23.6 1997 54107
cor-fra tatoeba-test-v2020-07-28 0.24652 6.2 567 3136
gla-eng tatoeba-test-v2020-07-28 0.40979 25.3 917 6366
gle-eng tatoeba-test-v2020-07-28 0.64935 51.8 1924 11247
bre-eng tatoeba-test-v2021-03-30 0.53219 34.3 385 2091
bre-fra tatoeba-test-v2021-03-30 0.49675 28.8 2500 13343
cor-deu tatoeba-test-v2021-03-30 0.24298 6.8 822 4682
cor-fra tatoeba-test-v2021-03-30 0.24669 6.2 568 3142
cor-spa tatoeba-test-v2021-03-30 0.21930 4.5 207 1085
gla-eng tatoeba-test-v2021-03-30 0.41147 25.6 957 6628
gla-spa tatoeba-test-v2021-03-30 0.49577 24.6 290 1611
gle-eng tatoeba-test-v2021-03-30 0.64935 51.8 1924 11247
bre-eng tatoeba-test-v2021-08-07 0.53473 35.0 383 2065
bre-fra tatoeba-test-v2021-08-07 0.49013 28.3 2494 13324
cor-deu tatoeba-test-v2021-08-07 0.24055 6.5 821 4676
cor-eng tatoeba-test-v2021-08-07 0.19002 4.9 3198 16829
cor-fra tatoeba-test-v2021-08-07 0.24494 6.4 555 3092
cor-spa tatoeba-test-v2021-08-07 0.22170 4.7 206 1080
cym-eng tatoeba-test-v2021-08-07 0.68892 52.4 818 5563
gla-eng tatoeba-test-v2021-08-07 0.39607 23.2 955 6611
gla-spa tatoeba-test-v2021-08-07 0.51208 26.1 289 1608
gle-eng tatoeba-test-v2021-08-07 0.64268 50.7 1913 11190
benchmark_translations.zip ADDED
File without changes
config.json ADDED
@@ -0,0 +1,41 @@
{
  "_name_or_path": "pytorch-models/opus-mt-tc-bible-big-cel-deu_eng_fra_por_spa",
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 56598,
  "decoder_vocab_size": 56599,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 574,
  "forced_eos_token_id": null,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_length": null,
  "max_position_embeddings": 1024,
  "model_type": "marian",
  "normalize_embedding": false,
  "num_beams": null,
  "num_hidden_layers": 6,
  "pad_token_id": 56598,
  "scale_embedding": true,
  "share_encoder_decoder_embeddings": true,
  "static_position_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.45.1",
  "use_cache": true,
  "vocab_size": 56599
}
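
For reference, a small sketch using transformers' `AutoConfig` to confirm the special-token ids shown above (`pad_token_id` and `decoder_start_token_id` are both 56598, `eos_token_id` is 574):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Helsinki-NLP/opus-mt-tc-bible-big-cel-deu_eng_fra_por_spa")
assert cfg.pad_token_id == cfg.decoder_start_token_id == 56598
print(cfg.vocab_size, cfg.eos_token_id)  # 56599 574
```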
generation_config.json ADDED
@@ -0,0 +1,16 @@
{
  "_from_model_config": true,
  "bad_words_ids": [
    [
      56598
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 56598,
  "eos_token_id": 574,
  "forced_eos_token_id": 574,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 56598,
  "transformers_version": "4.45.1"
}
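
These defaults (beam size 4, maximum length 512, and the pad token banned via `bad_words_ids`) are picked up automatically by `generate()`. A minimal sketch of overriding them for a single call, reusing `model`, `tokenizer` and `src_text` from the README example:

```python
# Override the stored generation defaults for one call
translated = model.generate(
    **tokenizer(src_text, return_tensors="pt", padding=True),
    num_beams=6,      # stored default: 4
    max_length=128,   # stored default: 512
)
```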
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f13b0f3fd4fa985bb0809140b29431083e4178945d8ffa382cd367a3a6c08dd3
size 937515020
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:12ba5d8557a1cc2b826582f7042c281e64cd05cbeebc21dfa3efb23dcbb2f30f
size 937566277
source.spm ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b3d47a98f51d3b3ef3a0bdb9ab022bbbdcf14f45dde7dec4ab910e2baab534b0
size 801982
special_tokens_map.json ADDED
@@ -0,0 +1 @@
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:70212b4c82db279a975546bac06c313785735f0a78e534aedcb9a297293af9f0
size 800773
tokenizer_config.json ADDED
@@ -0,0 +1 @@
{"source_lang": "cel", "target_lang": "deu+eng+fra+por+spa", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "separate_vocabs": false, "special_tokens_map_file": null, "name_or_path": "marian-models/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30/cel-deu+eng+fra+por+spa", "tokenizer_class": "MarianTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff