aarontseng committed on
Commit
7eda9aa
·
verified ·
1 Parent(s): 6855334

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +76 -3
README.md CHANGED
@@ -1,3 +1,76 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - zh
5
+ - en
6
+ pipeline_tag: translation
7
+ tags:
8
+ - text2text-generation
9
+ - translation
10
+ ---
11
+
12
+ ## How to use
13
+
14
+ ```bash
15
+ git lfs install
16
+ git clone https://huggingface.co/saivanlab/saivan-mt-en-zh
17
+ ```
18
+
19
+ ```bash
20
+ pip install ctranslate2
21
+ pip install sentencepiece
22
+ ```
23
+
24
+ ## Basic Usage
25
+
26
+ ```python
27
+ import ctranslate2
28
+ import sentencepiece
29
+
30
+ src_model = sentencepiece.SentencePieceProcessor()
31
+ src_model.load("saivan-mt-en-zh/source.model")
32
+ tgt_model = sentencepiece.SentencePieceProcessor()
33
+ tgt_model.load("saivan-mt-en-zh/target.model")
34
+
35
+ translator = ctranslate2.Translator("saivan-mt-en-zh", device="cuda") # "cpu" or "cuda"
36
+
37
+ encoded_line = src_model.encode_as_pieces("input text")
38
+
39
+ results = translator.translate_batch([encoded_line], batch_type="tokens", max_batch_size=64)
40
+
41
+ decoded_line = tgt_model.decode(results[0].hypotheses[0])
42
+
43
+ print(decoded_line)
44
+ ```
45
+
46
+ ## Batch translation
47
+ ```python
48
+ import ctranslate2
49
+ import sentencepiece
50
+
51
+ src_path = "input.txt"
52
+ tgt_path = "output.txt"
53
+
54
+ src_model = sentencepiece.SentencePieceProcessor()
55
+ src_model.load("saivan-mt-en-zh/source.model")
56
+ tgt_model = sentencepiece.SentencePieceProcessor()
57
+ tgt_model.load("saivan-mt-en-zh/target.model")
58
+
59
+ translator = ctranslate2.Translator("saivan-mt-en-zh", device="cuda") # "cpu" or "cuda"
60
+
61
+ src_file = open(src_path, 'r', encoding="utf-8")
62
+ src_lines = src_file.readlines()
63
+
64
+ encoded_lines = src_model.encode_as_pieces(src_lines)
65
+
66
+ results = translator.translate_batch(encoded_lines, batch_type="tokens", max_batch_size=1024)
67
+ translations = [translation.hypotheses[0] for translation in results]
68
+
69
+ decoded_lines = tgt_model.decode(translations)
70
+
71
+ tgt_file = open(tgt_path, "w", encoding="utf-8", newline='')
72
+
73
+ for line in decoded_lines:
74
+ tgt_file.write(line)
75
+ tgt_file.write('\n')
76
+ ```