dimalik committed
Commit: 4e1fe68
Parent(s): 617cb07

add model artifacts

Files changed:
- README.md +58 -2
- adapter_config.json +25 -0
- adapter_model.bin +3 -0
- added_tokens.json +3 -0
- special_tokens_map.json +6 -0
- tokenizer.model +3 -0
- tokenizer_config.json +36 -0
README.md CHANGED
@@ -24,7 +24,7 @@ library_name: transformers
 
 # Model Card for mEdIT-xxl
 
-
+The `medit-xxl` model was obtained by fine-tuning the `MBZUAI/bactrian-x-llama-13b-lora` model on the mEdIT dataset.
 
 **Paper:** mEdIT: Multilingual Text Editing via Instruction Tuning
 
@@ -43,4 +43,60 @@ This model was obtained by fine-tuning the `MBZUAI/bactrian-x-llama-13b-lora` mo
 - **Paper:** TBA
 
 ## How to use
-
+
+### Instruction format
+
+Adhering to the following instruction format is essential; deviating from it may cause the model to produce suboptimal results.
+
+```
+instruction_tokens = [
+    "Instruction",
+    "Anweisung",
+    ...
+]
+
+input_tokens = [
+    "Input",
+    "Aporte",
+    ...
+]
+
+output_tokens = [
+    "Output",
+    "Produzione",
+    ...
+]
+
+task_descriptions = [
+    "Fix grammatical errors in this sentence",  # <-- GEC task
+    "Umschreiben Sie den Satz",  # <-- Paraphrasing
+    ...
+]
+```
+
+The full list of possible instruction, input, and output tokens and task descriptions can be found in the Appendix of our paper.
+
+```
+prompt_template = """### <instruction_token>:\n<task description>\n### <input_token>:\n<input>\n### <output_token>:\n\n"""
+```
+
+Note that the tokens and the task description need not be in the language of the input.
+
+### Run the model
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+model_id = "grammarly/medit-xxl"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id)
+
+prompt = '### 命令:\n文章を文法的にする\n### 入力:\nDear Sir ,\n### 出力:\n\n'
+
+inputs = tokenizer(prompt, return_tensors='pt')
+outputs = model.generate(**inputs, max_new_tokens=20)
+
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
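As an aside on the prompt template added in the README hunk above: the sketch below shows one way a prompt could be assembled from that template for an English grammatical-error-correction request. The `build_prompt` helper is a hypothetical convenience, not part of this commit; the token and task-description strings are taken from the README's example lists.

```python
# Illustrative only: assembles a prompt in the format described in the README above.
# The build_prompt helper and the English token choices are assumptions for demonstration.

def build_prompt(instruction_token: str, task_description: str,
                 input_token: str, text: str, output_token: str) -> str:
    # Mirrors: "### <instruction_token>:\n<task description>\n### <input_token>:\n<input>\n### <output_token>:\n\n"
    return (
        f"### {instruction_token}:\n{task_description}\n"
        f"### {input_token}:\n{text}\n"
        f"### {output_token}:\n\n"
    )

prompt = build_prompt(
    instruction_token="Instruction",                              # from the README's instruction_tokens list
    task_description="Fix grammatical errors in this sentence",  # GEC task description from the README
    input_token="Input",
    text="Dear Sir ,",
    output_token="Output",
)
print(prompt)
```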
adapter_config.json ADDED
@@ -0,0 +1,25 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "MBZUAI/bactrian-x-llama-13b-merged",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
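Since adapter_config.json describes a PEFT LoRA adapter (r=8, lora_alpha=16, applied to the attention projections) on top of `MBZUAI/bactrian-x-llama-13b-merged`, the adapter weights in adapter_model.bin can presumably also be attached explicitly with the `peft` library. The snippet below is only a sketch under that assumption; the dtype and device settings are illustrative, not prescribed by the commit.

```python
# Sketch: load the base model named in adapter_config.json, then attach this LoRA adapter.
# Assumes `transformers` and `peft` are installed; torch_dtype/device_map are illustrative
# (device_map="auto" additionally requires `accelerate`).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "MBZUAI/bactrian-x-llama-13b-merged"   # base_model_name_or_path from adapter_config.json
adapter_id = "grammarly/medit-xxl"

tokenizer = AutoTokenizer.from_pretrained(adapter_id)
base_model = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.float16, device_map="auto"
)

# Wraps the base model with the LoRA weights stored in adapter_model.bin.
model = PeftModel.from_pretrained(base_model, adapter_id)
model.eval()
```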
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e4fab47b3c5a601c9654b7e86324afc2371d233aad3ffe1702f711e47820f73
+size 26329549
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32000
+}
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "[PAD]",
+  "unk_token": "<unk>"
+}
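added_tokens.json and special_tokens_map.json register an extra `[PAD]` token at id 32000, one slot beyond the original 32,000-entry LLaMA vocabulary. If you load the base checkpoint yourself rather than following the README recipe, its embedding matrix would typically need to be resized to cover that id; the snippet below sketches that common `transformers` pattern, which is assumed here rather than documented by this commit.

```python
# Sketch: make a base LLaMA checkpoint aware of the extra [PAD] token (id 32000).
# Identifiers come from this repo; the resize step is a common transformers pattern, assumed here.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("grammarly/medit-xxl")
model = AutoModelForCausalLM.from_pretrained("MBZUAI/bactrian-x-llama-13b-merged")

print(tokenizer.pad_token, tokenizer.pad_token_id)   # expected: "[PAD]" 32000

# Grow the embedding matrix so id 32000 is a valid row before generation or fine-tuning.
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
    model.resize_token_embeddings(len(tokenizer))
```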
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,36 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "use_default_system_prompt": true
+}
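tokenizer_config.json sets `add_bos_token: true` and `add_eos_token: false`, so encoded prompts begin with `<s>` but are not terminated with `</s>`, which is what you want when the model is expected to keep generating. The check below is only a sketch of how this could be verified; it introduces no identifiers beyond those already in this commit.

```python
# Sketch: verify how the tokenizer configuration above affects prompt encoding.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("grammarly/medit-xxl")

ids = tokenizer("### Instruction:").input_ids
print(tokenizer.convert_ids_to_tokens(ids))
# With add_bos_token=true and add_eos_token=false, the token sequence should start
# with "<s>" and should not end with "</s>".
```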