**RUMI-TO-JAWI TRANSLATION MODEL**

References:
https://github.com/ken11/mbart-finetuning/blob/master/mbart-finetuning.ipynb

In [None]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths used later resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Latest Code**

In [None]:
!pip install torch transformers datasets numpy pandas sentencepiece

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install fairseq==0.6.2



In [None]:
from transformers import MBartForConditionalGeneration, MBartTokenizer, MBartConfig

In [None]:
from datasets import load_from_disk

**Preparing the Dataset**

In [None]:
# Load the combined Rumi/Jawi instruction dataset by its Hugging Face dataset id.
from datasets import load_dataset
dataset = load_dataset("mesolitica/rumi-jawi-instructions")

In [None]:
print(dataset)
print(dataset['train'][0])  # Display the first sample

DatasetDict({
    train: Dataset({
        features: ['prompt_input', 'input', 'output'],
        num_rows: 1499594
    })
})
{'prompt_input': None, 'input': 'tukar ke rumi: بوين تنجوڠ مروڤاكن سبواه كچامتن د كابوڤاتين كاڤواس هولو، كليمنتن بارت، إندونيسيا.', 'output': 'Boyan Tanjung merupakan sebuah kecamatan di Kabupaten Kapuas Hulu, Kalimantan Barat, Indonesia.'}


In [None]:
# Drop the 'prompt_input' column, which no later cell reads.
# The guard keeps the cell re-runnable: removing a column that is already gone raises.
if 'prompt_input' in dataset['train'].column_names:
    dataset = dataset.remove_columns(['prompt_input'])

**Note: the dataset must be split into two new datasets.**

The mesolitica dataset combines both the 'tukar ke rumi' (Jawi-to-Rumi) and the 'tukar ke jawi' (Rumi-to-Jawi) examples in a single split,

which would otherwise affect the training process.

In [None]:
# Route each example to one of two subsets according to the instruction prefix of its 'input' text.
def _is_jawi_to_rumi(example):
    """True for examples whose instruction asks for conversion to Rumi."""
    return example['input'].startswith("tukar ke rumi")


def _is_rumi_to_jawi(example):
    """True for examples whose instruction asks for conversion to Jawi."""
    return example['input'].startswith("tukar ke jawi")


jawi_to_rumi = dataset.filter(_is_jawi_to_rumi)
rumi_to_jawi = dataset.filter(_is_rumi_to_jawi)

# Persist both subsets, each in its own directory (relative to the working directory).
for subset, target_dir in (
    (jawi_to_rumi, "jawi_to_rumi_split"),
    (rumi_to_jawi, "rumi_to_jawi_split"),
):
    subset.save_to_disk(target_dir)

print("Dataset split completed.")

Filter:   0%|          | 0/1499594 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1499594 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/749797 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/749797 [00:00<?, ? examples/s]

Dataset split completed.


**Here, we only want the 'Rumi to Jawi' datasets**

then, we will start cleaning the datasets.

In [None]:
# Load the Rumi-to-Jawi subset from the same relative path it was saved to, so the cell
# does not depend on the working directory being /content.
dataset = load_from_disk("rumi_to_jawi_split")

print(dataset)
print(dataset['train'][0])  # Display the first sample

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 749797
    })
})
{'input': 'tukar ke jawi: Boyan Tanjung merupakan sebuah kecamatan di Kabupaten Kapuas Hulu, Kalimantan Barat, Indonesia.', 'output': 'بوين تنجوڠ مروڤاكن سبواه كچامتن د كابوڤاتين كاڤواس هولو، كليمنتن بارت، إندونيسيا.'}


In [None]:
def remove_prefix(example):
    """Strip the leading instruction "tukar ke jawi: " from the 'input' field.

    Only a prefix at the very start of the text is removed; the same phrase appearing
    later inside the sentence is part of the data and is kept. Examples without the
    prefix are returned unchanged, so mapping the function twice is harmless.

    Args:
        example: mapping with a string under the key "input".

    Returns:
        The same mapping, with "input" updated in place.
    """
    prefix = "tukar ke jawi: "
    text = example["input"]
    if text.startswith(prefix):
        example["input"] = text[len(prefix):]
    return example

In [None]:
# Apply remove_prefix to every example; the mapped result is rebound to `dataset`.
dataset = dataset.map(remove_prefix)

Map:   0%|          | 0/749797 [00:00<?, ? examples/s]

In [None]:
print(dataset)
print(dataset['train'][0])  # Display the first sample

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 749797
    })
})
{'input': 'Boyan Tanjung merupakan sebuah kecamatan di Kabupaten Kapuas Hulu, Kalimantan Barat, Indonesia.', 'output': 'بوين تنجوڠ مروڤاكن سبواه كچامتن د كابوڤاتين كاڤواس هولو، كليمنتن بارت، إندونيسيا.'}


In [None]:
# Access the 'train' split
train_dataset = dataset['train']


def _tsv_field(text):
    """Replace tabs and line breaks with spaces so one field cannot break the one-pair-per-line layout."""
    return str(text).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')


# Write one "<input>\t<output>" record per line (Rumi first, Jawi second).
with open('output_dataset.txt', 'w', encoding='utf-8') as file:
    for example in train_dataset:
        file.write(f"{_tsv_field(example['input'])}\t{_tsv_field(example['output'])}\n")

print("Dataset saved!")

Dataset saved!


**If using output_dataset.txt from Google Drive**

In [None]:
# Copy the prepared TSV from Google Drive into the Colab runtime storage.
input_path = "/content/drive/MyDrive/Education/Final Year Project I/Cleaned Dataset/Datasets/output_dataset.txt"
output_path = "/content/output_dataset.txt"

# Read explicitly as UTF-8 (the file holds Jawi text) instead of relying on the platform default.
# The lines go into their own name so the `dataset` object from the earlier cells is not overwritten.
with open(input_path, 'r', encoding='utf-8') as f:
    raw_lines = f.readlines()

# Remove only the line terminator. A full strip() would also delete a leading or trailing
# tab, which is the column separator whenever one side of a pair is empty.
processed_dataset = [line.rstrip('\r\n') for line in raw_lines]

# Save the processed dataset back to the runtime storage as a new .txt file
with open(output_path, 'w', encoding='utf-8') as f:
    for item in processed_dataset:
        f.write(item + '\n')

# Check if the dataset was saved correctly
print(f"Dataset has been saved to {output_path}")


Dataset has been saved to /content/output_dataset.txt


In [None]:
!tail output_dataset.txt

Chibuto merupakan sebuah bandar yang terletak di Daerah Chibuto, Wilayah Gaza, Mozambique. Koordinat bandar Chibuto adalah 24deg41'S, 33deg32'E. Pada 12 Februari 2008, Kedutaan Besar Amerika Syarikat menghantar pesanan untuk menyarankan komuniti Amerika berhati-hati dengan huru-hara di bandar ini.	چيبوتو مروڤاكن سبواه بندر يڠ ترلتق د دايره چيبوتو، ولايه ڬز، موزامبيق. كواوردينات بندر چيبوتو اداله ٢٤دڬ٤١ءس، ٣٣دڬ٣٢ءاي. ڤد ١٢ فيبرواري ٢٠٠٨، كدوتأن بسر اميريك شريكت مڠهانتر ڤسانن اونتوق مڽارنكن كومونيتي اميريك برهاتي-هاتي دڠن هورو-حرا د بندر اين.
Praha (), juga dikenali sebagai Prague (, ialah ibu negara dan bandar terbesar Republik Czech. Nama rasminya ialah Hlavni mesto Praha , bererti Praha, Ibu Negara . Bandar ini mempunyai penduduk sekitar 1.5 juta orang. Bandar Praha dibelah sungai Vltava di tengah Bohemia. Beberapa bangunan terkenal di bandar ini antara lain adalah Jambatan Charles atau Karluv most dalam bahasa setempat, Istana Praha, Jam Astronomi di Balai Bandar dan Menara Televisye

In [None]:
from itertools import islice

# Specify the number of lines you want to read
n_lines = 5

file_path = 'output_dataset.txt'

# islice stops at end-of-file, so a file shorter than n_lines no longer yields blank output lines.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in islice(file, n_lines):
        print(line.strip())

Boyan Tanjung merupakan sebuah kecamatan di Kabupaten Kapuas Hulu, Kalimantan Barat, Indonesia.	بوين تنجوڠ مروڤاكن سبواه كچامتن د كابوڤاتين كاڤواس هولو، كليمنتن بارت، إندونيسيا.
Bunut Hilir merupakan sebuah kecamatan di Kabupaten Kapuas Hulu, Kalimantan Barat, Indonesia.	بونوت هيلير مروڤاكن سبواه كچامتن د كابوڤاتين كاڤواس هولو، كليمنتن بارت، إندونيسيا.
Bunut Hulu merupakan sebuah kecamatan di Kabupaten Kapuas Hulu, Kalimantan Barat, Indonesia.	بونوت هولو مروڤاكن سبواه كچامتن د كابوڤاتين كاڤواس هولو، كليمنتن بارت، إندونيسيا.
Embaloh Hilir merupakan sebuah kecamatan di Kabupaten Kapuas Hulu, Kalimantan Barat, Indonesia.	امبالوه هيلير مروڤاكن سبواه كچامتن د كابوڤاتين كاڤواس هولو، كليمنتن بارت، إندونيسيا.
Embaloh Hulu merupakan sebuah kecamatan di Kabupaten Kapuas Hulu, Kalimantan Barat, Indonesia.	امبالوه هولو مروڤاكن سبواه كچامتن د كابوڤاتين كاڤواس هولو، كليمنتن بارت، إندونيسيا.


**Preparing Dev, Train and Test Datasets**

Now, I am going to split 'output_dataset.txt' into train, dev and test datasets and save them in one folder.

In [None]:
import math
import os
import random

# Set the split ratios (the test split receives whatever remains after train and dev).
train_ratio = 0.8
dev_ratio = 0.1
test_ratio = 0.1
assert math.isclose(train_ratio + dev_ratio + test_ratio, 1.0), "Split ratios must sum to 1."

# Fixed seed so that a fresh kernel reproduces exactly the same split.
split_seed = 42

# Read the combined dataset
input_file = "output_dataset.txt"
output_folder = "split_dataset"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Read all records. Every kept line is forced to end with exactly one '\n': without this, a
# final line that lacks a terminator would be glued to another record once the order is
# shuffled. Blank lines carry no sentence pair and are dropped.
with open(input_file, 'r', encoding='utf-8') as file:
    lines = [line.rstrip('\n') + '\n' for line in file if line.strip()]

# Shuffle with a private, seeded generator (the global random state is left untouched).
random.Random(split_seed).shuffle(lines)

# Calculate the number of lines for each split
total_lines = len(lines)
train_end = int(total_lines * train_ratio)
dev_end = train_end + int(total_lines * dev_ratio)

# Split the dataset
train_lines = lines[:train_end]
dev_lines = lines[train_end:dev_end]
test_lines = lines[dev_end:]

# Write each split to its respective file
splits = {
    "train.txt": train_lines,
    "dev.txt": dev_lines,
    "test.txt": test_lines
}

for split_name, split_lines in splits.items():
    with open(os.path.join(output_folder, split_name), 'w', encoding='utf-8') as split_file:
        split_file.writelines(split_lines)

print(f"Dataset split into train, dev, and test sets and saved in '{output_folder}'.")

Dataset split into train, dev, and test sets and saved in 'split_dataset'.


In [None]:
!tail /content/split_dataset/test.txt

Ketua Pengarah WHO Tedros Adhanom di Geneva, Switzerland. Tujuan dana sambutan adalah untuk menyokong kerja WHO untuk mengesan dan memahami penyebaran virus itu; untuk memastikan pesakit mendapatkan penjagaan yang mereka perlukan	كتوا ڤڠاره وهو تدروس اضحىنم د ڬينۏ، سويتزيرلند. توجوان دانا سمبوتن اداله اونتوق مڽوكوڠ كرجا وهو اونتوق مڠسن دان ممهمي ڤڽيبارن ۏيروس ايت; اونتوق ممستيكن ڤساكيت منداڤتكن ڤنجاڬان يڠ مريك ڤرلوكن
Aldeanueva de Ebro merupakan sebuah bandar dan kawasan perbandaran yang terletak di Sepanyol dalam wilayah La Rioja.	الدانوايۏا د ايبرو مروڤاكن سبواه بندر دان كاوسن ڤربندرن يڠ ترلتق د سيڤڽول دالم ولايه لا ريوجا.
Bremondans ialah komun di jabatan Doubs di Franche-Comte di timur Perancis.	بريموندانس اياله كومون د جابتن دواوبس د فرنچي-چومت د تيمور ڤرانچيس.
Silat Kalimah. Kisah ini bermula apabila ilmu seni Silat Kalimah ini diperturunkan oleh Syeikh Abdullah kepada Sultan Kedah yang dikenali dengan nama	سيلت كلمة. قيصه اين برمولا اڤابيلا علمو سني سيلت كلمة اين دڤرتورونكن اولي

**Create training data for tokenizer**
(Use case)

In [None]:
# Gather every sentence (both columns) of all three splits, one sentence per entry,
# as training text for the tokenizer.
split_files = [
    '/content/split_dataset/dev.txt',
    '/content/split_dataset/test.txt',
    '/content/split_dataset/train.txt',
]

res = []
for split_path in split_files:
    # `with` closes each file as soon as it has been read.
    with open(split_path, 'r', encoding='utf-8') as split_file:
        for line in split_file:
            res.extend(field.rstrip('\n') for field in line.split('\t'))

print(len(res))
# Write as UTF-8 explicitly so the Jawi text does not depend on the platform default encoding.
with open('tmp.txt', 'w', encoding='utf-8') as f:
    for d in res:
        f.write("%s\n" % d)

1499594


In [None]:
!tail tmp.txt

yang lain. Pada 8 Disember 2020, Jabatan Perbendaharaan Amerika Syarikat (di bawah Steven Mnuchin ) menyekat universiti itu kerana didakwa merekrut pelajar untuk berperang di Syria . Pihak universiti juga telah menafikan perkara tersebut.
يڠ لاين. ڤد ٨ دسيمبر ٢٠٢٠، جابتن ڤربنداهاراان اميريك شريكت ﴿د باوه ستيۏين منوچين ﴾ مڽكت اونيۏرسيتي ايت كران ددعوا مركروت ڤلاجر اونتوق برڤراڠ د شريا . ڤيهق اونيۏرسيتي جوڬ تله منافيكن ڤركارا ترسبوت.
(111770) 2002 CY152 ialah sebuah asteroid. Asteroid ini merupakan bahagian dari asteroid Troya Musytari, yang terletak di orbit Musytari. Kesipian orbit asteroid ini tercatat sebesar 0.023, sementara magnitud mutlaknya ialah 12.8. Pembentukan. Seperti asteroid secara keseluruhan, asteroid ini terbentuk dari nebula matahari primordial sebagai pecahan planetisimal, sesuatu di nebula matahari muda yang tidak cukup besar untuk berubah menjadi planet.
﴿١١١٧٧٠﴾ ٢٠٠٢ چي١٥٢ اياله سبواه استرواد. استرواد اين مروڤاكن بهاڬين دري استرواد ترويا مشتاري، يڠ ترلتق د اوربيت م

**Training tokenizer**

In [None]:
import sentencepiece as spm

# Train a BPE SentencePiece model with a 64,000-piece vocabulary on tmp.txt.
# --model_prefix names the outputs new_spm_model.model and new_spm_model.vocab;
# --vocabulary_output_piece_score=false writes the .vocab file without a score column.
spm.SentencePieceTrainer.Train("--input=tmp.txt --model_prefix=new_spm_model --vocab_size=64000 --vocabulary_output_piece_score=false --model_type=bpe")

**Download pre-Trained Model**

In [None]:
!wget "https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz"
!tar -zxvf mbart.cc25.v2.tar.gz
!ls mbart.cc25.v2

--2024-12-29 08:45:56--  https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 108.157.254.121, 108.157.254.124, 108.157.254.102, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|108.157.254.121|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5618826847 (5.2G) [application/gzip]
Saving to: ‘mbart.cc25.v2.tar.gz’


2024-12-29 08:46:20 (222 MB/s) - ‘mbart.cc25.v2.tar.gz’ saved [5618826847/5618826847]

mbart.cc25.v2/
mbart.cc25.v2/sentence.bpe.model
mbart.cc25.v2/dict.txt
mbart.cc25.v2/model.pt
dict.txt  model.pt  sentence.bpe.model


**Formatting vocab**

In [None]:
# Entries of the SentencePiece vocab that are left out of the dictionary file.
# NOTE(review): presumably because the fairseq Dictionary defines these symbols itself — confirm.
skipped_pieces = {"<unk>", "<s>", "</s>"}

# Rewrite every remaining piece as a "<piece> 1" line.
with open("new_spm_model.vocab", 'r', encoding='utf-8') as vocab_file:
    edited = [
        f"{piece} 1\n"
        for piece in (raw_line.rstrip('\n') for raw_line in vocab_file)
        if piece not in skipped_pieces
    ]

# UTF-8 is set explicitly: the vocabulary contains Jawi characters.
with open('new_dict.txt', 'w', encoding='utf-8') as f:
    f.writelines(edited)

In [None]:
!tail new_dict.txt

V 1
ث 1
' 1
Y 1
ط 1
ة 1
ذ 1
x 1
Z 1
q 1


In [None]:
!ls

drive	       mbart.cc25.v2.tar.gz  new_spm_model.model  output_dataset.txt  split_dataset
mbart.cc25.v2  new_dict.txt	     new_spm_model.vocab  sample_data	      tmp.txt


**Reduce to create new model**

In [None]:
!mkdir reduced_model
!ls

drive		      new_dict.txt	   output_dataset.txt  split_dataset
mbart.cc25.v2	      new_spm_model.model  reduced_model       tmp.txt
mbart.cc25.v2.tar.gz  new_spm_model.vocab  sample_data


In [None]:
import numpy as np
# Re-create the `np.float` alias before fairseq is imported.
# NOTE(review): presumably fairseq==0.6.2 still refers to this alias, which recent NumPy
# releases no longer provide — confirm.
np.float = float

from fairseq.data import Dictionary
from transformers import (
    MBartForConditionalGeneration, MBartTokenizer, MBartConfig
)
from typing import List
import torch

In [None]:
# Language codes; load_dict appends one "[code]" symbol per entry, in exactly this order.
langs = [
    "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX",
    "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN",
    "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT",
    "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO",
    "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN",
]

def load_dict(langs: List[str], path: str) -> Dictionary:
    """Load a fairseq dictionary file and append the extra symbols.

    Args:
        langs: language codes; each one is added as the symbol "[code]", in list order.
        path: location of the dictionary file handed to Dictionary.load.

    Returns:
        The loaded Dictionary after adding the language symbols, then "<mask>", then "<pad>".
    """
    vocab = Dictionary.load(path)
    extra_symbols = [f"[{code}]" for code in langs] + ["<mask>", "<pad>"]
    for symbol in extra_symbols:
        vocab.add_symbol(symbol)
    return vocab


# Dictionaries for the original mBART vocabulary and for the new vocabulary.
pre_dict = load_dict(langs, "./mbart.cc25.v2/dict.txt")
ft_dict = load_dict(langs, "./new_dict.txt")

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
org_sd = model.state_dict()
resized_sd = model.state_dict()

# mapping[i] is the id, in the original dictionary, of the i-th symbol of the new dictionary.
# NOTE(review): symbols that do not exist in pre_dict presumably resolve to its <unk> id, i.e.
# they start from the <unk> embedding — confirm against fairseq's Dictionary.index.
mapping: List[int] = [pre_dict.index(ft_dict[i]) for i in range(len(ft_dict))]

# Rebuild every tensor that has one row per vocabulary entry.
for name in ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "model.shared.weight", "lm_head.weight"]:
    pre_tensor: torch.Tensor = org_sd[name]
    row_ids = torch.tensor(mapping, dtype=torch.long, device=pre_tensor.device)
    # Gather all reused rows in one call. The result keeps the dtype, device and row width of
    # the pretrained tensor, so the embedding size is no longer hard-coded to 1024.
    resized_sd[name] = pre_tensor.index_select(0, row_ids)
resized_sd["final_logits_bias"] = resized_sd["final_logits_bias"][:, :len(ft_dict)]

# Instantiate a model whose vocabulary size matches the new dictionary and save it.
config = MBartConfig.from_pretrained("facebook/mbart-large-cc25")
config.vocab_size = len(ft_dict)
print(config)
new_model = MBartForConditionalGeneration.from_pretrained(None, config=config, state_dict=resized_sd)
new_model.save_pretrained("./reduced_model")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co./settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

MBartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "MBartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_length": 1024,
  "max_position_embeddings": 1024,
  "model_type": "mbart",
  "normalize_before": true,
  "normalize_embedding": true,
  "num_beams": 5,
  "nu



In [None]:
!ls reduced_model

config.json  generation_config.json  model.safetensors


**Preparation of Tokenizer**

In [None]:
# Save the stock mBART tokenizer files into ./reduced_model; the sentencepiece.bpe.model among
# them is replaced by the newly trained model in the following cell.
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
tokenizer.save_pretrained("./reduced_model")

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

('./reduced_model/tokenizer_config.json',
 './reduced_model/special_tokens_map.json',
 './reduced_model/sentencepiece.bpe.model',
 './reduced_model/added_tokens.json')

In [None]:
!mv ./new_spm_model.model ./reduced_model/sentencepiece.bpe.model

mv: cannot stat './new_spm_model.model': No such file or directory


In [None]:
!ls -al ./reduced_model

total 1643884
drwxr-xr-x 2 root root       4096 Dec 29 08:52 .
drwxr-xr-x 1 root root       4096 Dec 29 08:52 ..
-rw-r--r-- 1 root root       1367 Dec 29 08:50 config.json
-rw-r--r-- 1 root root        200 Dec 29 08:50 generation_config.json
-rw-r--r-- 1 root root 1681867980 Dec 29 08:50 model.safetensors
-rw-r--r-- 1 root root    1431242 Dec 29 08:48 sentencepiece.bpe.model
-rw-r--r-- 1 root root        642 Dec 29 08:52 special_tokens_map.json
-rw-r--r-- 1 root root       5931 Dec 29 08:52 tokenizer_config.json


In [None]:
# Reload both the model and the tokenizer from the ./reduced_model directory.
model = MBartForConditionalGeneration.from_pretrained("./reduced_model")
tokenizer = MBartTokenizer.from_pretrained("./reduced_model")

**Training**

In [None]:
from transformers import (
    Seq2SeqTrainingArguments, Seq2SeqTrainer
)
import numpy as np
import re

result_dir = "/content/drive/MyDrive/Education/Final Year Project I/Output"

In [None]:
# Reload the tokenizer with en_XX as the source-language code (used for the Rumi inputs) and
# ar_AR as the target-language code (used for the Jawi labels), then store a copy in result_dir.
tokenizer = MBartTokenizer.from_pretrained("./reduced_model", src_lang="en_XX", tgt_lang="ar_AR")
tokenizer.save_pretrained(result_dir)

('/content/drive/MyDrive/Education/Final Year Project I/Output/tokenizer_config.json',
 '/content/drive/MyDrive/Education/Final Year Project I/Output/special_tokens_map.json',
 '/content/drive/MyDrive/Education/Final Year Project I/Output/sentencepiece.bpe.model',
 '/content/drive/MyDrive/Education/Final Year Project I/Output/added_tokens.json')

**[Fixed] Data Collator [this one caused zero training loss and evaluation loss]**

*For now, this one works, even though it still returns zero loss.*

In [None]:
def data_collator(features: list, max_source_length: int = 32, max_target_length: int = 48):
    """Turn a list of {"translation": {"my": ..., "jw": ...}} records into one model batch.

    Uses the module-level `tokenizer`.

    Args:
        features: records of the batch; a missing "translation", "my" or "jw" key is read as "".
        max_source_length: length the Rumi ("my") inputs are padded/truncated to.
        max_target_length: length the Jawi ("jw") labels are padded/truncated to.

    Returns:
        The tokenizer output for the inputs, plus a "labels" tensor in which every padding
        position is set to -100.
    """
    x = [f.get("translation", {}).get("my", "") for f in features]
    y = [f.get("translation", {}).get("jw", "") for f in features]

    inputs = tokenizer(x, return_tensors="pt", padding='max_length', truncation=True, max_length=max_source_length)

    # `text_target=` tokenizes in target-language mode; it replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=y,
        return_tensors="pt",
        padding='max_length',
        truncation=True,
        max_length=max_target_length,
    )['input_ids']

    # -100 is the label value the loss ignores. Without this mask the padding that fills each
    # label row up to max_target_length is scored like real text.
    labels[labels == tokenizer.pad_token_id] = -100
    inputs['labels'] = labels

    return inputs

In [None]:
# Sample datasets
features = [
    {'translation': {'my': 'Euro 2020. Dia membuat penampilan sulung pada 2 Jun 2021 dalam perlawanan persahabatan menentang Wales, menggantikan Benjamin Pavard pada separuh masa pertama. Kehidupan', 'jw': 'اييورو ٢٠٢٠. دي ممبوات ڤنمڤيلن سولوڠ ڤد ٢ جون ٢٠٢١ دالم ڤرلاونن ڤرساهابتن مننتڠ ولايس، مڠڬنتيكن بينجامين ڤاۏارد ڤد سيڤروح ماس ڤرتام. كهيدوڤن'}},
    {'translation': {'my': 'Campora merupakan sebuah komun dan bandar yang terletak di Salerno di Campania dalam kawasan Itali.', 'jw': 'چمڤور مروڤاكن سبواه كومون دان بندر يڠ ترلتق د ساليرنو د چمڤنيا دالم كاوسن إيطاليا.'}},
    {'translation': {'my': 'cara mengekalkan makanan (lazimnya ikan) dengan menggunakan garam (atau asam) yang banyak dan mampu untuk menghasilkan pekasam yang mampu bertahan lebih lama', 'jw': 'چارا مڠكلكن ماكنن ﴿لازيمڽ ايكن﴾ دڠن مڠڬوناكن ڬارام ﴿اتاو اسم﴾ يڠ باڽق دان ممڤو اونتوق مڠحاصيلكن ڤيكاسم يڠ ممڤو برتاهن لبيه لاما'}},
    {'translation': {'my': 'dan TV12 ( TV12 Singapura ) di bawah nama STV12 ( Singapore Television Twelve ) mengendalikan pengurus Saluran 12 oleh Perdana 12 dan Premiere 12 dinamakan semula MediaCorp TV12', 'jw': 'دان تۏ١٢ ﴿ تۏ١٢ سيڠاڤورا ﴾ د باوه نام ستۏ١٢ ﴿ سيڠاڤوري تيليۏيسياون تويلۏي ﴾ مڠنداليكن ڤڠوروس سالورن ١٢ اوليه ڤردان ١٢ دان ڤريمياراي ١٢ ديناماكن سمولا ميدياچورڤ تۏ١٢'}},
    {'translation': {'my': 'aras laut..', 'jw': 'ارس لاوت..'}},
    ]

# Pull the Rumi ("my") and Jawi ("jw") sides out of the sample records.
sample_pairs = [f.get("translation", {}) for f in features]
x = [pair.get("my", "") for pair in sample_pairs]
y = [pair.get("jw", "") for pair in sample_pairs]

# Source side: tokenize, then show the full encoding, the ids alone and the first row decoded.
inputs = tokenizer(x, return_tensors="pt", padding='max_length', truncation=True, max_length=32)
source_ids = inputs['input_ids']
print("Tokenized input:", inputs)
print("Tokenized input IDs:", source_ids)
print("Decoded input:", tokenizer.decode(source_ids[0], skip_special_tokens=True))

# Target side: tokenize in target mode and attach the ids as labels.
with tokenizer.as_target_tokenizer():
    target_encoding = tokenizer(y, return_tensors="pt", padding='max_length', truncation=True, max_length=48)
inputs['labels'] = target_encoding['input_ids']
print("Tokenized output:", inputs['labels'])
print("Tokenized output IDs:", inputs['labels'])
print("Decoded output:", tokenizer.decode(inputs['labels'][0], skip_special_tokens=True))


Tokenized input: {'input_ids': tensor([[15090,  8365,  1662,  1715,  5375,  5210,   163,   141,  1917,  9029,
           168,  2556, 13950,  3027,  8487, 63921,  4415, 27630, 21271,  1386,
           163,  5805,  1104,   786, 63916,  5042,     2, 64004,     1,     1,
             1,     1],
        [ 8715,  6450,   221,   229,   712,    59,   879,    62,   299,    33,
         49923,    33, 24635,   168,   469,  2187, 63916,     2, 64004,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [ 3498,  5979,  2695,   156,    13,  5565,   171,  5078, 63953,   184,
          1278, 12570,   156,  6915, 36662, 63953,    62,  1123,    59,  2761,
           205,  2627,  3486,    42,    45,    62,  2761,  9154,   592,  2459,
             2, 64004],
        [   59,  3103,  8464,   156,  3103,  8464,  2763,   909,    33,  1039,
           978,    54,  7242,  8464,   156, 18785, 21665,  9484,    88,  5269,
           909,  8709,  9324

**Prepare for Training**

In [None]:
def load_translation_pairs(path):
    """Read a tab-separated split file into the record layout that data_collator consumes.

    Args:
        path: text file with one "<rumi>\\t<jawi>" pair per line.

    Returns:
        A list of {"translation": {"my": <first column>, "jw": <second column>}} dicts.

    Raises:
        ValueError: if a line has no tab, naming the file and line number.
    """
    pairs = []
    # `with` guarantees the file is closed once it has been read.
    with open(path, "r", encoding='utf-8') as split_file:
        for line_no, line in enumerate(split_file, start=1):
            text = line.rstrip('\n').split('\t')
            if len(text) < 2:
                raise ValueError(f"{path}:{line_no}: expected '<rumi>\\t<jawi>', got {line!r}")
            pairs.append({"translation": {"my": text[0], "jw": text[1]}})
    return pairs


train_data = load_translation_pairs("/content/split_dataset/train.txt")
print(f"train_data size: {len(train_data)}")

eval_data = load_translation_pairs("/content/split_dataset/dev.txt")
print(f"eval_data size: {len(eval_data)}")

train_data size: 599837
eval_data size: 74979


In [None]:
print(train_data[0])

for i in range(5):
    print(train_data[i])

{'translation': {'my': 'adalah pemain bola sepak Jepun. Dia bermain untuk Roasso Kumamoto, Verspah Oita dan Fujieda MYFC.', 'jw': 'اداله ڤماين بولا سيڤق جيڤون. دي برماءين اونتوق رواسسو كومموتو، ۏرسڤه اوءيتا دان فوجيايد مايفچ.'}}
{'translation': {'my': 'adalah pemain bola sepak Jepun. Dia bermain untuk Roasso Kumamoto, Verspah Oita dan Fujieda MYFC.', 'jw': 'اداله ڤماين بولا سيڤق جيڤون. دي برماءين اونتوق رواسسو كومموتو، ۏرسڤه اوءيتا دان فوجيايد مايفچ.'}}
{'translation': {'my': 'markah persembahan pada malam tersebut dengan markah penilaian lagu yang dibuat sebelum persembahan bagi mendapat markah keseluruhan. Bagi yang mendapat', 'jw': 'مركه ڤرسمباهن ڤد مالم ترسبوت دڠن مركه ڤنيلاين لاڬو يڠ دبوات سبلوم ڤرسمباهن باڬي منداڤت مركه كسلوروهن. باڬي يڠ منداڤت'}}
{'translation': {'my': 'Saudari Maxine Khoo, P.J.K. () atau dikenali dengan Lawyer Maxine ialah seorang peguambela dan peguamcara di Malaysia. Beliau juga merupakan seorang usahawan dan ahli politik pada masa yang sama. Beliau juga meru

**Splitting the dataset into four parts**

In [None]:
from math import ceil, isclose

def split_data(data, split_ratio):
    """Cut `data` into consecutive slices, one slice per entry of `split_ratio`.

    Every slice except the last holds ceil(len(data) * ratio) items; the last slice
    receives whatever remains, so no item is lost or duplicated.

    Args:
        data: a sliceable sequence (e.g. a list of records).
        split_ratio: fractions that sum to 1 (compared with a floating-point tolerance).

    Returns:
        A tuple with len(split_ratio) slices of `data`, in order.

    Raises:
        AssertionError: if the ratios do not sum to 1.
    """
    # isclose instead of ==: sums such as 0.1 * 10 are not exactly 1.0 in floating point.
    assert isclose(sum(split_ratio), 1.0), "Split ratios must sum to 1."

    total_data = len(data)
    parts = []
    start = 0
    for ratio in split_ratio[:-1]:
        end = start + ceil(total_data * ratio)
        parts.append(data[start:end])
        start = end
    # The final part takes the remainder rather than a computed size.
    parts.append(data[start:])

    return tuple(parts)

In [None]:
# Cut both the training data and the evaluation data into four chunks of equal ratio.
split_ratio = [0.25, 0.25, 0.25, 0.25]

train_parts = split_data(train_data, split_ratio)
eval_parts = split_data(eval_data, split_ratio)
train_part1, train_part2, train_part3, train_part4 = train_parts
eval_part1, eval_part2, eval_part3, eval_part4 = eval_parts

# Report the size of every chunk: all train chunks first, then all eval chunks.
for prefix, chunks in (("train", train_parts), ("eval", eval_parts)):
    for number, chunk in enumerate(chunks, start=1):
        print(f"{prefix}_part{number} size: {len(chunk)}")

train_part1 size: 149960
train_part2 size: 149960
train_part3 size: 149960
train_part4 size: 149957
eval_part1 size: 18745
eval_part2 size: 18745
eval_part3 size: 18745
eval_part4 size: 18744


In [None]:
def validate_data(data):
    """Check that every entry has a 'translation' mapping containing both 'my' and 'jw'.

    Stops at the first entry that fails the check, prints it and returns False.
    Returns True when all entries pass (including when `data` is empty).
    """
    for record in data:
        is_valid = (
            'translation' in record
            and 'my' in record['translation']
            and 'jw' in record['translation']
        )
        if is_valid:
            continue
        print(f"Invalid entry found: {record}")
        return False
    return True

# Stop here if any training record lacks the expected keys.
if not validate_data(train_data):
    raise ValueError("Training data contains invalid entries.")

In [None]:
validate_data(eval_data)

True

In [None]:
# Hyperparameters
batch_size = 4  # used as both the per-device train and the per-device eval batch size
learning_rate = 3e-5
epochs = 1  # passed to the trainer as num_train_epochs

In [None]:
# Convert the plain Python lists into `datasets.Dataset` objects for the Seq2SeqTrainer.

from datasets import Dataset

# Keep it random, Adam
# Only one of the four chunks is used per run; here it is chunk 1 of each split.
train_dataset = Dataset.from_list(train_part1)  # Change the list based on the dataset part being used
eval_dataset = Dataset.from_list(eval_part1)  # Change the list based on the dataset part being used

In [None]:
# Start from the checkpoint saved in ./reduced_model.
model = MBartForConditionalGeneration.from_pretrained("./reduced_model")

# Training configuration, collected in one place before it is handed to the trainer.
training_options = dict(
    output_dir=result_dir,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    eval_strategy="epoch",
    # The collator reads the raw 'translation' column, so it must not be dropped.
    remove_unused_columns=False,
    save_strategy="steps",
    save_steps=20000,
    save_total_limit=2,
    report_to="none",
)
args = Seq2SeqTrainingArguments(**training_options)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Run training, then write the resulting model to result_dir.
trainer.train()
trainer.save_model(result_dir)



Step,Training Loss,Validation Loss


**Debugging procedure for the Data_Collator**

In [None]:
# Test the data_collator
debug_batch = [train_data[i] for i in range(5)]  # Select a small batch for testing
debug_inputs = data_collator(debug_batch)
# Show only which keys the collated batch contains.
print(debug_inputs.keys())


=== Debug Information ===
Original MY: Euro 2020. Dia membuat penampilan sulung pada 2 Jun 2021 dalam perlawanan persahabatan menentang Wales, menggantikan Benjamin Pavard pada separuh masa pertama. Kehidupan
Tokenized MY: [15090, 8365, 1662, 1715, 5375, 5210, 163, 141, 1917, 9029, 168, 2556, 13950, 3027, 8487, 63921, 4415, 27630, 21271, 1386, 163, 5805, 1104, 786, 63916, 5042, 2, 64004, 1, 1, 1, 1]
Decoded MY: Euro 2020. Dia membuat penampilan sulung pada 2 Jun 2021 dalam perlawanan persahabatan menentang Wales, menggantikan Benjamin Pavard pada separuh masa pertama. Kehidupan
Original JW: اييورو ٢٠٢٠. دي ممبوات ڤنمڤيلن سولوڠ ڤد ٢ جون ٢٠٢١ دالم ڤرلاونن ڤرساهابتن مننتڠ ولايس، مڠڬنتيكن بينجامين ڤاۏارد ڤد سيڤروح ماس ڤرتام. كهيدوڤن
Tokenized JW: [8997, 3012, 63916, 310, 1607, 4955, 4845, 114, 140, 1780, 4190, 151, 2292, 12369, 2995, 8488, 63922, 4392, 27546, 56921, 16939, 114, 5163, 723, 664, 63916, 2820, 2, 64001, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded JW: اييور

**Inference**

Let's perform the inference using the resulting model.

In [None]:
# Model and tokenizer are loaded from one and the same output directory.
inference_dir = "/content/drive/MyDrive/Education/Final Year Project I/Output3"
model = MBartForConditionalGeneration.from_pretrained(inference_dir)
tokenizer = MBartTokenizer.from_pretrained(inference_dir)

In [None]:
sentence = "Boleh saya tau siapa nama awak?"

# Encode the Rumi sentence and make generation start from the "ar_AR" language token.
inputs = tokenizer(sentence, return_tensors="pt")
jawi_start_id = tokenizer.lang_code_to_id["ar_AR"]
translated_tokens = model.generate(
    **inputs,
    decoder_start_token_id=jawi_start_id,
    early_stopping=True,
    max_length=48,
)

# Decode the generated sequence back to text, dropping special tokens.
decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
pred = decoded[0]
print(f"Rumi - {sentence}")
print(f"Jawi - {pred}")

Rumi - Boleh saya tau siapa nama awak?
Jawi - بوليه ساي تاو سياڤا نام اوق


### **Fine Tuning the Model for Better Accuracy**

Preparing the model and tokenizer

In [None]:
# Continue from the model and tokenizer stored in the Output2 directory.
checkpoint_dir = "/content/drive/MyDrive/Education/Final Year Project I/Output2"
model = MBartForConditionalGeneration.from_pretrained(checkpoint_dir)
tokenizer = MBartTokenizer.from_pretrained(checkpoint_dir)

Fine tuning

In [None]:
from transformers import (
    Seq2SeqTrainingArguments, Seq2SeqTrainer
)
import numpy as np
import re

result_dir = "/content/drive/MyDrive/Education/Final Year Project I/Output3"

In [None]:
# Store the tokenizer files in result_dir, the directory the trained model is saved to as well.
tokenizer.save_pretrained(result_dir)

('/content/drive/MyDrive/Education/Final Year Project I/Output3/tokenizer_config.json',
 '/content/drive/MyDrive/Education/Final Year Project I/Output3/special_tokens_map.json',
 '/content/drive/MyDrive/Education/Final Year Project I/Output3/sentencepiece.bpe.model',
 '/content/drive/MyDrive/Education/Final Year Project I/Output3/added_tokens.json')

In [None]:
def data_collator(features: list, max_source_length: int = 32, max_target_length: int = 48):
    """Turn a list of {"translation": {"my": ..., "jw": ...}} records into one model batch.

    Uses the module-level `tokenizer`.

    Args:
        features: records of the batch; a missing "translation", "my" or "jw" key is read as "".
        max_source_length: length the Rumi ("my") inputs are padded/truncated to.
        max_target_length: length the Jawi ("jw") labels are padded/truncated to.

    Returns:
        The tokenizer output for the inputs, plus a "labels" tensor in which every padding
        position is set to -100.
    """
    x = [f.get("translation", {}).get("my", "") for f in features]
    y = [f.get("translation", {}).get("jw", "") for f in features]

    inputs = tokenizer(x, return_tensors="pt", padding='max_length', truncation=True, max_length=max_source_length)

    # `text_target=` tokenizes in target-language mode; it replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=y,
        return_tensors="pt",
        padding='max_length',
        truncation=True,
        max_length=max_target_length,
    )['input_ids']

    # -100 is the label value the loss ignores. Without this mask the padding that fills each
    # label row up to max_target_length is scored like real text.
    labels[labels == tokenizer.pad_token_id] = -100
    inputs['labels'] = labels

    return inputs

In [None]:
# Convert the plain Python lists into `datasets.Dataset` objects for the Seq2SeqTrainer.

from datasets import Dataset

# Keep it random, Adam
# Only one of the four chunks is used per run; here it is chunk 4 of each split.
train_dataset = Dataset.from_list(train_part4)  # Change the list based on the dataset part being used
eval_dataset = Dataset.from_list(eval_part4)  # Change the list based on the dataset part being used

In [None]:
# Hyperparameters
batch_size = 4  # used as both the per-device train and the per-device eval batch size
learning_rate = 3e-5
epochs = 1  # passed to the trainer as num_train_epochs

In [None]:
# Training configuration, collected in one place before it is handed to the trainer.
training_options = dict(
    output_dir=result_dir,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    eval_strategy="epoch",
    # The collator reads the raw 'translation' column, so it must not be dropped.
    remove_unused_columns=False,
    save_strategy="steps",
    save_steps=10000,
    save_total_limit=2,
    report_to="none",
)
args = Seq2SeqTrainingArguments(**training_options)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Run the fine-tuning pass, then write the resulting model to result_dir.
trainer.train()
trainer.save_model(result_dir)



Epoch,Training Loss,Validation Loss




Epoch,Training Loss,Validation Loss
1,1.3704,1.284363


