ariahmed committed · Commit e489264 · verified · 1 Parent(s): 5f93c59

Upload folder using huggingface_hub
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
1
+ name: Run Python script
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout
14
+ uses: actions/checkout@v2
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v2
18
+ with:
19
+ python-version: '3.9'
20
+
21
+ - name: Install Gradio
22
+ run: python -m pip install gradio
23
+
24
+ - name: Log in to Hugging Face
25
+ run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
26
+
27
+ - name: Deploy to Spaces
28
+ run: gradio deploy
.gitignore ADDED
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+ .DS_Store
162
+ train.csv
163
+ test.csv
164
+ Kurd-Spell/
165
+ tokenizer
166
+ sn_project
167
+ notes.md
168
+
169
+
170
+ # Data dir
171
+ data/*
172
+ !data/words.json
173
+ !data/asosoft_benchmark.csv
174
+ !data/Sorani-Arabic.csv
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,84 @@
1
  ---
2
- title: Kurd Spell App
3
- emoji: 📊
4
- colorFrom: purple
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.4.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
1
  ---
2
+ title: kurd-spell-app
3
+ app_file: app.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.4.0
 
 
6
  ---
7
+ # Central Kurdish Neural Spell Corrector
8
+ <p align="center">
9
+ <img src="https://www.razhan.ai/_next/image?url=/static/images/projects/spell-checker.webp&w=1200&q=75" alt="Banner Image" height="240" width="1200">
10
+ <br>
11
+ <a href="https://huggingface.co/razhan/bart-kurd-spell-base">
12
+ [🔥 Best model]
13
+ </a>
14
+ <a href="https://huggingface.co/models?search=bart-kurd-spell">
15
+ [📀 Models]
16
+ </a>
17
+ <a href="https://huggingface.co/spaces/razhan/Kurd-Spell">
18
+ [🤗 Demo]
19
+ </a>
20
+ </p>
21
+
22
+
23
+
24
+
25
+ > **Note:** The documentation for this project is currently being written. I am working hard to make this project easily hackable so people can add new heuristics and train more models.
26
+
27
+ This repository contains a collection of neural spell correctors for the Central Kurdish language. The models are trained on an extensive corpus of synthetically generated data and can correct a wide range of spelling errors, from simple typos to grammatical mistakes.
28
+
29
+
30
+ Using various heuristics, we generate a rich dataset by mapping sequences containing misspellings to the correct sequence. We do this by randomly inserting valid characters, deleting characters or patterns, substituting characters with random ones or their keyboard neighbors, swapping two adjacent characters, shuffling sentences, and replacing specific predefined patterns with targeted alternatives.
31
+
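Below is a minimal, hypothetical sketch of such a distortion function; the repository's actual implementation is `get_text_distorter` in `prepare_data/processors.py`, and the per-character `ratio` here corresponds to the distortion ratio varied in the experiments below. The function name, probability split, and abridged alphabet are illustrative only; keyboard-neighbour substitution and the predefined pattern replacements are omitted for brevity.

```python
import random

# Abridged alphabet; the full list lives in prepare_data/constants.py (KURDISH_CHARS).
KURDISH_CHARS = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوۆیێ"

def distort(text: str, ratio: float = 0.10) -> str:
    """Return a noisy copy of `text`; `ratio` is the per-character distortion probability."""
    out = []
    for ch in text:
        r = random.random()
        if r >= ratio:                      # most characters are left untouched
            out.append(ch)
        elif r < ratio * 0.25:              # deletion
            continue
        elif r < ratio * 0.50:              # random insertion before the character
            out.append(random.choice(KURDISH_CHARS))
            out.append(ch)
        elif r < ratio * 0.75:              # substitution with a random character
            out.append(random.choice(KURDISH_CHARS))
        else:                               # swap with the previous character
            out.insert(max(len(out) - 1, 0), ch)
    return "".join(out)

# Each clean sentence is paired with a distorted copy to form (noisy, clean) training rows:
# pairs = [(distort(s, ratio=0.05), s) for s in sentences]
```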
32
+
33
+
34
+ ## Experiments
35
+ The error injection framework in `prepare_data` offers a method to inject errors according to a distortion ratio. I conducted the following experiments to determine the optimal ratio that allows the model to achieve the lowest Word Error Rate (WER) and Character Error Rate (CER) on the synthetic test set.
36
+ | Model Name | Dataset Distortion| CER | WER |
37
+ |------------------------------------------------------------------|-------------------|-------|--------|
38
+ | [bart-base](razhan/bart-kurd-spell-base-05) | 5% | 5.39% | 34.73% |
39
+ | [bart-base](razhan/bart-kurd-spell-base-05) | 10% | 2.15% | 11.19% |
40
+ | [bart-base](https://huggingface.co/razhan/bart-kurd-spell-base-05_10)| Mixed (5% + 10%)| **1.54%** | **8.31%** |
41
+ | [bart-base](https://huggingface.co/razhan/bart-kurd-spell-base) | 15% | 2.17% | 12.3% |
42
+
43
+
44
+ ## Evaluation on ASOSOFT Spelling Benchmark
45
+ The benchmark from this [project](https://github.com/AsoSoft/Central-Kurdish-Spelling-dataset) is designed exclusively for single-word spelling corrections. The script `create_asosoft_benchmark.py` processes each word of the Amani dataset by searching for sentences that contain the correct spelling, checking that the sentence is not already included in `train.csv`, and replacing the correct word with the provided misspelling. This is a hacky way to obtain a gold-standard benchmark. The current best-performing model achieves the following results:
46
+
47
+ | Metric | Value |
48
+ |----------|--------|
49
+ | CER | 9.6545 |
50
+ | WER | 21.7558|
51
+ | Bleu | 68.1724|
52
+
53
+ ## Evaluation on Sorani Script Normalization Benchmark
54
+ The final generated dataset is also concatenated with the training data from the [Script Normalization for Unconventional Writing](https://github.com/sinaahmadi/ScriptNormalization/tree/main) project. Therefore, the model not only corrects spelling but also normalizes unconventional writing, i.e. text that uses the writing system of one language to write another language.
55
+
56
+ They employ a similar approach to generate their data, but it is unwise to evaluate a model on the synthetic test set, since the model can memorize the underlying patterns from training. Hence they provide a gold-standard benchmark for Central Kurdish and use `Bleu` & `chrF` to measure their model's performance.
57
+
58
+ | Model | Bleu | chrF |
59
+ |-----------------------|-------|-------|
60
+ | Script Normalization | 12.7 | 69.6 |
61
+ | Bart-kurd-spell-base | 13.8 | 73.9 |
62
+
63
+ > Keep in mind that both of these models have seen the same data for script normalization; ours performs slightly better thanks to the additional spell-correction data.
64
+
65
+
66
+ ## Train a New Model
67
+ Since the problem is framed as mapping a sequence containing misspellings to a correct sequence, we can train different encoder-decoder models such as T5.
68
+ 1. Run [`train_tokenizer.py`](train_tokenizer.py) to build a tokenizer for your chosen model with the `--tokenizer_name` argument.
69
+ 2. Create `data.txt` and put it in [`data`](data) dir. Check [`inspect_data.ipynb`](inspect_data.ipynb).
70
+ 3. Check the arguments of [`prepare_data/process_data.py`](prepare_data/process_data.py) and run it to produce `train.csv` and `test.csv`.
71
+ 4. Change the arguments in [`train.sh`](train.sh) if you want to train a model other than BART. If you want to train T5, you also need to add `--source_prefix "correct: "`.
72
+ 5. Evaluate the model on both [`data/asosoft_benchmark.csv`](data/asosoft_benchmark.csv) and [`data/Sorani-Arabic.csv`](data/Sorani-Arabic.csv) using [`eval.sh`](eval.sh); a minimal WER/CER computation is also sketched below.
73
+
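For reference, the WER/CER that `eval.sh` reports through `run_summarization.py` can be approximated directly with the Hugging Face `evaluate` package. This is a sketch rather than part of the repository; it assumes the benchmark's `text` (noisy) / `summary` (reference) columns and the published `razhan/bart-kurd-spell-base` checkpoint.

```python
import evaluate                       # pip install evaluate jiwer
import pandas as pd
from transformers import pipeline

corrector = pipeline("text2text-generation", model="razhan/bart-kurd-spell-base", max_length=1024)
df = pd.read_csv("data/asosoft_benchmark.csv")   # columns: text (noisy), summary (reference)

predictions = [corrector(t)[0]["generated_text"] for t in df["text"]]
references = df["summary"].tolist()

# Both metrics return fractions; multiply by 100 for the percentages reported in the tables.
print("WER:", evaluate.load("wer").compute(predictions=predictions, references=references))
print("CER:", evaluate.load("cer").compute(predictions=predictions, references=references))
```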
74
+ ## Observations
75
+ Different heuristics could be added to the pipeline, for example replacing ر at the start of every word with ڕ, or replacing ك with ک. Both of these patterns occur quite often in Central Kurdish text online, but they can be handled with rules instead of being learned from data, so it is more practical to address them with rule-based tools such as [`KLPT`](https://github.com/sinaahmadi/klpt); a sketch of such a rule follows.
76
+
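As an illustration only (the function name is hypothetical and this snippet is not part of the repository), such rules boil down to a couple of substitutions; KLPT's preprocessing covers normalizations of this kind more thoroughly.

```python
import re

def apply_rule_based_fixes(text: str) -> str:
    # Arabic kaf -> Kurdish keheh, wherever it appears.
    text = text.replace("ك", "ک")
    # Plain reh at the beginning of a word -> Kurdish reh (ڕ).
    return re.sub(r"\bر", "ڕ", text)
```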
77
+ If you can think of more heuristics, they can easily be added to the pipeline in the [`get_text_distorter`](prepare_data/processors.py#L111) function.
78
+
79
+ PRs with additional models, evaluation, or data generation heuristics are welcome! 👍
80
 
81
+ ## References
82
+ - https://arxiv.org/abs/1910.13461
83
+ - https://www.researchsquare.com/article/rs-2974359/v1
84
+ - https://arxiv.org/abs/2305.16407
app.py ADDED
@@ -0,0 +1,79 @@
1
+ import gradio as gr
2
+ from difflib import Differ
3
+ from transformers import pipeline
4
+
5
+ model_id = "razhan/bart-kurd-spell-base"
6
+ # spell_corrector = pipeline("text2text-generation", model=model_id, return_all_scores=True)
7
+ spell_corrector = pipeline("text2text-generation", model=model_id, max_length=1024)
8
+
9
+
10
+
11
+ def correct_spell(text):
12
+ d = Differ()
13
+ if text is None:
14
+ text = ""
15
+ corrected = spell_corrector(text)[0]['generated_text']
16
+
17
+ return [
18
+ (token[2:], token[0] if token[0] != " " else None)
19
+ for token in d.compare(text, corrected)
20
+ ], corrected
21
+
22
+
23
+
24
+ demo = gr.Interface(
25
+ correct_spell,
26
+ [
27
+ gr.Textbox(
28
+ label="Input text",
29
+ info="Initial text to be corrected",
30
+ lines=3,
31
+ value="نوووسینێکی ڕااست بێهەڵە",
32
+ rtl=True
33
+ ),
34
+ ],
35
+ outputs=[
36
+ gr.HighlightedText(
37
+ label="Diff",
38
+ combine_adjacent=True,
39
+ show_legend=True,
40
+ color_map={"-": "pink", "+": "green"},
41
+ rtl=True,
42
+ # container=True,
43
+ elem_id="kurdi"
44
+ ),
45
+ gr.Textbox(label="Corrected Text", rtl=True, container=True)
46
+ ],
47
+ examples=[
48
+ "حکومەتلە گفتوگۆحانی پەرلەماندا لەسەربودجەی نوێ ڕایگەیاند کە لە دەنگدانلەسەر بودجە بەردەوام دەبێت",
49
+ "ژنەڤ کاندغدێکی کورد نەشتەرگەری بۆکەا",
50
+ "فەستبخەرکرانی سێ هاووڵاتی لە شاری بۆکانلە لاین هێزە ئەمنییکەانەوە",
51
+ "ئەم وێنجانەی وخارەوەش چەند ێونەیەکی دەزپێرکاوی مۆبایلەکەن",
52
+ "خۆگزە توانیبام ژیان لە دیداری یەکەی ژاچگرێ بدەم",
53
+ "هەرفەرمانبەرێک بەناشچایستە پلەی نوەزیفیوەرگرتبێتلێیدەسەرنێتەەو",
54
+ "ماوەیەکەدەست ەب ئاامدەکسری کرا٦وە بۆ بەڕێوەچوونی ەششەمین فیستیڤاڵینێودەوڵەتیی هەولێرب ۆ شانۆ",
55
+ "ەڵم ئارەزوومە کە فیلمێک لە سەرحۆریەکانی ێجەریای نێوچیڕۆکەکانیشەوان عەرەبیەوە بەرخهەم بهێنم",
56
+ "پارەی ئەلکتترۆنیکی هیان راوی دیجیتاڵ جۆرە راوێکە کە تەنیا بە شێوەی ئەلیکترۆنیکی لەبەردەستەایە"
57
+
58
+
59
+ ],
60
+ title="Central Kurdish Neural Spell Correction",
61
+ # description="This is made as a fun side project, it's not to be relied on for production.",
62
+ css="""
63
+ #kurdi {
64
+ text-align: right;
65
+ }
66
+ """,
67
+ theme=gr.themes.Base(
68
+ primary_hue="pink",
69
+ secondary_hue="stone",
70
+ text_size=gr.themes.sizes.text_lg,
71
+ spacing_size=gr.themes.sizes.spacing_lg,
72
+ radius_size=gr.themes.sizes.radius_lg,
73
+ font=gr.themes.GoogleFont("Noto Sans"),
74
+
75
+ ),
76
+ allow_flagging='auto'
77
+ )
78
+ if __name__ == "__main__":
79
+ demo.launch()
ckb_helpers.py ADDED
@@ -0,0 +1,455 @@
1
+ import re
2
+ from klpt.preprocess import Preprocess
3
+ from klpt.tokenize import Tokenize
4
+
5
+ import unicodedata
6
+
7
+ preprocessor_ckb = Preprocess("Sorani", "Arabic", numeral="Arabic")
8
+ tokenizer_ckb = Tokenize("Sorani", "Arabic")
9
+
10
+
11
+ unify_numbers = {
12
+ "٠|۰": "0",
13
+ "١|۱": "1",
14
+ "٢|۲": "2",
15
+ "٣|۳": "3",
16
+ "٤|۴": "4",
17
+ "٥|۵": "5",
18
+ "٦|۶": "6",
19
+ "٧|۷": "7",
20
+ "٨|۸": "8",
21
+ "٩|۹": "9"
22
+ }
23
+
24
+ # Taken from AsoSoft library
25
+ def number_to_word(text):
26
+ # convert numbers to latin
27
+ for k, v in unify_numbers.items():
28
+ text = re.sub(k, v, text)
29
+
30
+ text = re.sub(r"([0-9]{1,3})[,،](?=[0-9]{3})", r"\1", text); # remove thousand separator 12,345,678 => 12345678
31
+ text = re.sub(r"(?<![0-9])-([0-9]+)", r"ناقس \1", text); # negative
32
+ text = text.replace("٪", "%") # Replace arabic percent sign with latin
33
+ text = re.sub(r"(?<![0-9])% ?([0-9]+)", r"لە سەددا \1", text); # percent sign before
34
+ text = re.sub(r"([0-9]+) ?%", r"\1 لە سەد", text); # percent sign after
35
+ text = re.sub(r"\$ ?([0-9]+(\.[0-9]+)?)", r"\1 دۆلار", text) # $ currency
36
+ text = re.sub(r"£ ?([0-9]+(\.[0-9]+)?)", r"\1 پاوەن", text) # £ currency
37
+ text = re.sub(r"€ ?([0-9]+(\.[0-9]+)?)", r"\1 یۆرۆ", text) # € currency
38
+
39
+ # convert float numbers
40
+ text = re.sub(r"([0-9]+)\.([0-9]+)", lambda x: float_name(x.group(1), x.group(2)), text)
41
+
42
+ # convert remaining integer numbers
43
+ text = re.sub(r"([0-9]+)", lambda match: integer_name(match.group(1)), text)
44
+
45
+ return text
46
+
47
+ def float_name(integerPart, decimalPart):
48
+ zeros = re.search("^0+", decimalPart)
49
+ point = " پۆینت "
50
+ if(zeros):
51
+ point = point + re.sub("0", " سفر ", zeros[0])
52
+ return integer_name(integerPart) + point + integer_name(decimalPart)
53
+
54
+ ones = ["", "یەک", "دوو", "سێ", "چوار", "پێنج", "شەش", "حەوت", "هەشت", "نۆ"]
55
+ teens = [ "دە", "یازدە", "دوازدە", "سێزدە", "چواردە", "پازدە", "شازدە", "حەڤدە", "هەژدە", "نۆزدە" ]
56
+ tens = [ "", "", "بیست", "سی", "چل", "پەنجا", "شەست", "هەفتا", "هەشتا", "نەوەد"]
57
+ hundreds = ["", "سەد", "دووسەد", "سێسەد", "چوارسەد", "پێنسەد", "شەشسەد", "حەوتسەد", "هەشتسەد", "نۆسەد"]
58
+ thousands = ["", " هەزار", " ملیۆن", " ملیار", " ترلیۆن", " کوادرلیۆن", " کوینتلیۆن"]
59
+
60
+ def integer_name(inputInteger):
61
+ output = ""
62
+ if (inputInteger != "0"):
63
+ temp = inputInteger
64
+ for i in range(0, len(inputInteger), 3):
65
+ matched_numbers = re.findall(r"[0-9]{1,3}$", temp)
66
+ currentThree = matched_numbers[0] if matched_numbers else ""
67
+
68
+ temp = temp[:len(temp) - len(currentThree)]
69
+ currentThree = currentThree.rjust(3, '0')
70
+ C = int(currentThree[0])
71
+ X = int(currentThree[1])
72
+ I = int(currentThree[2])
73
+ conjunction1 = " و " if (C != 0) and (X != 0 or I != 0) else ""
74
+ conjunction2 = " و " if X != 0 and I != 0 else ""
75
+
76
+ if (X == 1):
77
+ currentThree = hundreds[C] + conjunction1 + teens[I]
78
+ else:
79
+ currentThree = hundreds[C] + conjunction1 + tens[X] + conjunction2 + ones[I]
80
+
81
+ currentThree += "" if currentThree == "" else thousands[i // 3]
82
+
83
+ conjunction3 = "" if output == "" else " و "
84
+ if (currentThree != ""):
85
+ output = currentThree + conjunction3 + output
86
+ output = output.replace("یەک هەزار", "هەزار")
87
+ else: # if input number = 0
88
+ output = "سفر"
89
+ return output
90
+
91
+
92
+
93
+
94
+ def replace_words_in_corpus(sentence):
95
+ modified_corpus = []
96
+
97
+ words = sentence.split()
98
+ modified_words = []
99
+
100
+ for word in words:
101
+ if word in word_replacements:
102
+ modified_words.append(word_replacements[word])
103
+ else:
104
+ modified_words.append(word)
105
+
106
+ modified_sentence = " ".join(modified_words)
107
+
108
+ return modified_sentence
109
+
110
+ # put this in a json file
111
+ word_replacements = {
112
+ "ههڵاڵەەي": "هەڵاڵەی",
113
+ "وهەمهەمه": "وهەمهەمه",
114
+ "ئهباتههوه": "ئەباتەوە",
115
+ "بەخءرایی": "بەخێرایی",
116
+ "ئیثانۆڵ": "ئیسانۆڵ",
117
+ "عەبدوڵڵاهـ": "عەبدوڵڵا",
118
+ "کولاهـ": "کولاه",
119
+ "ئاھ": "ئاه",
120
+ }
121
+
122
+
123
+ char_replacements = {
124
+ '\u200e': '',
125
+ '\u200f': '',
126
+ '\u200c': '',
127
+ 'õ': '',
128
+ 'ھ': 'ه'
129
+ }
130
+ def apply_char_replacements(text: str):
131
+
132
+ for old, new in char_replacements.items():
133
+ text = text.replace(old, new)
134
+ return text
135
+
136
+
137
+ def remove_arabic_alphabets(text: str):
138
+ """
139
+ Removes ``Arabic`` words and digits from a ``text``
140
+
141
+ Args:
142
+ text (str): Sorani text
143
+ Returns:
144
+ str: ``str`` object with arabic alphabets removed
145
+ """
146
+ characters = "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْٰٱ"
147
+ table = str.maketrans({key: None for key in characters})
148
+ return text.translate(table)
149
+
150
+
151
+
152
+ def filtered_arabic_characters():
153
+ kurdish_characters = set("ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ")
154
+ arabic_characters = set("ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْٰٱ")
155
+
156
+ # Create a new set of Arabic characters without the Kurdish characters
157
+ filtered_arabic_characters = arabic_characters - kurdish_characters
158
+
159
+ return ''.join(filtered_arabic_characters)
160
+
161
+
162
+
163
+ def is_arabic_string(text):
164
+ """Returns True if the text contains any Arabic characters, False otherwise."""
165
+ # arabic_characters = set("ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْٰٱ")
166
+ arabic_characters = filtered_arabic_characters()
167
+ for ch in text:
168
+ if ch in arabic_characters:
169
+ return True
170
+ return False
171
+
172
+ def contains_arabic(text):
173
+ arabic_characters = filtered_arabic_characters()
174
+ return any(char in arabic_characters for char in text)
175
+
176
+
177
+ def is_english_string(text):
178
+ """Returns True if the text contains only English characters, False otherwise."""
179
+ english_pattern = re.compile(r'[a-zA-Z]')
180
+ return bool(english_pattern.search(text))
181
+
182
+
183
+ def remove_english_alphabets(text: str):
184
+ """
185
+ Removes ``English`` words and digits from a ``text``
186
+ """
187
+ characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
188
+ table = str.maketrans({key: None for key in characters})
189
+ return text.translate(table)
190
+
191
+
192
+
193
+
194
+ def resolve_ae(text):
195
+ """
196
+ This function takes a text input in Central Kurdish (Sorani) script and performs a series of character replacements
197
+ to standardize variations in the script. Specifically, it addresses cases where the character 'ە' (Arabic letter
198
+ AE) may be used in different contexts.
199
+ """
200
+ # First replace all occurrences of 'ه' with 'ە'
201
+ text = re.sub("ه", "ە", text)
202
+ # Replace specific combinations with 'ها', 'هێ', and 'ه'
203
+ text = re.sub("ەا", "ها", text) # Replace ەا with ها
204
+ text = re.sub("ەێ", "هێ", text) # Replace ەێ with هێ
205
+ text = re.sub("ەۆ", "هۆ", text) # Replace ەۆ with هۆ
206
+
207
+ # Replace ە (AE) at the beginning of a word with ه (HEH)
208
+ text = re.sub(r"\b(ە\w*)", lambda match: "ه" + match.group(1)[1:], text)
209
+
210
+ # Replace ALEF+AE with ALEF+HEH
211
+ text = re.sub("اە", "اه", text)
212
+
213
+ # Special words should go here before the replacement of 'ە' at the end of the word
214
+ # Special case: گەهـ or گاهـ but without the tatweel since tatweel is not a phoneme in Kurdish and it will be a class for the model
215
+ text = re.sub(r'\bگەە[-ـ]?\b', "گەه", text)
216
+
217
+ # Replace 'ەە' at the beginning and end with 'هە'
218
+ text = re.sub(r"\bەە|ەە\b", "هە", text)
219
+
220
+ # Special case if two AEs come before ۆ it should be replaced with AE+HEH
221
+ text = re.sub(r"ەە(?=ۆ)", "ەه", text)
222
+
223
+ # Special case if two AEs come after either و or ب or ئ or ڕ or ق or ز they should be replaced with AE+HEH
224
+ text = re.sub(r"(?<=\b[بوئڕقزژ])ەە", "ەه", text)
225
+ # The following special case should happen after the previous special case and before the following special case
226
+ # Special case when two words are joined with waw and the AEs after the waw become HEH+AE
227
+ text = re.sub(r'(?<=و)ەە(?=\w)', "هە", text)
228
+
229
+ # Replace Three AEs with AE+HEH+AE (This has to be run before the following special case so words like لەهەوادا will not be ruined)
230
+ text = re.sub(r"(?<=\w)ەەە(?=\w)", "ەهە", text)
231
+
232
+ # Special case if two AEs are in the middle of a word and come before YEH ی or TCHEH چ or و they will be replaced with AE+HEH if the YEH or TCHEH are not at the END of the word
233
+ text = re.sub(r"(?<=\w)ەە(?=[چیو]\B)", "ەه", text)
234
+
235
+ # Replace 'ەە'AE+AE in the middle of a word with HEH+AE
236
+ text = re.sub(r"(?<=\w)ەە(?=\w)", "هە", text)
237
+
238
+ # Replace two AE with spaces in between with AE HEH
239
+ text = re.sub("ە ە", "ە ه", text)
240
+
241
+ # Replace all HEH DOACHASHMEE with HEH
242
+ # text = text.replace('ھ', 'ە')
243
+ return text
244
+
245
+ clean_punctuation = re.compile(r"(?<!\d)[.,;:'?!\/](?!\d)")
246
+ def remove_punctuation(text):
247
+ """Remove all punctuation from string, except if it's between digits"""
248
+ return clean_punctuation.sub("", text)
249
+
250
+
251
+ def extract_punctuation(text):
252
+ # Initialize an empty string to store the extracted punctuation
253
+ extracted_punctuation = ""
254
+
255
+ # Iterate through each character in the input text
256
+ for char in text:
257
+ # Check if the character is categorized as punctuation
258
+ if unicodedata.category(char).startswith('P'):
259
+ extracted_punctuation += char # Add it to the result
260
+
261
+ return set(extracted_punctuation)
262
+
263
+
264
+
265
+ ARABIC_PUCTUATIONS = "،؛۔٫٪؟"
266
+ CKB_PUNCTUATIONS = "!.:;?،؛؟«»" + ARABIC_PUCTUATIONS
267
+ KURDISH_CHARS = set(f"{CKB_PUNCTUATIONS}ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ")
268
+
269
+ def contains_non_kurdish_characters(text):
270
+ # kurdish_characters = set("ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ")
271
+ kurdish_characters = set(f"{CKB_PUNCTUATIONS}ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ")
272
+ non_kurdish_chars = set(text) - kurdish_characters
273
+
274
+ return len(non_kurdish_chars) > 0
275
+
276
+
277
+ def keep_kurdish_characters(text):
278
+ kurdish_characters = set(f"{CKB_PUNCTUATIONS}ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ")
279
+
280
+ cleaned_text = ''.join(char for char in text if char in kurdish_characters)
281
+ return cleaned_text
282
+
283
+
284
+
285
+ def remove_emojis(text):
286
+ emoji_pattern = re.compile("["
287
+ "\U0001F600-\U0001F64F" # Emoticons
288
+ "\U0001F300-\U0001F5FF" # Symbols & Pictographs
289
+ "\U0001F680-\U0001F6FF" # Transport & Map Symbols
290
+ "\U0001F700-\U0001F77F" # Alchemical Symbols
291
+ "\U0001F780-\U0001F7FF" # Geometric Shapes Extended
292
+ "\U0001F800-\U0001F8FF" # Supplemental Arrows-C
293
+ "\U0001F900-\U0001F9FF" # Supplemental Symbols and Pictographs
294
+ "\U0001FA00-\U0001FA6F" # Chess Symbols
295
+ "\U0001FA70-\U0001FAFF" # Symbols and Pictographs Extended-A
296
+ "\U00002702-\U000027B0" # Dingbats
297
+ "]+", flags=re.UNICODE)
298
+ return emoji_pattern.sub(r'', text)
299
+
300
+
301
+ def remove_language_families(text):
302
+ patterns = [
303
+ "[\u1100-\u11FF\u2E80-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF]+", # Asian scripts
304
+ "[\u0000-\u024F]+", # Basic Latin and Latin-1 Supplement
305
+ "[\u0400-\u04FF]+", # Cyrillic
306
+ "[\u0370-\u03FF]+", # Greek
307
+ "[\u0900-\u097F]+", # Devanagari
308
+ r"\u0B80-\u0BFF", # Tamil
309
+ r"\u4E00-\u9FFF", # Han
310
+ r"\u10A0-\u10FF", # Georgian
311
+ r"\u0C80-\u0CFF" # Kannada
312
+ ]
313
+
314
+ combined_pattern = re.compile("|".join(patterns))
315
+
316
+ cleaned_text = combined_pattern.sub(r'', text)
317
+ return cleaned_text
318
+
319
+
320
+ clean_punctuation = re.compile(r"(?<!\d)[.,;:'?!،.؟؛:](?!\d)")
321
+ def remove_punctuation(text):
322
+ """Remove all punctuation from string, except if it's between digits"""
323
+ return clean_punctuation.sub("", text)
324
+
325
+ def contains_repeated_ngram(window, n):
326
+ ngrams = generate_ngrams(window, n)
327
+ ngram_set = set(ngrams)
328
+ return len(ngrams) != len(ngram_set)
329
+
330
+
331
+ def generate_ngrams(text, n):
332
+ words = text.split()
333
+ output = []
334
+ for i in range(len(words)- n+1):
335
+ output.append(tuple(words[i:i+n]))
336
+ return output
337
+
338
+ def remove_repeated_ngram(text, n):
339
+ words = text.split()
340
+ output = []
341
+ for i in range(len(words)- n+1):
342
+ if not contains_repeated_ngram(" ".join(words[i:i+n]), n):
343
+ output.append(words[i])
344
+ return " ".join(output)
345
+
346
+ def normalize_punctuations(text: str) -> str:
347
+ # Replace , with ،
348
+ text = text.replace(',', '،')
349
+ # Replace ? with ؟
350
+ text = text.replace('?', '؟')
351
+ # Replace two or three of the same punctuation marks with a single one
352
+ text = re.sub(r'([.,;:?!،؛؟])\1{1,2}', r'\1', text)
353
+
354
+
355
+ # Replace double opening and closing parentheses with guillemets
356
+ text = re.sub(r'\(\(', '«', text)
357
+ text = re.sub(r'\)\)', '»', text)
358
+
359
+ # Normalize space around the guillemets and other punctuation marks
360
+ text = re.sub(r'\s*«\s*', ' «', text)
361
+ text = re.sub(r'\s*»\s*', '» ', text)
362
+
363
+ # Additional punctuation normalization
364
+ text = re.sub(r'\s*([,،؟])\s*', r'\1 ', text)
365
+
366
+ # Ensure there is no space before a guillemet at the beginning of the text or after a
367
+ # guillemet at the end of the text
368
+ text = re.sub(r'^\s*«', '«', text)
369
+ text = re.sub(r'»\s*$', '»', text)
370
+
371
+ # If multiple punctuation marks come after each other only keep the first one
372
+ # text = re.sub(r'([.!?؟،؛])\1+', r'\1', text)
373
+
374
+ # if the same punctuation mark is repeated, only keep the first one
375
+ text = re.sub(r'([.!?؟،؛])\1+', r'\1', text)
376
+
377
+ # if punctuation marks come after each other with space between them like: ? ? ? keep the first one remove the rest
378
+ text = re.sub(r'([.!?؟،؛])\s\1+', r'\1', text)
379
+ # Trim leading and trailing spaces and return the normalized text
380
+ text = text.strip()
381
+ return text
382
+
383
+
384
+ def fix_sentence(sentence):
385
+
386
+ if sentence.startswith('"') and sentence.endswith('"'):
387
+ # we can remove trailing quotation marks as they do not affect the sentence
388
+ sentence = sentence[1:-1]
389
+
390
+ if sentence[-1] not in [".", "?", "!"]:
391
+ # append a full-stop to sentences that do not end in punctuation
392
+ sentence = sentence + "."
393
+ # sentence = sentence[:-1].translate(str.maketrans('', '', string.punctuation)) + sentence[-1]
394
+ return sentence
395
+
396
+
397
+ def add_period_abbreviations(text):
398
+
399
+ abbreviations = set(["پ", "د"]) # Add more abbreviations as needed
400
+
401
+ # Define a regular expression pattern to match a letter followed by a space and then a word character
402
+ pattern = re.compile(r'([{}]) (?=\w)'.format(''.join(abbreviations)))
403
+
404
+ # Use regex to add periods after the specified abbreviations with a space after the period
405
+ text = pattern.sub(r'\1. ', text)
406
+
407
+ # Add periods after each letter if "د" and "خ" appear together
408
+ text = re.sub(r'د\sخ|خ ?د|د\.?خ|خ\.?د', 'د. خ.', text)
409
+
410
+ # Abbreviated dates
411
+ # text = re.sub(r'\b(پ\. ز)\b', r'\1.', text)
412
+
413
+ return text
414
+
415
+
416
+ def process_text(text):
417
+ # text = replace_words_in_corpus(text)
418
+ text = resolve_ae(text)
419
+ # text = number_to_word(text)
420
+ text = preprocessor_ckb.preprocess(text)
421
+ # text = normalizer(text).strip()
422
+ text = remove_emojis(text)
423
+ text = normalize_punctuations(text)
424
+ text = fix_sentence(text)
425
+ text = apply_char_replacements(text)
426
+ return text
427
+
428
+ if __name__ == "__main__":
429
+ # text = "لە ساڵی 1999دا بڕی 40% لە پارەکەیان واتە $102.1 یان وەرگرت. 'õ'\u200c\u200f\u200e'ھ'"
430
+
431
+ # print(process_text(text))
432
+ # print(contains_non_kurdish_characters(text))
433
+ # text = "دەقی«کوردی » و ڕێنووس ،((خاڵبەندی )) چۆنە ؟"
434
+ # correct = "دەقی «کوردی» و ڕێنووس، «خاڵبەندی» چۆنە؟"
435
+ # print("Before punctuation normalization:", text)
436
+ # print("After punctuation normalization:", normalize_punctuations(text))
437
+ # print("Correct:\t\t\t", correct)
438
+ # print(normalize_punctuations(text) == correct)
439
+ # print(normalize_punctuations("ڕەوا بورهان 4 تەمموز ، کوردستانی سلێمانی?!!"))
440
+ # print(normalize_punctuations("یانەی کوردژین تکایە چۆن بە شی سە ڕە کی و لاوە کی بۆ مالپە ڕە کە م زیاد بکە م؟؟ ؟ ؟ لە سکرێپە یتی ژومیلە"))
441
+ # with open('data/data.ckb.txt', 'r', encoding='utf-8') as src_file:
442
+ # source_data = src_file.read()
443
+
444
+ # unified_data = normalize_punctuations(source_data)
445
+
446
+ # # Save the unified data to a new file
447
+ # with open('data/unified_data.txt', 'w', encoding='utf-8') as file:
448
+ # file.writelines(unified_data)
449
+
450
+ # print("Unified data saved to unified_data.txt")
451
+
452
+ text = "Hello ((Friend)) Hello , Friend World"
453
+ # print(remove_repeated_ngram(text, 2))
454
+ # print(remove_repeated_ngrams(text, ))
455
+ print(process_text(text))
create_asosoft_benchmark.py ADDED
@@ -0,0 +1,31 @@
1
+ import pandas as pd
2
+ from tqdm import tqdm
3
+ from ckb_helpers import *
4
+ df = pd.read_csv('data/asotest.csv')
5
+
6
+ data_df = pd.read_csv('data/data.txt', names=['text'])
7
+ train_df = pd.read_csv('train.csv')
8
+
9
+
10
+ data = []
11
+ pbar = tqdm(df.itertuples(), total=len(df))
12
+
13
+ for row in pbar:
14
+ incorrect_word = row.text
15
+ correct_word = row.summary
16
+
17
+ # look up sentences from data_df that contain correct_word and keep only those rows that are not in train_df
18
+ sentences = data_df[data_df['text'].str.contains(correct_word, case=False, na=False)]
19
+ sentences = sentences[~sentences.text.isin(train_df.summary)]
20
+
21
+ pbar.set_description(f"Rows found after cross checking train data: {len(sentences)} for {correct_word}")
22
+ for r in sentences.head(1).itertuples():
23
+ new_sentence = r.text.replace(correct_word, incorrect_word)
24
+ data.append({"text": new_sentence, "summary": process_text(r.text)})
25
+ # drop that row so the final dataset doesn't include the same sentence for two incorrect words
26
+ data_df.drop(index=r.Index, axis=0, inplace=True)
27
+
28
+
29
+
30
+ df = pd.DataFrame(data)
31
+ df.to_csv('asosoft_spell.csv', index=False)
data/Sorani-Arabic.csv ADDED
@@ -0,0 +1,101 @@
1
+ text,summary
2
+ عةمرت نةمينئ نةت توانى يةك بزمار بيبةى,عەمرت نەمینێ نەتوانی یەک بزمار ببەی.
3
+ بةرهةمةكاني ئةفين ئاسؤ,بەرهەمەکانی ئەڤین ئاسۆ
4
+ دانا بويته فليمي كجةن,دانا بووەتە فیلمی کچان.
5
+ به و برچه الی هاوریان,بەو پرچە ئەلێی هاوڕێیان
6
+ ئاخر خواية تاواني ئةم منالة نةكبةتة جية,ئاخر خودایە تاوانی ئەم مناڵە نەگبەتە چییە؟
7
+ به رنامه يه ك بودروست كردني بي ئابروي,بەرنامەیەک بۆ دروستکردنی بێئابڕویی
8
+ سةيري پيكةنينةكةي سونيا,سەیری پێکەنینەکەی سۆنیا
9
+ ئةبي كي كچ بدات بةوانة,ئەبێ کێ کچ بدات بەوانە؟
10
+ هةريم قةرةناوي,هەرێم قەرەناوی
11
+ خه تاي دانا بوو بو ئالان ياري بكردايه له كه ل ي ئه ي برده وه,خەتای دانا بوو، بۆ ئاڵان یاریی بکردایە لەگەڵی ئەی بردەوە
12
+ هه ى حه مرى خؤتو كه ناكه تو ميوانه كانت نه مينئ,هەی عەمری خۆت و کەناڵەکەت و میوانەکانت نەمێنێت
13
+ جوئن مه دن واعیساب که ن سئ کچ به رنامه پئشکه ش ده که ن,جوێن مەدەن، وا حیساب بکەن ٣ کچ بەرنامە پێشکەش دەکەن
14
+ بیره زن بلی توچیت دایه له میکیاج,پیرەژن، بلێ تۆ چیت داوە لە مکیاج!
15
+ جا توپه بةفرى ويكةت با باشتر بوو,جا تۆپەڵە بەفری پێکەتبا باشتر بوو
16
+ هةموو نةوعة كةوتنيكي تاقي كردةوة,هەموو نەوعە کەوتنێکی تاقی کردەوە
17
+ به س بيم بلين جواني ئه م مه يمونه له كويايه,بەس پێم بڵێن جوانیی ئەم مەیمونە لە کوێدایە؟
18
+ بؤ كؤمنته كان ناخوينيه وه,بۆ کۆمێنتەکان ناخوێنیتەوە؟
19
+ چند ناخؤشه تةريق بيتةوه,چەند ناخۆشە تەریق بیتەوە
20
+ دوو مه يموني هيناوه به ده مي يه ك پيده كه نن,دوو مەیموونی هێناوە بەدەمی یەکتر پێئەکەنن
21
+ باخوا يارا شئرا شئررررر,بە خوا یارا شێرە شێر.
22
+ ده ي رسقي ئه مانيش ببرن,دەی ڕزقی ئەمانیش ببڕن
23
+ فقيرا ملي شكا,فەقیرە ملی شکا
24
+ خوزكه جونكيله ش بان,خۆزگە جوانکیلەش بان
25
+ وةلاي تةسميل مةوة ريال كةي بةشةرة بةس بةرشة بةشةرة,وەلاهی تەسمیل مەبە، ڕیاڵ کەی بەشەرە، بەس بەرشە بەشەرە.
26
+ شەربت هەنار اخوی,شەربەت هەنار ئەخۆی؟
27
+ ده ست جاوت خوشيبي,دەست و چاوت خۆش بێت
28
+ ئه م حه مه يسك قورسه به جي وا زه عيف بوه,ئەم حەمەیە ئێسک قورسە، بەچی وا زەعیف بووە؟
29
+ يةعني ئاوي تةماتةك ئةوةي دةوي,یەعنی ئاوی تەماتەک ئەوەی دەوێ
30
+ كومپانباي دزيني ئوتومبيل يش په يدابوو,کۆمپانیای دزینی ئۆتۆمبێلیش پەیدا بوو
31
+ صةلاح بالابةرز,سەلاح باڵابەرز
32
+ له و بيكه نه نه ي ئه لي ته قه ده كا,لەو پێکەنینەی! ئەلێی تەقە دەکات
33
+ خوا که سیک رسوا بکه وه ک ئه مه ی لی ئه کا,خودا کەسێک ڕیسوا بکات، وەک ئەمەی لێ ئەکات
34
+ كوناح نيه سازان ميرديك كه جه لى هه بيت,گوناح نییە سازان مێردێکی کەچەڵی هەبێت
35
+ هةرچي شيتو پاتالة لةم نيتة تةرةماشةية كةسيكي عاقلمان نةبيني,هەرچی شێت و پاتاڵە لەم نێتە تەڕەماشەیە، کەسێکی ئاقڵمان نەبینی
36
+ سازان خوي كردوتة عةنكةبوت جامانةى لةسةره,سازان خۆی کردۆتە عەنکەبووت، جامانەی لەسەرە.
37
+ كوره كان بس نازانن ياري بكه ن,کوڕەکان بەس نازانن یاری بکەن
38
+ حاجي سةيفةديت زور بيژي ديارة,حاجی سەیفەدین ، زۆر بیژی دیارە
39
+ جند بةبي يشي قسةت كرد,چەند بەبێ ئیشی قسەت کرد.
40
+ له ئيستاوه ئزانم كي ايباته وه ديارة نيگار يه كمة,لە ئێستاوە ئەزانم کێ ئەیباتەوە، دیارە نیگار یەکەمە
41
+ ئؤنده به فره ى بي نه كه ت ئؤنده به دارو ديوار كه ت,ئەوەندە بەفرەی پێنەکەوت، ئەوەندەی بە دارودیوار کەوت
42
+ سرنجي بؤق راكيشي جونكه هه رله بؤق اجيت,سەرنجی بۆق ڕائەکێشیت، چونکە هەر لە بۆق ئەچیت.
43
+ لاي هه ديك بياو ا��افره تانه به جاك ده زانن,لای هەندێک پیاو، ئەو ئافرەتانە بە چاک دەزانن
44
+ لاندكرؤز كو دةدزری،،،،,لاندگرۆزەر کو دەدزرێ؟
45
+ هة ناره يه كة م دابي,هەنارە یەکەم دەبی
46
+ بة قسةى من دةكةى هةسته بةخوت بروة مالى با دةرت نةكةن,بەقسەی من دەکەی، هەستە بۆ خۆت بڕۆوە ماڵێ، با دەرت نەکەن
47
+ ئةو كچة رزاي زور قورسة,ئەو کچە ڕەزای زۆر قوڕسە
48
+ جاخؤدوتؤپةلى تئ گرن باشترة,جا خۆ دوو تۆپەڵەی تێگرن باشترە
49
+ روى باوكى ئه وه ره شبيت (ته مارا)ى كرد به بيشكه شكار,ڕووی باوکی ئەوە ڕەش بێت (تەمارا) ی کردە پێشکەشکار.
50
+ له پيرلؤ خوئيريتر تؤيت,لە پێڕلۆ خوێڕیتر تۆیت.
51
+ تخوا ئةمة شتة سةيري دةكةن,تخوا ئەمە شتە سەیری دەکەن
52
+ لة داخي دواني وا حةزةكةم ئةم ولاتة جئ بيلم,لەداخی دووانی وا، حەز ئەکەم ئەم وڵاتە جێبێلم
53
+ ئينشةلا كوراكاو دهبةنةوة,ئینشائەڵڵا کوڕەکان دەبەنەوە
54
+ ام كجه بيويسته ببريته نه خؤشخانه ى ده رونى,ئەم کچە پێویستە ببرێتە نەخۆشخانەی دەروونی
55
+ ئه ى بو ئاسايشى هه وليئر دزه گه وره كان ده ستگير نا كريئن.,ئەی بۆ ئاساییشی هەولێر دزەگەورەکان دەزگیر ناکەن؟
56
+ هةر ماعدة مابوو تداخلى بكةن,هەر ماعیدە مابوو تەداخولی بکەن
57
+ به خواتابليي به رنامه يه كي هيجه جاجلوبه ركي ناشيرين,بەخوا تابڵێی بەرنامەیەکی هیچە، جا جلوبەرگی ناشرین
58
+ وةالله شتةكم لة دةست بواية ريك كةنالةكةم دادةخست,وەڵڵا شتەکم لەدەست بوایە ڕێک کەناڵەکەم دادەخست.
59
+ توخوا اوة بةرنامةية,تخوا ئەوە بەرنامەیە؟
60
+ تةنانةت ليرةش غةدرتان لة كةركوك كرد عةمرتان نةمينى,تەنانەت لێرەش غەدرتان لە کەرکوک کرد، عەمرتان نەمینێ.
61
+ بروا بكةن ئةمن بةس تةماشاي ريكارم كرد,بڕوا بکەن ئەمن بەس تەماشای ڕێکارم کرد
62
+ ئةمة فیلبو حكم تاوانبارة جونكة كوتی نابی كةس قةسةبةكات,ئەمە فێڵ بوو عەکەم تاوانبارە، چونکە وتی نابێ کەس قسە بکات.
63
+ داناش وةك يارا بةس فشةفش دةكات هيجيش ناباتةوة,داناش وەک یارا بەس فشەفش دەکات، هیچیش ناباتەوە
64
+ خواية شوكرم بةبةشت لةباتى باران فيتنة دةبري,خودایە شوکرم بە بەشت، لەجیاتیی باران ، فیتنە دەبارێ
65
+ گةردةلول بةخيوي كردوم بيمنتةتم لة رةشةبا,گەردەلول بەخێوی کردووم، بێمنەتتم لە ڕەشەبا.
66
+ اليي شيره به مه ييه كه يه دارماسيحه,ئەلێی شیرە پەمەیەکەیە، دارماسیحە
67
+ زؤر ركم لة پيشكةشكارةكةية اسلوبيى قسةى زؤر ناشرينة,زۆر ڕقم لەپێشکەشکارەکەیە، ئسلوبی قسەکردنی زۆر ناشرینە
68
+ بيكه نينه كه شي له هى به شه ر ناجى,پێکەنینەکەشی لە هیی بەشەر ناچێت
69
+ جاوةرة ئةوكفتةي بخؤي بةونينؤكةوة,جا وەرە ئەو کفتەی بخۆی بەو نینۆکەوە
70
+ امه له مه ريخ ده زي,ئەمە لەمەریخ دەژی
71
+ نرخي بۆ خۆمان جەندە,نرخی بۆ خۆمان چەندە؟
72
+ خواي بتكات قورباني كجيكي عه شاير,خودا بتکات بە قوربانی کچێکی عەشایەر
73
+ خوت فيره قسه بكه اوجا به رنامه ى پيشكه ش بكه,خۆت فێرە قسە بکە، ئەوجا بەرنامە پێشکەش بکە
74
+ كورةتووخواتؤشةرم لةخؤت ناكةي,کورە تخوا تۆ شەرم لە خۆت ناکەی؟
75
+ دروبوو اوقسه يه,درۆ بوو ئەو قسەیە
76
+ ئه مه يه كم جارمه به بينم دعبا كوراني بلى كه ناشرينه,ئەمە یەکەم جارمە ببینم دەعبا گۆرانی بڵێ، کە ناشرینە!
77
+ وةره خو نةكوشژة لةبةرئةم ريكلامه,وەرە خۆت مەکوژە لەبەر ئەم ڕیکلامە
78
+ کە ناڵێ بێ رە وشتە کان,کەناڵی بێڕەوشتەکان.
79
+ برؤ كن بروا ستايلي بابلؤكت بكا,بڕۆ کن بڕوا ستایلی با بلۆکت بکات
80
+ عةزةلات فشةيه شةرتي جاو و بروويە,عەزەلات فشەیە، شەرتی چاو و برۆیە
81
+ ام ريكلام بيتامه ش تواو نابي,ئەم ڕیک��امە بێتامەش تەواو نابێ
82
+ دةستان خوش بيت هةر سةركةوتو بن,دەستتان خۆش بێت، هەر سەرکەوتوو بن
83
+ وةلا عةينةن راستية امن بروام كرد,وەڵڵا ئەڵێی ڕاستییە، ئەمن بڕوام کرد.
84
+ بوية كورد هةموي لةناو جوو كةسي تةندروست نماية,بۆیە کورد هەمووی لەناو چوو، کەسی تەندروست نەمایە
85
+ باشترين راهينه ر له ميزووي توبي بي,باشترین ڕاهێنەر لە مێژووی تۆپی پێ
86
+ ژن دوژمني ژنه له به رچي ژن قه بول ناكت پياو ژنيكي دي بينت,ژن دوژمنی ژنە، ژەبەرچی ژن قەبول ناکات پیاو ژنێکی تر بێنێت؟
87
+ چاڤي به ميسي وتوه تؤله ي ئيمه له ريال بكه وه,چاڤی بە مێسی وتوە: تۆڵەی ئێمە لە ڕیاڵ بکەوە
88
+ هةموو روز دةموو جاو ئةم شتة ناشرينة ئةبيني, هەموو ڕۆژێ دەموو چاو ئەم شتە ناشرینە ئەبینی
89
+ نيشانةي ژني خراپة,نیشانەی ژنی خراپە
90
+ ده رئن اوه ي او كفتانه خاري له طه واريه,دەرێن ئەوەی ئەو کفتانەی خوارد لە تەوارییە
91
+ شكلئ اوئ ديكه ي ده ت,شکڵی ئەوەی دیکە دەدات
92
+ وةرزي تر من ارؤم من ببينن لة وي,وەرزێکی تر من ئەڕۆم، من ببینن لەوێ
93
+ چه تاليشي پيه چاوه ري شوتي كاله كي بؤببه ن,چەتاڵیشی پێیە، چاوەڕێیە شووتی و کاڵەکی بۆ ببەن
94
+ زورناخوشة دةني,زۆر ناخۆشە دەنگی
95
+ كةواتة سى ريمةكة باتوشى گوناحنةبى,کەواتە سەیری مەکە باتوشی گوناح نەبیت
96
+ دانا بةو زةعيفيةي خوى باشتره له تو,دانا بەو زەعیفییەی خۆی باشترە لە تو
97
+ وه لله كجه كي قشتؤكه يه بي ده كري خؤشم ده وي من زور,وەڵڵا کچەکی قشتۆکەیە، پێی دەکرێ، خۆشم دەوێ من زۆر
98
+ بةراستي بةرنامةكة ئةمجارة زوور جياوازة و جوانه,بەڕاستی بەرنامەکە ئەم جارە زۆر جیاوازە و جوانە
99
+ سيناريوه فيشه ك ته قاندن ئاو هايه,سیناریۆیە، فیشەکتەقاندن ئەوهایە؟
100
+ وةلا ئةوة هةمووي كوري فةقيرة شتيكي زؤر خراب اكةن,وەڵڵا ئەوە هەمووی کوڕی فەقیرە، شتێکی زۆر خراپ ئەکەن
101
+ سودي أيوه جيه لؤ ميلت,سوودی ئێوە چییە بۆ میللەت؟
data/asosoft_benchmark.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/words.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [
2
+
3
+ ]
eval.sh ADDED
@@ -0,0 +1,13 @@
1
+ # Evaluation
2
+ python3 run_summarization.py \
3
+ --model_name_or_path "razhan/bart-kurd-spell-base" \
4
+ --do_eval \
5
+ --validation_file data/asosoft_benchmark.csv \
6
+ --output_dir /tmp \
7
+ --overwrite_output_dir \
8
+ --per_device_eval_batch_size=32 \
9
+ --predict_with_generate \
10
+ --logging_steps="1" \
11
+ --max_target_length=1024 \
12
+ --max_source_length=1024 \
13
+ --report_to="none"
inspect_data.ipynb ADDED
@@ -0,0 +1,407 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from datasets import load_dataset\n",
10
+ "import pandas as pd\n",
11
+ "from utils import *\n"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": null,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "rste = load_dataset(\"razhan/rste\", split=\"train\")"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "df_rste = rste.to_pandas()\n",
30
+ "df = df_rste\n",
31
+ "pd.set_option('display.max_colwidth', None)"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": []
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "df['clean_text'] = df['text'].apply(process_text)\n",
48
+ "\n",
49
+ "# df['contains_non_kurdish'] = df[\"text\"].apply(contains_non_kurdish_characters)\n",
50
+ "# print(df['contains_non_kurdish'].sum())\n",
51
+ "# df[df['contains_non_kurdish'] == True]"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "df['clean_text'] = df['clean_text'].apply(keep_kurdish_characters)"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": []
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": null,
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "# df[df['contains_non_kurdish'] == False]['clean_text'].to_csv(\"data/data.ckb.txt\", index=False, header=False)\n",
77
+ "df['clean_text'].to_csv(\"data/data.ckb.txt\", index=False, header=False)"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "df[df['text'].str.contains('ھ')]\n",
87
+ "indices_with_substring = df[df['text'].str.contains('ھ')].index\n",
88
+ "# print(indices_with_substring)\n",
89
+ "df.loc[indices_with_substring]"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "all_text = ''.join(df[\"text\"])\n",
99
+ "\n",
100
+ "unique_characters = set(all_text)\n",
101
+ "\n",
102
+ "print(\"Unique characters:\", unique_characters)\n",
103
+ "print(\"Number of unique characters:\", len(unique_characters))"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "all_text = ''.join(df[\"clean_text\"])\n",
113
+ "# all_text = ''.join(df[df['contains_non_kurdish'] == False]['clean_text'])\n",
114
+ "unique_characters = set(all_text)\n",
115
+ "unique_punctuations = extract_punctuation(all_text)\n",
116
+ "print(\"Unique characters:\", unique_characters)\n",
117
+ "print(\"Number of unique characters:\", len(unique_characters))\n",
118
+ "print(\"Unique punctuations:\", unique_punctuations)"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "df['contains_non_kurdish'] = df[\"text\"].apply(contains_non_kurdish_characters)"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "len(unique_punctuations)"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "# df = pd.read_csv(\"asosoft_test_punc.csv\")\n",
146
+ "# df['summary'] = df['summary'].apply(process_text)\n",
147
+ "# df.to_csv(\"asosoft_test_clean.csv\", index=False)"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": null,
153
+ "metadata": {},
154
+ "outputs": [],
155
+ "source": [
156
+ "df"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": []
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": null,
169
+ "metadata": {},
170
+ "outputs": [],
171
+ "source": [
172
+ "oscar_dataset = load_dataset(\"oscar-corpus/OSCAR-2301\", language=\"ckb\", split='train', token=True)\n",
173
+ "wiki_dataset = load_dataset(\"wikipedia\", language=\"ckb\", date=\"20231120\", split='train', beam_runner='DirectRunner')\n",
174
+ "\n",
175
+ "df_oscar = oscar_dataset.to_pandas()\n",
176
+ "df_wiki = wiki_dataset.to_pandas()\n",
177
+ "df = pd.concat([df_oscar, df_wiki], ignore_index=True)\n"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": null,
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "df"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "df[\"text\"] = df[\"text\"].apply(process_text)"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "# text = df[\"text\"].str.cat(sep=\"\\n\")"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "df['clean_text'] = df['text'].apply(keep_kurdish_characters)"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "metadata": {},
220
+ "outputs": [],
221
+ "source": [
222
+ "df[\"clean_text\"] = df[\"clean_text\"].apply(process_text)"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": null,
228
+ "metadata": {},
229
+ "outputs": [],
230
+ "source": [
231
+ "df['contains_non_kurdish'] = df[\"clean_text\"].apply(contains_non_kurdish_characters)\n",
232
+ "print(df['contains_non_kurdish'].sum())\n",
233
+ "# df[df['contains_non_kurdish'] == True].iloc[0]['clean_text']"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": null,
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": [
242
+ "df['repeated_ngram'] = df['clean_text'].apply(lambda x: contains_repeated_ngram(x, 10))\n",
243
+ "print(df['repeated_ngram'].sum())"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": null,
249
+ "metadata": {},
250
+ "outputs": [],
251
+ "source": [
252
+ "# drop rows where repeated_ngram are True\n",
253
+ "df = df[df['repeated_ngram'] == False]\n"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": null,
259
+ "metadata": {},
260
+ "outputs": [],
261
+ "source": [
262
+ "df[df['repeated_ngram'] == True].iloc[0]['clean_text']"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": null,
268
+ "metadata": {},
269
+ "outputs": [],
270
+ "source": [
271
+ "all_text = \"\".join(df[\"clean_text\"])\n",
272
+ "# all_text = ''.join(df[df['contains_non_kurdish'] == False]['clean_text'])\n",
273
+ "unique_characters = set(all_text)\n",
274
+ "unique_punctuations = extract_punctuation(all_text)\n",
275
+ "print(\"Unique characters:\", unique_characters)\n",
276
+ "print(\"Number of unique characters:\", len(unique_characters))\n",
277
+ "print(\"Unique punctuations:\", unique_punctuations)"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": null,
283
+ "metadata": {},
284
+ "outputs": [],
285
+ "source": [
286
+ "from nltk.tokenize import sent_tokenize\n",
287
+ "data = []\n",
288
+ "for i, row in df.iterrows():\n",
289
+ " sentences = tokenizer_ckb.sent_tokenize(row['clean_text'])\n",
290
+ " # sentences = row['clean_text'].split('\\n')\n",
291
+ " # sentences = sent_tokenize(row['clean_text'])\n",
292
+ " sentences = [sent_tokenize(s) for s in sentences]\n",
293
+ " # flatten list of lists\n",
294
+ " sentences = [item for sublist in sentences for item in sublist]\n",
295
+ " # split on period and keep the period\n",
296
+ " sentences = [s.split('.') for s in sentences]\n",
297
+ " sentences = [item for sublist in sentences for item in sublist]\n",
298
+ "\n",
299
+ " sentences = [s + '.' for s in sentences]\n",
300
+ " data.extend(sentences)\n",
301
+ " # if i == 5:\n",
302
+ " # break"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": null,
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": [
311
+ "print(len(data))"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": null,
317
+ "metadata": {},
318
+ "outputs": [],
319
+ "source": [
320
+ "# longest line in data\n",
321
+ "max_line = max(data, key=len)"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "execution_count": null,
327
+ "metadata": {},
328
+ "outputs": [],
329
+ "source": [
330
+ "len(max_line.split())"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": null,
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": [
339
+ "max_line"
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": null,
345
+ "metadata": {},
346
+ "outputs": [],
347
+ "source": [
348
+ "# calulate the length of each line in the data and take the average\n",
349
+ "lengths = [len(line.split()) for line in data]\n",
350
+ "avg_length = sum(lengths) / len(lengths)\n",
351
+ "print(avg_length)"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": null,
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "# give me all the lines above 20 words\n",
361
+ "long_lines = [line for line in data if len(line.split()) > 25]\n",
362
+ "print(len(long_lines))"
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": null,
368
+ "metadata": {},
369
+ "outputs": [],
370
+ "source": [
371
+ "# Write sentences to file\n",
372
+ "with open(\"data/oscar_wiki.ckb.txt\", \"w\") as f:\n",
373
+ " for sentence in data:\n",
374
+ " f.write(sentence + \"\\n\")"
375
+ ]
376
+ },
377
+ {
378
+ "cell_type": "code",
379
+ "execution_count": null,
380
+ "metadata": {},
381
+ "outputs": [],
382
+ "source": []
383
+ }
384
+ ],
385
+ "metadata": {
386
+ "kernelspec": {
387
+ "display_name": "Python 3",
388
+ "language": "python",
389
+ "name": "python3"
390
+ },
391
+ "language_info": {
392
+ "codemirror_mode": {
393
+ "name": "ipython",
394
+ "version": 3
395
+ },
396
+ "file_extension": ".py",
397
+ "mimetype": "text/x-python",
398
+ "name": "python",
399
+ "nbconvert_exporter": "python",
400
+ "pygments_lexer": "ipython3",
401
+ "version": "3.11.6"
402
+ },
403
+ "orig_nbformat": 4
404
+ },
405
+ "nbformat": 4,
406
+ "nbformat_minor": 2
407
+ }
prepare_data/constants.py ADDED
@@ -0,0 +1,25 @@
1
+ ARABIC_CHARS = 'دصضذطكثنتالبيسجحإأآشظمغفقةىرؤءئزوخهع'
2
+ KURDISH_CHARS = 'ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ'
3
+ VALID_PUNCS = '\?؟\.\\\/,،«»\-:'
4
+
5
+ ARABIC_PUCTUATIONS = "،؛۔٫٪؟"
6
+ CKB_PUNCTUATIONS = "!.:;?،؛؟«»"
7
+
8
+ NUMBERS = '٠١٢٣٤٥٦٧٨٩'
9
+ SPECIAL = ' '
10
+
11
+ NORMLIZER_MAPPER = {
12
+ 'ﻹ': 'لإ',
13
+ 'ﻷ': 'لأ',
14
+ 'ﻵ': 'لآ',
15
+ 'ﻻ': 'لا'
16
+ }
17
+ VALID_CHARS = KURDISH_CHARS + SPECIAL + NUMBERS + CKB_PUNCTUATIONS
18
+
19
+
20
+ KEYBOARD_KEYS = [
21
+ 'قوەرتیئحۆپ',
22
+ 'اسدفگهژکل',
23
+ 'زخجڤبنم'
24
+ ]
25
+ KEYBOARD_BLANK = '_'
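
A short, hedged sketch of how VALID_CHARS can be used to keep only the characters defined above (the same idea ValidCharsKeeper implements in processes.py); re.escape is an extra safety measure added here, not part of the original file.

import re
from constants import VALID_CHARS

def keep_valid(line: str) -> str:
    # replace anything outside VALID_CHARS with a space
    return re.sub(f'[^{re.escape(VALID_CHARS)}]', ' ', line)
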
prepare_data/generate_dataset.py ADDED
@@ -0,0 +1,178 @@
1
+ import random
2
+ from string import punctuation
3
+ import re
4
+ import os
5
+ from transformers import AutoTokenizer
6
+ from tqdm import tqdm
7
+ from typing import List
8
+ from constants import KURDISH_CHARS, KEYBOARD_BLANK, KEYBOARD_KEYS, NUMBERS
9
+
10
+
11
+ def tokenizer_check_if_text_too_long(text, tokenizer, max_length):
12
+ data = tokenizer.batch_encode_plus([text], max_length=max_length, truncation=True, return_overflowing_tokens=True)
13
+ if len(data["input_ids"]) > 1:
14
+ return True
15
+ else:
16
+ return False#, len(data["input_ids"][0])
17
+
18
+ def delete_characters(text, char_delete_percentage=0.01):
19
+ modifyed_line = []
20
+ for char in text:
21
+ if random.random() > char_delete_percentage or char in NUMBERS:
22
+ modifyed_line.append(char)
23
+ return "".join(modifyed_line)
24
+
25
+ def insert_characters(text, augmentation_probability=0.01):
26
+ modifyed_line = []
27
+ for char in text:
28
+ if random.random() <= augmentation_probability and char not in NUMBERS:
29
+ modifyed_line.append(random.choice(KURDISH_CHARS))
30
+ modifyed_line.append(char)
31
+ return "".join(modifyed_line)
32
+
33
+ def replace_characters(text, augmentation_probability=0.01):
34
+ modifyed_line = []
35
+ for char in text:
36
+ if random.random() <= augmentation_probability and char not in NUMBERS:
37
+ modifyed_line.append(random.choice(KURDISH_CHARS))
38
+ else:
39
+ modifyed_line.append(char)
40
+ return "".join(modifyed_line)
41
+
42
+ def random_neighbor_replace(line: str, keyboard_rows: List[str], blank: str) -> str:
43
+ lines = keyboard_rows
44
+ n_rows = len(keyboard_rows)
45
+ _mapper = {}
46
+
47
+ def __get_left(row_idx: int, col_idx: int) -> List[str]:
48
+ if col_idx == 0:
49
+ return []
50
+ return [lines[row_idx][col_idx - 1]]
51
+
52
+ def __get_right(row_idx: int, col_idx: int) -> List[str]:
53
+ if col_idx == (len(lines[row_idx]) - 1):
54
+ return []
55
+ return lines[row_idx][col_idx + 1]
56
+
57
+ def __get_upper(row_idx: int, col_idx: int) -> List[str]:
58
+ if row_idx == 0:
59
+ return []
60
+ line = lines[row_idx - 1]
61
+ start = max(0, col_idx - 1)
62
+ end = min(len(line), col_idx + 2)
63
+ return list(line[start: end])
64
+
65
+ def __get_lower(row_idx: int, col_idx: int) -> List[str]:
66
+ if row_idx == (n_rows - 1):
67
+ return []
68
+ line = lines[row_idx + 1]
69
+ start = max(0, col_idx - 1)
70
+ end = min(len(line), col_idx + 2)
71
+ return list(line[start: end])
72
+
73
+ funcs = [__get_left, __get_right, __get_upper, __get_lower]
74
+ for row_idx in range(n_rows):
75
+ for col_idx in range(len(lines[row_idx])):
76
+ items = []
77
+ for func in funcs:
78
+ items.extend(func(row_idx, col_idx))
79
+ items = list(filter(lambda x: x != blank, items))
80
+ char = lines[row_idx][col_idx]
81
+ _mapper[char] = items.copy()
82
+
83
+ def get_char(char: str) -> str:
84
+ if char not in _mapper:
85
+ return char
86
+ return random.choice(_mapper[char])
87
+
88
+ length = len(line)
89
+ if length == 0:
90
+ length = 1
91
+ idx = random.randint(0, length - 1)
92
+ return line[:idx] + get_char(line[idx]) + line[idx + 1:]
93
+ def lower_case_words(text, augmentation_probability=0.5):
94
+ modifyed_line = []
95
+ for word in text.split():
96
+ if word[0].islower() == False and random.random() <= augmentation_probability:
97
+ word = word.lower()
98
+ modifyed_line.append(word)
99
+ return " ".join(modifyed_line)
100
+
101
+
102
+ clean_chars = re.compile(r'[^A-Za-zöäüÖÄÜß,.!?’\'$%€0-9\(\)\- ]', re.MULTILINE)
103
+ def cleanup(text):
104
+ text = clean_chars.sub('', text)
105
+ #print("bug: somehow all numbers are removed - this is might be due to this regex")
106
+ #exit()
107
+ #text = text.replace("\n", "")
108
+ #text = text.replace('"','\\"')
109
+ return text
110
+
111
+ clean_punctuation = re.compile(r"(?<!\d)[.,;:'?؟.!()؟،»«](?!\d)")
112
+ def remove_punctuation(text):
113
+ """Remove all punctuation from string, except if it's between NUMBERS"""
114
+ return clean_punctuation.sub("", text)
115
+
116
+ def combine_sentences(text, sentences, augmentation_probability = 1):
117
+ if random.random() < augmentation_probability:
118
+ sentences_to_sample = random.randint(0,10)
119
+ augmentation_sentences = random.sample(sentences,sentences_to_sample)
120
+ return text + " " + " ".join(augmentation_sentences)
121
+ else:
122
+ return text
123
+
124
+ def delete_word(text, augmentation_probability = 0.001):
125
+ if random.random() < augmentation_probability:
126
+ words = text.split()
127
+ if len(words) < 3:
128
+ # do not delete word in short text, as there will be no context to guess the word
129
+ return text
130
+ word_to_remove = random.randint(0,len(words)-1)
131
+ words.pop(word_to_remove)
132
+ return " ".join(words)
133
+ else:
134
+ return text
135
+
136
+
137
+ if __name__ == "__main__":
138
+ data_file = "data/data.txt" #"data/en.wikidump.processed.24m.txt" #
139
+ language = "ckb" # "wikidump.24m.en"
140
+ num_lines = sum(1 for line in open(data_file,'r'))
141
+ print("Number of lines:",num_lines)
142
+ with open(data_file,'r') as file:
143
+ sentences = file.readlines(int(num_lines*0.5))
144
+ # sentences = [cleanup(sentence) for sentence in sentences]
145
+
146
+ # tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
147
+ tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
148
+ with open(language+".csv","w",encoding='utf-8') as output:
149
+ with open(data_file,'r') as file:
150
+ for line in tqdm(file, total=num_lines):
151
+ # line = cleanup(line)
152
+ if len(line) < 1:
153
+ continue
154
+ line = combine_sentences(line,sentences)
155
+ if tokenizer_check_if_text_too_long(line,tokenizer,max_length=1024):
156
+ print(f"skipping line as its too long ({len(line)}):\n"+line)
157
+ continue
158
+
159
+ if random.random() > 0.02:
160
+ # leave 2% of the data untouched so the model
161
+ # learns not to over-correct clean text
162
+ new_line = delete_word(line)
163
+ new_line = delete_characters(new_line)
164
+ new_line = insert_characters(new_line)
165
+ new_line = replace_characters(new_line)
166
+ new_line = random_neighbor_replace(new_line, KEYBOARD_KEYS, KEYBOARD_BLANK)
167
+ new_line = remove_punctuation(new_line)
168
+ else:
169
+ new_line = line
170
+ output.write(f'"{new_line.strip()}","{line.strip()}"\n')
171
+ os.system(f"echo \"text,summary\" > {language}.train.csv")
172
+ num_lines = sum(1 for line in open(f"{language}.csv",'r'))
173
+ os.system(f"head -n {num_lines-2000} {language}.csv >> {language}.train.csv")
174
+ os.system(f"echo \"text,summary\" > {language}.test.csv")
175
+ os.system(f"tail -n 2000 {language}.csv >> {language}.test.csv")
176
+
177
+
178
+
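
A hedged usage sketch of the character-level augmentations above applied to one line; it assumes generate_dataset.py and constants.py are importable from the same directory, and the sample sentence is only a placeholder.

import random
from constants import KEYBOARD_BLANK, KEYBOARD_KEYS
from generate_dataset import (
    delete_characters, insert_characters, random_neighbor_replace,
    remove_punctuation, replace_characters,
)

random.seed(0)
clean = "ئەمە ڕستەیەکی تاقیکردنەوەیە."  # placeholder CKB sentence
noisy = delete_characters(clean)
noisy = insert_characters(noisy)
noisy = replace_characters(noisy)
noisy = random_neighbor_replace(noisy, KEYBOARD_KEYS, KEYBOARD_BLANK)
noisy = remove_punctuation(noisy)
print(f'"{noisy}","{clean}"')  # (distorted, clean) pair, as written to the CSV
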
prepare_data/helpers.py ADDED
@@ -0,0 +1,87 @@
1
+ from functools import lru_cache
2
+ import json
3
+ import math
4
+ import re
5
+ from typing import List, Union
6
+ from pathlib import Path
7
+ import torch
8
+ from torch import Tensor
9
+
10
+
11
+ def load_text_file(
12
+ file_path: Union[Path, str],
13
+ encoding='utf-8',
14
+ *args, **kwargs
15
+ ) -> str:
16
+ with open(file_path, 'r', encoding=encoding) as f:
17
+ data = f.read()
18
+ return data
19
+
20
+
21
+ def save_text_file(
22
+ file_path: Union[Path, str],
23
+ data: str,
24
+ encoding='utf-8'
25
+ ) -> str:
26
+ with open(file_path, 'w', encoding=encoding) as f:
27
+ data = f.write(data)
28
+ return data
29
+
30
+
31
+ def remove_long_spaces(line: str) -> str:
32
+ return re.sub(r'\s{2,}', ' ', line)
33
+
34
+
35
+ @lru_cache(maxsize=2)
36
+ def get_positionals(max_length: int, d_model: int) -> Tensor:
37
+ """Create Positionals tensor to be added to the input
38
+ Args:
39
+ max_length (int): The maximum length of the positionals sequence.
40
+ d_model (int): The dimensionality of the positionals sequence.
41
+ Returns:
42
+ Tensor: Positional tensor
43
+ """
44
+ result = torch.zeros(max_length, d_model, dtype=torch.float)
45
+ for pos in range(max_length):
46
+ for i in range(0, d_model, 2):
47
+ denominator = pow(10000, 2 * i / d_model)
48
+ result[pos, i] = math.sin(pos / denominator)
49
+ result[pos, i + 1] = math.cos(pos / denominator)
50
+ return result
51
+
52
+
53
+ def load_json(file_path: Union[Path, str]) -> Union[dict, list]:
54
+ with open(file_path, 'r') as f:
55
+ data = json.load(f)
56
+ return data
57
+
58
+
59
+ def save_json(
60
+ file_path: Union[Path, str], data: Union[dict, list]
61
+ ) -> None:
62
+ with open(file_path, 'w') as f:
63
+ json.dump(data, f)
64
+
65
+
66
+ def get_freq_dict(data: List[str]) -> dict:
67
+ freq = {}
68
+ for item in data:
69
+ for word in item.split(' '):
70
+ if word in freq:
71
+ freq[word] += 1
72
+ else:
73
+ freq[word] = 1
74
+ return freq
75
+
76
+
77
+ def load_state(state_path: Union[Path, str]):
78
+ state = torch.load(state_path)
79
+ model = state['model']
80
+ model = {
81
+ key.replace('module.', ''): value
82
+ for key, value in model.items()
83
+ }
84
+ optimizer = state['optimizer']
85
+ epoch = state['epoch']
86
+ steps = state['steps']
87
+ return model, optimizer, epoch, steps
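
A small usage sketch for get_positionals above: it returns a (max_length, d_model) sinusoidal table that can be broadcast-added to an embedded batch; the shapes here are illustrative.

import torch
from helpers import get_positionals

pos = get_positionals(max_length=128, d_model=64)  # Tensor of shape (128, 64)
x = torch.zeros(2, 128, 64)                        # (batch, seq_len, d_model)
x = x + pos.unsqueeze(0)                           # broadcast over the batch
print(pos.shape, x.shape)
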
prepare_data/interfaces.py ADDED
@@ -0,0 +1,88 @@
1
+ from abc import ABC, abstractmethod, abstractproperty
2
+
3
+
4
+ class IProcess(ABC):
5
+
6
+ @abstractmethod
7
+ def execute():
8
+ pass
9
+
10
+
11
+ class IProcessor(ABC):
12
+
13
+ @abstractmethod
14
+ def run():
15
+ pass
16
+
17
+ @abstractmethod
18
+ def dist_run():
19
+ pass
20
+
21
+
22
+ class ITokenizer(ABC):
23
+
24
+ @abstractmethod
25
+ def ids2tokens(self):
26
+ pass
27
+
28
+ @abstractmethod
29
+ def tokenize(self):
30
+ pass
31
+
32
+ @abstractmethod
33
+ def set_tokenizer(self):
34
+ pass
35
+
36
+ @abstractmethod
37
+ def save_tokenizer(self):
38
+ pass
39
+
40
+ @abstractmethod
41
+ def load_tokenizer(self):
42
+ pass
43
+
44
+ @abstractmethod
45
+ def add_token(self):
46
+ pass
47
+
48
+ @abstractmethod
49
+ def preprocess_tokens(self):
50
+ pass
51
+
52
+ @abstractmethod
53
+ def batch_tokenizer(self):
54
+ pass
55
+
56
+ @abstractproperty
57
+ def vocab_size(self):
58
+ pass
59
+
60
+ @abstractmethod
61
+ def get_tokens(self):
62
+ pass
63
+
64
+
65
+ class ILogger(ABC):
66
+
67
+ @abstractmethod
68
+ def log_step():
69
+ pass
70
+
71
+ @abstractmethod
72
+ def log():
73
+ pass
74
+
75
+ @abstractmethod
76
+ def set_rank():
77
+ pass
78
+
79
+ @abstractmethod
80
+ def log_img():
81
+ pass
82
+
83
+
84
+ class IPredictor(ABC):
85
+
86
+ @abstractmethod
87
+ def predict():
88
+ pass
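
A minimal sketch of implementing the IProcess interface above, in the same style as the concrete processes in processes.py; the Lowercaser class is purely illustrative.

from typing import List
from interfaces import IProcess

class Lowercaser(IProcess):
    def execute(self, lines: List[str]) -> List[str]:
        # lowercase every line; the real processes filter, clean, or distort text
        return [line.lower() for line in lines]

print(Lowercaser().execute(["Hello World", "ABC"]))  # ['hello world', 'abc']
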
prepare_data/process_data.py ADDED
@@ -0,0 +1,193 @@
1
+ from argparse import ArgumentParser
2
+ import os
3
+ import re
4
+ import time
5
+ from processors import FilesProcessor, get_text_distorter
6
+ from processes import (
7
+ CharsRemover,
8
+ LengthFilter,
9
+ LinesSplitter,
10
+ LoadFile,
11
+ NumbersFilter,
12
+ OOVFilter,
13
+ RepeatedCharsCollapsor,
14
+ # SoloCharFilter,
15
+ SpacesRemover,
16
+ ValidCharsKeeper,
17
+ WordsFilter,
18
+ WordsNumberFilter,
19
+ CharsNormalizer,
20
+ TokenizerLengthFilter,
21
+ )
22
+ from helpers import load_json, save_text_file
23
+ from typing import Union, List
24
+ from pathlib import Path
25
+ import constants
26
+ import pandas as pd
27
+
28
+
29
+ def get_paths(
30
+ main_dir: Union[Path, str]
31
+ ) -> List[Union[Path, str]]:
32
+ paths = [
33
+ os.path.join(main_dir, file)
34
+ for file in os.listdir(main_dir)
35
+ ]
36
+ return paths
37
+
38
+ def get_path(
39
+ file_path: Union[Path, str]
40
+ ) -> List[Union[Path, str]]:
41
+ if os.path.isfile(file_path):
42
+ return [file_path]
43
+ else:
44
+ raise FileNotFoundError
45
+
46
+
47
+ def get_file_processor(args):
48
+ words = load_json(args.execlude_words_files)
49
+ processes = [
50
+ LoadFile(),
51
+ *[LinesSplitter(sep=sep) for sep in args.sep],
52
+ RepeatedCharsCollapsor(args.max_rep_chars),
53
+ NumbersFilter(),
54
+ # SoloCharFilter(),
55
+ WordsFilter(words),
56
+ ValidCharsKeeper(constants.VALID_CHARS),
57
+ SpacesRemover(),
58
+ WordsNumberFilter(args.min_words, args.max_words),
59
+ # TokenizerLengthFilter(),
60
+ LengthFilter(args.min_len, args.max_len)
61
+ ]
62
+ return FilesProcessor(processes)
63
+
64
+
65
+ def post_process(data: List[str]) -> List[str]:
66
+ lines = []
67
+ for item in data:
68
+ lines.extend(item)
69
+ lines = list(set(lines))
70
+ # lines = OOVFilter(args.max_oov).execute(lines)
71
+ return lines
72
+
73
+
74
+ clean_punctuation = re.compile(r"(?<!\d)[!.:;?،؛؟«»۔٫٪](?!\d)")
75
+
76
+ def remove_punctuation(text):
77
+ """Remove all punctuation from string, except if it's between digits"""
78
+ return clean_punctuation.sub("", text)
79
+
80
+
81
+
82
+ def get_argparser():
83
+ parser = ArgumentParser()
84
+ parser.add_argument(
85
+ '--sep', default=[
86
+ '\n',
87
+ # '\t', '.', '،', ',', '=', ':', '-', '\\', '/'
88
+ ], nargs='+', type=str,
89
+ help='The separator to be used to split the lines on'
90
+ )
91
+ parser.add_argument(
92
+ '--min_len', default=5, type=int,
93
+ help='The minimum line length to keep'
94
+ )
95
+ parser.add_argument(
96
+ '--max_len', default=1020, type=int,
97
+ help='The maximum line length to keep'
98
+ )
99
+ parser.add_argument(
100
+ '--dist_run', default=False, action='store_true'
101
+ )
102
+ parser.add_argument(
103
+ '--data_path', default='data/data.txt'
104
+ )
105
+ parser.add_argument(
106
+ '--save_path', default='data/clean_data.txt'
107
+ )
108
+ parser.add_argument(
109
+ '--max_rep_chars', default=2
110
+ )
111
+ parser.add_argument(
112
+ '--execlude_words_files', default='data/words.json'
113
+ )
114
+ parser.add_argument(
115
+ '--max_oov', default=100, type=int
116
+ )
117
+ parser.add_argument(
118
+ '--min_words', default=3, type=int
119
+ )
120
+ parser.add_argument(
121
+ '--max_words', default=100, type=int
122
+ )
123
+ parser.add_argument(
124
+ '--dist_ratios', default=[0.05, 0.1, 0.15]
125
+ )
126
+ parser.add_argument(
127
+ '--remove_punc', default=False, action='store_true', help='Remove punctuation of the distorted lines'
128
+ )
129
+ return parser
130
+
131
+
132
+ def main(args) -> None:
133
+ fp = get_file_processor(args)
134
+ files = get_path(args.data_path)
135
+ print('Started!')
136
+ start = time.time()
137
+ if args.dist_run is True:
138
+ print('dist run')
139
+ data = fp.dist_run(files)
140
+ else:
141
+ data = fp.run(files)
142
+ end = time.time()
143
+ print(f'Files Processing completed in {end - start}')
144
+ data = post_process(data)
145
+ sentences = data[: len(data) // 2]
146
+ print("Length of data after post processing", len(data))
147
+ df = None
148
+ for i, ratio in enumerate(args.dist_ratios):
149
+ distorter = get_text_distorter(ratio, sentences)
150
+ # TODO: leave 2 percent of the sentences untouched so the model does not over-correct clean text
151
+
152
+ dist = list(map(distorter.run, data))
153
+ if df is None:
154
+ df = pd.DataFrame({
155
+ 'clean': data,
156
+ f'distorted_{ratio}': dist
157
+ })
158
+ else:
159
+ df[f'distorted_{ratio}'] = dist
160
+ if args.remove_punc is True:
161
+ print("Removing punctuations for the distorted lines")
162
+ for ratio in args.dist_ratios:
163
+ df[f'distorted_{ratio}'] = df[f'distorted_{ratio}'].apply(
164
+ remove_punctuation
165
+ )
166
+ df.to_csv('data/data.csv', encoding='utf-8')
167
+ # save_text_file(args.save_path, '\n'.join(data))
168
+
169
+
170
+ if __name__ == '__main__':
171
+ parser = get_argparser()
172
+ args = parser.parse_args()
173
+ main(args)
174
+ num_lines = sum(1 for line in open(f"data/data.csv",'r'))
175
+ os.system(f"echo \"text,summary\" > train.csv")
176
+ # # Only change the first $ variable for different distortion ratios
177
+ # os.system(f"awk -F',' 'NR>1 && NR<={num_lines-50000} {{print $4 \",\" $2}}' data/data.csv >> train.csv")
178
+ # os.system(f"awk -F',' 'NR>1 && NR<={num_lines-50000} {{print $3 \",\" $2}}' data/data.csv >> train.csv")
179
+ os.system(f"awk -F',' 'NR>1 && NR<={num_lines-50000} {{print $5 \",\" $2}}' data/data.csv | sed 's/\"//g' >> train.csv")
180
+ os.system(f"awk -F',' 'NR>1 && NR<={num_lines-50000} {{print $4 \",\" $2}}' data/data.csv | sed 's/\"//g' >> train.csv")
181
+ os.system(f"awk -F',' 'NR>1 && NR<={num_lines-50000} {{print $3 \",\" $2}}' data/data.csv | sed 's/\"//g' >> train.csv")
182
+
183
+ os.system(f"echo \"text,summary\" > test.csv")
184
+ # os.system(f"tail -n 50000 data/data.csv | awk -F',' '{{print $4 \",\" $2}}' >> test.csv")
185
+ # os.system(f"tail -n 50000 data/data.csv | awk -F',' '{{print $3 \",\" $2}}' >> test.csv")
186
+ os.system(f"awk -F',' 'NR>{num_lines-50000} {{print $5 \",\" $2}}' data/data.csv | sed 's/\"//g' >> test.csv")
187
+ os.system(f"awk -F',' 'NR>{num_lines-50000} {{print $4 \",\" $2}}' data/data.csv | sed 's/\"//g' >> test.csv")
188
+ os.system(f"awk -F',' 'NR>{num_lines-50000} {{print $3 \",\" $2}}' data/data.csv | sed 's/\"//g' >> test.csv")
189
+
190
+
191
+
192
+
193
+
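
A rough pure-pandas sketch of the awk-based split in the __main__ block above, assuming data/data.csv was written by main() with a 'clean' column plus one 'distorted_<ratio>' column per ratio, and holding out the last 50000 rows of each pair as test; this approximates, rather than reproduces, the shell commands.

import pandas as pd

df = pd.read_csv("data/data.csv", index_col=0)
ratios = [0.05, 0.1, 0.15]
parts = [
    df[[f"distorted_{r}", "clean"]].rename(
        columns={f"distorted_{r}": "text", "clean": "summary"}
    )
    for r in ratios
]
pd.concat(p.iloc[:-50000] for p in parts).to_csv("train.csv", index=False)
pd.concat(p.iloc[-50000:] for p in parts).to_csv("test.csv", index=False)
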
prepare_data/processes.py ADDED
@@ -0,0 +1,335 @@
1
+ import random
2
+ import re
3
+ from typing import List, Union
4
+ from interfaces import IProcess
5
+ from helpers import get_freq_dict, load_text_file, remove_long_spaces
6
+ from transformers import AutoTokenizer
7
+
8
+ class LoadFile(IProcess):
9
+
10
+ def execute(self, file_path: str):
11
+ return load_text_file(
12
+ file_path
13
+ )
14
+
15
+
16
+ class LinesSplitter(IProcess):
17
+ def __init__(self, sep: str) -> None:
18
+ super().__init__()
19
+ self.sep = sep
20
+
21
+ def split(self, line):
22
+ return line.split(self.sep)
23
+
24
+ def execute(self, data: Union[List[str], str]) -> List[str]:
25
+ if isinstance(data, str):
26
+ return data.split(self.sep)
27
+ results = []
28
+ for lines in map(self.split, data):
29
+ results.extend(lines)
30
+ return results
31
+
32
+
33
+ class LengthFilter(IProcess):
34
+ def __init__(
35
+ self, min_length: int, max_length: int
36
+ ) -> None:
37
+ super().__init__()
38
+ self.min_length = min_length
39
+ self.max_length = max_length
40
+
41
+ def execute(self, lines: List[str]):
42
+ return list(filter(
43
+ lambda x: self.min_length <= len(x) <= self.max_length, lines
44
+ ))
45
+
46
+
47
+ class WordsNumberFilter(IProcess):
48
+ def __init__(self, min_words: int, max_words: int) -> None:
49
+ super().__init__()
50
+ self.min_words = min_words
51
+ self.max_words = max_words
52
+
53
+ def _is_valid(self, line: str) -> bool:
54
+ return self.min_words < line.count(' ') < self.max_words
55
+
56
+ def execute(self, lines: List[str]):
57
+ return list(filter(self._is_valid, lines))
58
+
59
+ class TokenizerLengthFilter(IProcess):
60
+ def __init__(self, max_length: int = 1024) -> None:
61
+ super().__init__()
62
+ self.max_length = max_length
63
+ self.tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
64
+
65
+ def _is_valid(self, line: str) -> bool:
66
+ data = self.tokenizer.batch_encode_plus([line], max_length=self.max_length, truncation=True, return_overflowing_tokens=True)
67
+ # keep the line only if it fits within max_length (no overflow chunks)
68
+ return len(data["input_ids"]) == 1
71
+
72
+ def execute(self, lines: List[str]):
73
+ return list(filter(self._is_valid, lines))
74
+
75
+
76
+ class WordsFilter(IProcess):
77
+ def __init__(self, words: List[str]) -> None:
78
+ super().__init__()
79
+ self.words = set(words)
80
+
81
+ def _not_contain(self, line: str) -> bool:
82
+ return not any((
83
+ word in line for word in self.words
84
+ ))
85
+
86
+ def execute(self, lines: List[str]):
87
+ return list(filter(self._not_contain, lines))
88
+
89
+
90
+ class SoloCharFilter(IProcess):
91
+
92
+ def _not_contain(self, line: str) -> bool:
93
+ return re.search('^. | . | .$', line) is None
94
+
95
+ def execute(self, lines: List[str]):
96
+ return list(filter(self._not_contain, lines))
97
+
98
+
99
+ class NumbersFilter(IProcess):
100
+
101
+ def _not_contain(self, line: str) -> bool:
102
+ return re.search('[0-9]+', line) is None
103
+
104
+ def execute(self, lines: List[str]):
105
+ return list(filter(self._not_contain, lines))
106
+
107
+
108
+ class OOVFilter(IProcess):
109
+ def __init__(self, max_oov: int) -> None:
110
+ super().__init__()
111
+ self.max_oov = max_oov
112
+ self.__freq = {}
113
+
114
+ def _is_valid(self, line: str):
115
+ counter = 0
116
+ for word in line.split(' '):
117
+ counter += (self.__freq[word] == 1)
118
+ return counter < self.max_oov
119
+
120
+ def execute(self, lines: List[str]):
121
+ self.__freq = get_freq_dict(lines)
122
+ return list(filter(self._is_valid, lines))
123
+
124
+ # text = ["کوردستان وڵاتی کوردانە هەی هەی هەی هەی", "کورد بوون گەوادیە", "ژیان سەختە"]
125
+ # result = OOVFilter(5).execute(text)
126
+ # print(result)
127
+
128
+
129
+ class CharsRemover(IProcess):
130
+ def __init__(self, chars: str) -> None:
131
+ super().__init__()
132
+ self.pat = f'[{chars}]'
133
+
134
+ def remove(self, line: str) -> str:
135
+ return re.sub(self.pat, '', line)
136
+
137
+ def execute(self, lines: List[str]) -> List[str]:
138
+ return list(map(self.remove, lines))
139
+
140
+
141
+ class RepeatedCharsCollapsor(IProcess):
142
+ def __init__(self, max_repeteion: int) -> None:
143
+ super().__init__()
144
+ self.pat = r"(.)\1{}".format(f"{{{2},}}")
145
+
146
+ def collaps(self, line: str) -> str:
147
+ return re.sub(self.pat, r"\1" * 1, line)
148
+
149
+ def execute(self, lines: List[str]) -> List[str]:
150
+ return list(map(self.collaps, lines))
151
+
152
+
153
+ class ValidCharsKeeper(IProcess):
154
+ def __init__(self, valid_chars: str, rep_with=' ') -> None:
155
+ super().__init__()
156
+ self.valid_chars = valid_chars
157
+ self.rep_with = rep_with
158
+ self.pat = f'[^{self.valid_chars}]'
159
+
160
+ def __keep(self, line: str) -> str:
161
+ return re.sub(self.pat, self.rep_with, line)
162
+
163
+ def execute(self, lines: List[str]) -> List[str]:
164
+ return list(map(self.__keep, lines))
165
+
166
+
167
+ class SpacesRemover(IProcess):
168
+
169
+ def __remove(self, line: str) -> str:
170
+ return remove_long_spaces(line).strip()
171
+
172
+ def execute(self, lines: List[str]):
173
+ return list(map(self.__remove, lines))
174
+
175
+
176
+ class RandomCharsInjector(IProcess):
177
+ def __init__(self, chars: str) -> None:
178
+ super().__init__()
179
+ self.chars = chars
180
+
181
+ def get_char(self) -> str:
182
+ return random.choice(self.chars)
183
+
184
+ def execute(self, line: str):
185
+ length = len(line)
186
+ idx = random.randint(0, length - 1)
187
+ return line[:idx] + self.get_char() + line[idx:]
188
+
189
+ class PunctuationRemover(IProcess):
190
+ def __init__(self) -> None:
191
+ super().__init__()
192
+ self.clean_punctuation = re.compile(r"(?<!\d)[.,;:'?!،.؟؛:»«](?!\d)")
193
+
194
+ def __remove_punctuation(self, text: str):
195
+ """Remove all punctuation from string, except if it's between digits"""
196
+ return self.clean_punctuation.sub("", text)
197
+
198
+ def execute(self, line: str):
199
+ return self.__remove_punctuation(line)
200
+
201
+
202
+ class RandomCharsSwapper(IProcess):
203
+
204
+ def execute(self, line: str) -> str:
205
+ length = len(line)
206
+ idx = random.randint(0, length - 2)
207
+ return line[:idx] + line[idx + 1] + line[idx] + line[idx + 2:]
208
+
209
+
210
+ class RandomCharRemover(IProcess):
211
+
212
+ def execute(self, line: str) -> str:
213
+ length = len(line)
214
+ idx = random.randint(0, length - 1)
215
+ return line[:idx] + line[idx + 1:]
216
+
217
+
218
+ class RandomWordsCollapsor(IProcess):
219
+
220
+ def execute(self, line: str) -> str:
221
+ indices = [
222
+ i for i, char in enumerate(line)
223
+ if char == ' '
224
+ ]
225
+ if len(indices) == 0:
226
+ return line
227
+ idx = random.choice(indices)
228
+ return line[: idx] + line[idx + 1:]
229
+
230
+
231
+ class RandomNeighborReplacer(IProcess):
232
+
233
+ def __init__(self, keyboard_rows: List[str], blank: str) -> None:
234
+ super().__init__()
235
+ self.lines = keyboard_rows
236
+ self.blank = blank
237
+ self.n_rows = len(keyboard_rows)
238
+ self._mapper = {}
239
+ self.set_mapper()
240
+
241
+ def __get_left(
242
+ self, row_idx: int, col_idx: int
243
+ ) -> List[str]:
244
+ if col_idx == 0:
245
+ return []
246
+ return [self.lines[row_idx][col_idx - 1]]
247
+
248
+ def __get_right(
249
+ self, row_idx: int, col_idx: int
250
+ ) -> List[str]:
251
+ if col_idx == (len(self.lines[row_idx]) - 1):
252
+ return []
253
+ return self.lines[row_idx][col_idx + 1]
254
+
255
+ def __get_upper(
256
+ self, row_idx: int, col_idx: int
257
+ ) -> List[str]:
258
+ if row_idx == 0:
259
+ return []
260
+ line = self.lines[row_idx - 1]
261
+ start = max(0, col_idx - 1)
262
+ end = min(len(line), col_idx + 2)
263
+ return list(line[start: end])
264
+
265
+ def __get_lower(
266
+ self, row_idx: int, col_idx: int
267
+ ) -> List[str]:
268
+ if row_idx == (self.n_rows - 1):
269
+ return []
270
+ line = self.lines[row_idx + 1]
271
+ start = max(0, col_idx - 1)
272
+ end = min(len(line), col_idx + 2)
273
+ return list(line[start: end])
274
+
275
+ def set_mapper(self) -> None:
276
+ funcs = [
277
+ self.__get_left,
278
+ self.__get_right,
279
+ self.__get_upper,
280
+ self.__get_lower
281
+ ]
282
+ for row_idx in range(self.n_rows):
283
+ for col_idx in range(len(self.lines[row_idx])):
284
+ items = []
285
+ for func in funcs:
286
+ items.extend(func(row_idx, col_idx))
287
+ items = list(
288
+ filter(lambda x: x != self.blank, items)
289
+ )
290
+ char = self.lines[row_idx][col_idx]
291
+ self._mapper[char] = items.copy()
292
+
293
+ def get_char(self, char: str) -> str:
294
+ if char not in self._mapper:
295
+ return char
296
+ return random.choice(self._mapper[char])
297
+
298
+ def execute(self, line: str) -> str:
299
+ length = len(line)
300
+ idx = random.randint(0, length - 1)
301
+ return line[:idx] + self.get_char(line[idx]) + line[idx + 1:]
302
+
303
+
304
+ class CharsNormalizer(IProcess):
305
+
306
+ def __init__(self, mapper: dict) -> None:
307
+ super().__init__()
308
+ self.mapper = mapper
309
+
310
+ def _normalize(self, line: str) -> str:
311
+ for key, value in self.mapper.items():
312
+ line = line.replace(key, value)
313
+ return line
314
+
315
+ def execute(self, lines: List[str]):
316
+ return list(map(self._normalize, lines))
317
+
318
+ class SentencePermutation(IProcess):
319
+
320
+ def __init__(self, sentences: List[str], augmentation_probability: float = 1) -> None:
321
+ super().__init__()
322
+ self.sentences = sentences
323
+ self.augmentation_probability = augmentation_probability
324
+
325
+ def _combine(self, text: str) -> str:
326
+ if random.random() < self.augmentation_probability:
327
+ sentences_to_sample = random.randint(0,10)
328
+ augmentation_sentences = random.sample(self.sentences, sentences_to_sample)
329
+ return text + " " + " ".join(augmentation_sentences)
330
+ else:
331
+ return text
332
+
333
+ def execute(self, line: str) -> str:
334
+ # return [self._combine(line) for line in lines]
335
+ return self._combine(line)
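
A short usage sketch chaining a few of the single-line processes above (each execute takes one string and returns one string); the sample sentence is a placeholder.

import random
from processes import PunctuationRemover, RandomCharRemover, RandomCharsSwapper

random.seed(0)
line = "ژیان جوانە، بەڵام کورتە."  # placeholder CKB sentence
for proc in (RandomCharsSwapper(), RandomCharRemover(), PunctuationRemover()):
    line = proc.execute(line)
print(line)
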
prepare_data/processors.py ADDED
@@ -0,0 +1,125 @@
1
+ from threading import Thread
2
+ import constants
3
+ from pathlib import Path
4
+ import random
5
+ from typing import Union, Any, List
6
+ from interfaces import IProcess, IProcessor
7
+ from processes import (
8
+ RandomCharRemover,
9
+ RandomCharsInjector,
10
+ RandomCharsSwapper,
11
+ RandomNeighborReplacer,
12
+ RandomWordsCollapsor,
13
+ PunctuationRemover,
14
+ SentencePermutation,
15
+ )
16
+
17
+
18
+ class FilesProcessor(IProcessor):
19
+ def __init__(
20
+ self, processes: List[IProcess],
21
+ n_dist: int = 32
22
+ ) -> None:
23
+ self.processes = processes
24
+ self.n_dist = n_dist
25
+ self.__dist = False
26
+ self.__cache = []
27
+
28
+ def file_run(self, file: Union[str, Path]) -> Any:
29
+ result = file
30
+ for process in self.processes:
31
+ result = process.execute(result)
32
+ return result
33
+
34
+ def run(
35
+ self,
36
+ files: List[Union[str, Path]]
37
+ ) -> Any:
38
+ result = list(map(self.file_run, files))
39
+ if self.__dist is True:
40
+ self.__cache.append(result)
41
+ return
42
+ return result
43
+
44
+ def _divde(self, data: List[Any]):
45
+ items_per_div = len(data) // self.n_dist
46
+ divs = []
47
+ for i in range(self.n_dist):
48
+ start = i * items_per_div
49
+ end = (i + 1) * items_per_div
50
+ if i == (self.n_dist - 1):
51
+ end = len(data)
52
+ divs.append(data[start: end])
53
+ return divs
54
+
55
+ def dist_run(
56
+ self,
57
+ files: List[Union[str, Path]]
58
+ ) -> Any:
59
+ self.__dist = True
60
+ self.__cache = []
61
+ divs = self._divde(files)
62
+ threads = []
63
+ for div in divs:
64
+ t = Thread(target=self.run, args=(div,))
65
+ t.start()
66
+ threads.append(t)
67
+ for t in threads:
68
+ t.join()
69
+ self.__dist = False
70
+ results = []
71
+ for item in self.__cache:
72
+ results.extend(item)
73
+ self.__cache = []
74
+ return results
75
+
76
+
77
+ class TextDistorter(IProcessor):
78
+ def __init__(
79
+ self, ratio: float, processes: List[IProcess]
80
+ ) -> None:
81
+ super().__init__()
82
+ self.ratio = ratio
83
+ self.processes = processes
84
+
85
+ def run(self, line: str) -> str:
86
+ length = len(line)
87
+ n = int(self.ratio * length)
88
+ for _ in range(n):
89
+ line = random.choice(self.processes).execute(line)
90
+ return line
91
+
92
+ def dist_run(self):
93
+ # TODO
94
+ pass
95
+
96
+
97
+ class TextProcessor(IProcessor):
98
+ def __init__(self, processes: List[IProcess]) -> None:
99
+ super().__init__()
100
+ self.processes = processes
101
+
102
+ def run(self, sentence: str):
103
+ for process in self.processes:
104
+ sentence = process.execute(sentence)
105
+ return sentence
106
+
107
+ def dist_run(self, sentence: str) -> str:
108
+ return self.run(sentence)
109
+
110
+
111
+ def get_text_distorter(ratio, sentences: List[str]):
112
+
113
+ return TextDistorter(
114
+ ratio=ratio,
115
+ processes=[
116
+ SentencePermutation(sentences),
117
+ RandomCharsInjector(constants.KURDISH_CHARS),
118
+ RandomCharsSwapper(),
119
+ RandomCharRemover(),
120
+ RandomWordsCollapsor(),
121
+ RandomNeighborReplacer(
122
+ constants.KEYBOARD_KEYS, constants.KEYBOARD_BLANK
123
+ )
124
+ ]
125
+ )
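
A hedged end-to-end sketch of get_text_distorter as used in process_data.main(); the ratio and sample lines are illustrative, and the pool is made larger than ten items because SentencePermutation samples up to ten sentences from it.

from processors import get_text_distorter

clean = ["دەقی یەکەم.", "دەقی دووەم.", "دەقی سێیەم."]  # placeholder lines
pool = clean * 4  # SentencePermutation may sample up to 10 sentences
distorter = get_text_distorter(0.1, pool)
for line in clean:
    print(line, "->", distorter.run(line))
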
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ accelerate >= 0.12.0
2
+ datasets >= 1.8.0
3
+ sentencepiece != 0.1.92
4
+ protobuf
5
+ nltk
6
+ py7zr
7
+ torch >= 2.0.1
8
+ evaluate
9
+ jiwer
run_summarization.py ADDED
@@ -0,0 +1,793 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for sequence to sequence.
18
+ """
19
+ # You can also adapt this script to your own sequence to sequence task. Pointers for this are left as comments.
20
+
21
+ import logging
22
+ import os
23
+ import sys
24
+ import warnings
25
+ from dataclasses import dataclass, field
26
+ from typing import Optional
27
+
28
+ import datasets
29
+ import evaluate
30
+ import nltk # Here to have a nice missing dependency error message early on
31
+ import numpy as np
32
+ from datasets import load_dataset
33
+ from filelock import FileLock
34
+
35
+ import transformers
36
+ from transformers import (
37
+ AutoConfig,
38
+ AutoModelForSeq2SeqLM,
39
+ AutoTokenizer,
40
+ DataCollatorForSeq2Seq,
41
+ HfArgumentParser,
42
+ MBart50Tokenizer,
43
+ MBart50TokenizerFast,
44
+ MBartTokenizer,
45
+ MBartTokenizerFast,
46
+ Seq2SeqTrainer,
47
+ Seq2SeqTrainingArguments,
48
+ set_seed,
49
+ )
50
+ from transformers.trainer_utils import get_last_checkpoint
51
+ from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry
52
+ from transformers.utils.versions import require_version
53
+
54
+
55
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
56
+ check_min_version("4.33.0.dev0")
57
+
58
+ require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+ try:
63
+ nltk.data.find("tokenizers/punkt")
64
+ except (LookupError, OSError):
65
+ if is_offline_mode():
66
+ raise LookupError(
67
+ "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
68
+ )
69
+ with FileLock(".lock") as lock:
70
+ nltk.download("punkt", quiet=True)
71
+
72
+ # A list of all multilingual tokenizers which require the lang attribute.
73
+ MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast]
74
+
75
+
76
+ @dataclass
77
+ class ModelArguments:
78
+ """
79
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
80
+ """
81
+
82
+ model_name_or_path: str = field(
83
+ metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
84
+ )
85
+ config_name: Optional[str] = field(
86
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
87
+ )
88
+ tokenizer_name: Optional[str] = field(
89
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
90
+ )
91
+ cache_dir: Optional[str] = field(
92
+ default=None,
93
+ metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
94
+ )
95
+ use_fast_tokenizer: bool = field(
96
+ default=True,
97
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
98
+ )
99
+ model_revision: str = field(
100
+ default="main",
101
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
102
+ )
103
+ token: str = field(
104
+ default=None,
105
+ metadata={
106
+ "help": (
107
+ "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
108
+ "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
109
+ )
110
+ },
111
+ )
112
+ use_auth_token: bool = field(
113
+ default=None,
114
+ metadata={
115
+ "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
116
+ },
117
+ )
118
+ trust_remote_code: bool = field(
119
+ default=False,
120
+ metadata={
121
+ "help": (
122
+ "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
123
+ "should only be set to `True` for repositories you trust and in which you have read the code, as it will"
124
+ "execute code present on the Hub on your local machine."
125
+ )
126
+ },
127
+ )
128
+ resize_position_embeddings: Optional[bool] = field(
129
+ default=None,
130
+ metadata={
131
+ "help": (
132
+ "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
133
+ "the model's position embeddings."
134
+ )
135
+ },
136
+ )
137
+
138
+
139
+ @dataclass
140
+ class DataTrainingArguments:
141
+ """
142
+ Arguments pertaining to what data we are going to input our model for training and eval.
143
+ """
144
+
145
+ lang: Optional[str] = field(default=None, metadata={"help": "Language id for summarization."})
146
+
147
+ dataset_name: Optional[str] = field(
148
+ default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
149
+ )
150
+ dataset_config_name: Optional[str] = field(
151
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
152
+ )
153
+ text_column: Optional[str] = field(
154
+ default=None,
155
+ metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
156
+ )
157
+ summary_column: Optional[str] = field(
158
+ default=None,
159
+ metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
160
+ )
161
+ train_file: Optional[str] = field(
162
+ default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
163
+ )
164
+ validation_file: Optional[str] = field(
165
+ default=None,
166
+ metadata={
167
+ "help": (
168
+ "An optional input evaluation data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
169
+ )
170
+ },
171
+ )
172
+ test_file: Optional[str] = field(
173
+ default=None,
174
+ metadata={
175
+ "help": "An optional input test data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
176
+ },
177
+ )
178
+ overwrite_cache: bool = field(
179
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
180
+ )
181
+ preprocessing_num_workers: Optional[int] = field(
182
+ default=None,
183
+ metadata={"help": "The number of processes to use for the preprocessing."},
184
+ )
185
+ max_source_length: Optional[int] = field(
186
+ default=1024,
187
+ metadata={
188
+ "help": (
189
+ "The maximum total input sequence length after tokenization. Sequences longer "
190
+ "than this will be truncated, sequences shorter will be padded."
191
+ )
192
+ },
193
+ )
194
+ max_target_length: Optional[int] = field(
195
+ default=128,
196
+ metadata={
197
+ "help": (
198
+ "The maximum total sequence length for target text after tokenization. Sequences longer "
199
+ "than this will be truncated, sequences shorter will be padded."
200
+ )
201
+ },
202
+ )
203
+ val_max_target_length: Optional[int] = field(
204
+ default=None,
205
+ metadata={
206
+ "help": (
207
+ "The maximum total sequence length for validation target text after tokenization. Sequences longer "
208
+ "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
209
+ "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
210
+ "during ``evaluate`` and ``predict``."
211
+ )
212
+ },
213
+ )
214
+ pad_to_max_length: bool = field(
215
+ default=False,
216
+ metadata={
217
+ "help": (
218
+ "Whether to pad all samples to model maximum sentence length. "
219
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
220
+ "efficient on GPU but very bad for TPU."
221
+ )
222
+ },
223
+ )
224
+ max_train_samples: Optional[int] = field(
225
+ default=None,
226
+ metadata={
227
+ "help": (
228
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
229
+ "value if set."
230
+ )
231
+ },
232
+ )
233
+ max_eval_samples: Optional[int] = field(
234
+ default=None,
235
+ metadata={
236
+ "help": (
237
+ "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
238
+ "value if set."
239
+ )
240
+ },
241
+ )
242
+ max_predict_samples: Optional[int] = field(
243
+ default=None,
244
+ metadata={
245
+ "help": (
246
+ "For debugging purposes or quicker training, truncate the number of prediction examples to this "
247
+ "value if set."
248
+ )
249
+ },
250
+ )
251
+ num_beams: Optional[int] = field(
252
+ default=None,
253
+ metadata={
254
+ "help": (
255
+ "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
256
+ "which is used during ``evaluate`` and ``predict``."
257
+ )
258
+ },
259
+ )
260
+ ignore_pad_token_for_loss: bool = field(
261
+ default=True,
262
+ metadata={
263
+ "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
264
+ },
265
+ )
266
+ source_prefix: Optional[str] = field(
267
+ default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
268
+ )
269
+
270
+ forced_bos_token: Optional[str] = field(
271
+ default=None,
272
+ metadata={
273
+ "help": (
274
+ "The token to force as the first generated token after the decoder_start_token_id."
275
+ "Useful for multilingual models like mBART where the first generated token"
276
+ "needs to be the target language token (Usually it is the target language token)"
277
+ )
278
+ },
279
+ )
280
+
281
+ def __post_init__(self):
282
+ if (
283
+ self.dataset_name is None
284
+ and self.train_file is None
285
+ and self.validation_file is None
286
+ and self.test_file is None
287
+ ):
288
+ raise ValueError("Need either a dataset name or a training, validation, or test file.")
289
+ else:
290
+ if self.train_file is not None:
291
+ extension = self.train_file.split(".")[-1]
292
+ assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
293
+ if self.validation_file is not None:
294
+ extension = self.validation_file.split(".")[-1]
295
+ assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
296
+ if self.test_file is not None:
297
+ extension = self.test_file.split(".")[-1]
298
+ assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
299
+ if self.val_max_target_length is None:
300
+ self.val_max_target_length = self.max_target_length
301
+
302
+
303
+ summarization_name_mapping = {
304
+ "amazon_reviews_multi": ("review_body", "review_title"),
305
+ "big_patent": ("description", "abstract"),
306
+ "cnn_dailymail": ("article", "highlights"),
307
+ "orange_sum": ("text", "summary"),
308
+ "pn_summary": ("article", "summary"),
309
+ "psc": ("extract_text", "summary_text"),
310
+ "samsum": ("dialogue", "summary"),
311
+ "thaisum": ("body", "summary"),
312
+ "xglue": ("news_body", "news_title"),
313
+ "xsum": ("document", "summary"),
314
+ "wiki_summary": ("article", "highlights"),
315
+ "multi_news": ("document", "summary"),
316
+ }
317
+
318
+
319
+ def main():
320
+ # See all possible arguments in src/transformers/training_args.py
321
+ # or by passing the --help flag to this script.
322
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
323
+
324
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
325
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
326
+ # If we pass only one argument to the script and it's the path to a json file,
327
+ # let's parse it to get our arguments.
328
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
329
+ else:
330
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
331
+
332
+ if model_args.use_auth_token is not None:
333
+ warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
334
+ if model_args.token is not None:
335
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
336
+ model_args.token = model_args.use_auth_token
337
+
338
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
339
+ # information sent is the one passed as arguments along with your Python/PyTorch versions.
340
+ send_example_telemetry("run_summarization", model_args, data_args)
341
+
342
+ # Setup logging
343
+ logging.basicConfig(
344
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
345
+ datefmt="%m/%d/%Y %H:%M:%S",
346
+ handlers=[logging.StreamHandler(sys.stdout)],
347
+ )
348
+
349
+ if training_args.should_log:
350
+ # The default of training_args.log_level is passive, so we set log level at info here to have that default.
351
+ transformers.utils.logging.set_verbosity_info()
352
+
353
+ log_level = training_args.get_process_log_level()
354
+ logger.setLevel(log_level)
355
+ datasets.utils.logging.set_verbosity(log_level)
356
+ transformers.utils.logging.set_verbosity(log_level)
357
+ transformers.utils.logging.enable_default_handler()
358
+ transformers.utils.logging.enable_explicit_format()
359
+
360
+ # Log on each process the small summary:
361
+ logger.warning(
362
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
363
+ + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
364
+ )
365
+ logger.info(f"Training/evaluation parameters {training_args}")
366
+
367
+ if data_args.source_prefix is None and model_args.model_name_or_path in [
368
+ "t5-small",
369
+ "t5-base",
370
+ "t5-large",
371
+ "t5-3b",
372
+ "t5-11b",
373
+ ]:
374
+ logger.warning(
375
+ "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
376
+ "`--source_prefix 'summarize: ' `"
377
+ )
378
+
379
+ # Detecting last checkpoint.
380
+ last_checkpoint = None
381
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
382
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
383
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
384
+ raise ValueError(
385
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
386
+ "Use --overwrite_output_dir to overcome."
387
+ )
388
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
389
+ logger.info(
390
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
391
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
392
+ )
393
+
394
+ # Set seed before initializing model.
395
+ set_seed(training_args.seed)
396
+
397
+ # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
398
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
399
+ # (the dataset will be downloaded automatically from the datasets Hub).
400
+ #
401
+ # For CSV/JSON files this script will use the first column for the full texts and the second column for the
402
+ # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments).
403
+ #
404
+ # In distributed training, the load_dataset function guarantee that only one local process can concurrently
405
+ # download the dataset.
406
+ if data_args.dataset_name is not None:
407
+ # Downloading and loading a dataset from the hub.
408
+ raw_datasets = load_dataset(
409
+ data_args.dataset_name,
410
+ data_args.dataset_config_name,
411
+ cache_dir=model_args.cache_dir,
412
+ token=model_args.token,
413
+ )
414
+ else:
415
+ data_files = {}
416
+ if data_args.train_file is not None:
417
+ data_files["train"] = data_args.train_file
418
+ extension = data_args.train_file.split(".")[-1]
419
+ if data_args.validation_file is not None:
420
+ data_files["validation"] = data_args.validation_file
421
+ extension = data_args.validation_file.split(".")[-1]
422
+ if data_args.test_file is not None:
423
+ data_files["test"] = data_args.test_file
424
+ extension = data_args.test_file.split(".")[-1]
425
+ raw_datasets = load_dataset(
426
+ extension,
427
+ data_files=data_files,
428
+ cache_dir=model_args.cache_dir,
429
+ token=model_args.token,
430
+ )
431
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
432
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
433
+
434
+ # Load pretrained model and tokenizer
435
+ #
436
+ # Distributed training:
437
+ # The .from_pretrained methods guarantee that only one local process can concurrently
438
+ # download model & vocab.
439
+ config = AutoConfig.from_pretrained(
440
+ model_args.config_name if model_args.config_name else model_args.model_name_or_path,
441
+ cache_dir=model_args.cache_dir,
442
+ revision=model_args.model_revision,
443
+ token=model_args.token,
444
+ trust_remote_code=model_args.trust_remote_code,
445
+ )
446
+ tokenizer = AutoTokenizer.from_pretrained(
447
+ model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
448
+ cache_dir=model_args.cache_dir,
449
+ use_fast=model_args.use_fast_tokenizer,
450
+ revision=model_args.model_revision,
451
+ token=model_args.token,
452
+ trust_remote_code=model_args.trust_remote_code,
453
+ )
454
+ model = AutoModelForSeq2SeqLM.from_pretrained(
455
+ model_args.model_name_or_path,
456
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
457
+ config=config,
458
+ cache_dir=model_args.cache_dir,
459
+ revision=model_args.model_revision,
460
+ token=model_args.token,
461
+ trust_remote_code=model_args.trust_remote_code,
462
+ )
463
+
464
+ # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
465
+ # on a small vocab and want a smaller embedding size, remove this test.
466
+ embedding_size = model.get_input_embeddings().weight.shape[0]
467
+ if len(tokenizer) > embedding_size:
468
+ model.resize_token_embeddings(len(tokenizer))
469
+
470
+ if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
471
+ if isinstance(tokenizer, MBartTokenizer):
472
+ model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.lang]
473
+ else:
474
+ model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.lang)
475
+
476
+ if model.config.decoder_start_token_id is None:
477
+ raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
478
+
479
+ if (
480
+ hasattr(model.config, "max_position_embeddings")
481
+ and model.config.max_position_embeddings < data_args.max_source_length
482
+ ):
483
+ if model_args.resize_position_embeddings is None:
484
+ logger.warning(
485
+ "Increasing the model's number of position embedding vectors from"
486
+ f" {model.config.max_position_embeddings} to {data_args.max_source_length}."
487
+ )
488
+ model.resize_position_embeddings(data_args.max_source_length)
489
+ elif model_args.resize_position_embeddings:
490
+ model.resize_position_embeddings(data_args.max_source_length)
491
+ else:
492
+ raise ValueError(
493
+ f"`--max_source_length` is set to {data_args.max_source_length}, but the model only has"
494
+ f" {model.config.max_position_embeddings} position encodings. Consider either reducing"
495
+ f" `--max_source_length` to {model.config.max_position_embeddings} or to automatically resize the"
496
+ " model's position encodings by passing `--resize_position_embeddings`."
497
+ )
498
+
499
+ prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
500
+
501
+ # Preprocessing the datasets.
502
+ # We need to tokenize inputs and targets.
503
+ if training_args.do_train:
504
+ if "train" not in raw_datasets:
505
+ raise ValueError("--do_train requires a train dataset")
506
+ column_names = raw_datasets["train"].column_names
507
+ elif training_args.do_eval:
508
+ if "validation" not in raw_datasets:
509
+ raise ValueError("--do_eval requires a validation dataset")
510
+ column_names = raw_datasets["validation"].column_names
511
+ elif training_args.do_predict:
512
+ if "test" not in raw_datasets:
513
+ raise ValueError("--do_predict requires a test dataset")
514
+ column_names = raw_datasets["test"].column_names
515
+ else:
516
+ logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
517
+ return
518
+
519
+ if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
520
+ assert (
521
+ data_args.lang is not None
522
+ ), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
523
+
524
+ tokenizer.src_lang = data_args.lang
525
+ tokenizer.tgt_lang = data_args.lang
526
+
527
+ # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token
528
+ # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument.
529
+ forced_bos_token_id = (
530
+ tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None
531
+ )
532
+ model.config.forced_bos_token_id = forced_bos_token_id
533
+
534
+ # Get the column names for input/target.
535
+ dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None)
536
+ if data_args.text_column is None:
537
+ text_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
538
+ else:
539
+ text_column = data_args.text_column
540
+ if text_column not in column_names:
541
+ raise ValueError(
542
+ f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}"
543
+ )
544
+ if data_args.summary_column is None:
545
+ summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
546
+ else:
547
+ summary_column = data_args.summary_column
548
+ if summary_column not in column_names:
549
+ raise ValueError(
550
+ f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}"
551
+ )
552
+
553
+ # Temporarily set max_target_length for training.
554
+ max_target_length = data_args.max_target_length
555
+ padding = "max_length" if data_args.pad_to_max_length else False
556
+
557
+ if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
558
+ logger.warning(
559
+ "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
560
+ f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
561
+ )
562
+
563
+ def preprocess_function(examples):
564
+ # remove pairs where at least one record is None
565
+
566
+ inputs, targets = [], []
567
+ for i in range(len(examples[text_column])):
568
+ if examples[text_column][i] and examples[summary_column][i]:
569
+ inputs.append(examples[text_column][i])
570
+ targets.append(examples[summary_column][i])
571
+
572
+ inputs = [prefix + inp for inp in inputs]
573
+ model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
574
+
575
+ # Tokenize targets with the `text_target` keyword argument
576
+ labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
577
+
578
+ # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
579
+ # padding in the loss.
580
+ if padding == "max_length" and data_args.ignore_pad_token_for_loss:
581
+ labels["input_ids"] = [
582
+ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
583
+ ]
584
+
585
+ model_inputs["labels"] = labels["input_ids"]
586
+ return model_inputs
587
+
588
+ if training_args.do_train:
589
+ train_dataset = raw_datasets["train"]
590
+ if data_args.max_train_samples is not None:
591
+ max_train_samples = min(len(train_dataset), data_args.max_train_samples)
592
+ train_dataset = train_dataset.select(range(max_train_samples))
593
+ with training_args.main_process_first(desc="train dataset map pre-processing"):
594
+ train_dataset = train_dataset.map(
595
+ preprocess_function,
596
+ batched=True,
597
+ num_proc=data_args.preprocessing_num_workers,
598
+ remove_columns=column_names,
599
+ load_from_cache_file=not data_args.overwrite_cache,
600
+ desc="Running tokenizer on train dataset",
601
+ )
602
+
603
+ if training_args.do_eval:
604
+ max_target_length = data_args.val_max_target_length
605
+ eval_dataset = raw_datasets["validation"]
606
+ if data_args.max_eval_samples is not None:
607
+ max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
608
+ eval_dataset = eval_dataset.select(range(max_eval_samples))
609
+ with training_args.main_process_first(desc="validation dataset map pre-processing"):
610
+ eval_dataset = eval_dataset.map(
611
+ preprocess_function,
612
+ batched=True,
613
+ num_proc=data_args.preprocessing_num_workers,
614
+ remove_columns=column_names,
615
+ load_from_cache_file=not data_args.overwrite_cache,
616
+ desc="Running tokenizer on validation dataset",
617
+ )
618
+
619
+ if training_args.do_predict:
620
+ max_target_length = data_args.val_max_target_length
621
+ predict_dataset = raw_datasets["test"]
622
+ if data_args.max_predict_samples is not None:
623
+ max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
624
+ predict_dataset = predict_dataset.select(range(max_predict_samples))
625
+ with training_args.main_process_first(desc="prediction dataset map pre-processing"):
626
+ predict_dataset = predict_dataset.map(
627
+ preprocess_function,
628
+ batched=True,
629
+ num_proc=data_args.preprocessing_num_workers,
630
+ remove_columns=column_names,
631
+ load_from_cache_file=not data_args.overwrite_cache,
632
+ desc="Running tokenizer on prediction dataset",
633
+ )
634
+
635
+ # Data collator
636
+ label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
637
+ data_collator = DataCollatorForSeq2Seq(
638
+ tokenizer,
639
+ model=model,
640
+ label_pad_token_id=label_pad_token_id,
641
+ pad_to_multiple_of=8 if training_args.fp16 else None,
642
+ )
643
+
644
+ # Metric
645
+ cer = evaluate.load("cer")
646
+ wer = evaluate.load("wer")
647
+ bleu = evaluate.load("bleu")
648
+ chrf = evaluate.load("chrf")
649
+
650
+
651
+ def postprocess_text(preds, labels):
652
+ preds = [pred.strip() for pred in preds]
653
+ labels = [label.strip() for label in labels]
654
+
655
+ # rougeLSum expects newline after each sentence
656
+ preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
657
+ labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
658
+
659
+ return preds, labels
660
+
661
+ def compute_metrics(eval_preds):
662
+ preds, labels = eval_preds
663
+ if isinstance(preds, tuple):
664
+ preds = preds[0]
665
+ # Replace -100s used for padding as we can't decode them
666
+ preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
667
+ decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
668
+ labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
669
+ decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
670
+
671
+ # Some simple post-processing
672
+ decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
673
+ result = {}
674
+ result['cer'] = cer.compute(predictions=decoded_preds, references=decoded_labels)
675
+ result['wer'] = wer.compute(predictions=decoded_preds, references=decoded_labels)
676
+ result['bleu'] = bleu.compute(predictions=decoded_preds, references=decoded_labels)['bleu']
677
+ result['chrF'] = chrf.compute(predictions=decoded_preds, references=decoded_labels)['score']
678
+ result = {k: v if k == 'chrF' else round(v * 100, 4) for k, v in result.items()}
679
+
680
+ prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
681
+ result["gen_len"] = np.mean(prediction_lens)
682
+ return result
683
+
684
+ # Override the decoding parameters of Seq2SeqTrainer
685
+ training_args.generation_max_length = (
686
+ training_args.generation_max_length
687
+ if training_args.generation_max_length is not None
688
+ else data_args.val_max_target_length
689
+ )
690
+ training_args.generation_num_beams = (
691
+ data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams
692
+ )
693
+
694
+ # Initialize our Trainer
695
+ trainer = Seq2SeqTrainer(
696
+ model=model,
697
+ args=training_args,
698
+ train_dataset=train_dataset if training_args.do_train else None,
699
+ eval_dataset=eval_dataset if training_args.do_eval else None,
700
+ tokenizer=tokenizer,
701
+ data_collator=data_collator,
702
+ compute_metrics=compute_metrics if training_args.predict_with_generate else None,
703
+ )
704
+
705
+ # Training
706
+ if training_args.do_train:
707
+ checkpoint = None
708
+ if training_args.resume_from_checkpoint is not None:
709
+ checkpoint = training_args.resume_from_checkpoint
710
+ elif last_checkpoint is not None:
711
+ checkpoint = last_checkpoint
712
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
713
+ trainer.save_model() # Saves the tokenizer too for easy upload
714
+
715
+ metrics = train_result.metrics
716
+ max_train_samples = (
717
+ data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
718
+ )
719
+ metrics["train_samples"] = min(max_train_samples, len(train_dataset))
720
+
721
+ trainer.log_metrics("train", metrics)
722
+ trainer.save_metrics("train", metrics)
723
+ trainer.save_state()
724
+
725
+ # Evaluation
726
+ results = {}
727
+ if training_args.do_eval:
728
+ logger.info("*** Evaluate ***")
729
+ if isinstance(eval_dataset, dict):
730
+ metrics = {}
731
+ for eval_ds_name, eval_ds in eval_dataset.items():
732
+ dataset_metrics = trainer.evaluate(eval_dataset=eval_ds, metric_key_prefix=f"eval_{eval_ds_name}")
733
+ metrics.update(dataset_metrics)
734
+ else:
735
+ metrics = trainer.evaluate(metric_key_prefix="eval")
736
+ max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
737
+ metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
738
+
739
+ trainer.log_metrics("eval", metrics)
740
+ trainer.save_metrics("eval", metrics)
741
+
742
+ if training_args.do_predict:
743
+ logger.info("*** Predict ***")
744
+
745
+ predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict")
746
+ metrics = predict_results.metrics
747
+ max_predict_samples = (
748
+ data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
749
+ )
750
+ metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
751
+
752
+ trainer.log_metrics("predict", metrics)
753
+ trainer.save_metrics("predict", metrics)
754
+
755
+ if trainer.is_world_process_zero():
756
+ if training_args.predict_with_generate:
757
+ predictions = predict_results.predictions
758
+ predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
759
+ predictions = tokenizer.batch_decode(
760
+ predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
761
+ )
762
+ predictions = [pred.strip() for pred in predictions]
763
+ output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
764
+ with open(output_prediction_file, "w") as writer:
765
+ writer.write("\n".join(predictions))
766
+
767
+ kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"}
768
+ if data_args.dataset_name is not None:
769
+ kwargs["dataset_tags"] = data_args.dataset_name
770
+ if data_args.dataset_config_name is not None:
771
+ kwargs["dataset_args"] = data_args.dataset_config_name
772
+ kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
773
+ else:
774
+ kwargs["dataset"] = data_args.dataset_name
775
+
776
+ if data_args.lang is not None:
777
+ kwargs["language"] = data_args.lang
778
+
779
+ if training_args.push_to_hub:
780
+ trainer.push_to_hub(**kwargs)
781
+ else:
782
+ trainer.create_model_card(**kwargs)
783
+
784
+ return results
785
+
786
+
787
+ def _mp_fn(index):
788
+ # For xla_spawn (TPUs)
789
+ main()
790
+
791
+
792
+ if __name__ == "__main__":
793
+ main()
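
Note (not part of the committed files): the compute_metrics block above scales CER, WER and BLEU to percentages while chrF is already reported on a 0-100 scale by `evaluate`. A minimal, hypothetical sanity check of that scaling with toy strings is sketched below; it uses the same metric names the script loads, wraps the references in single-reference lists for BLEU/chrF (the documented format, whereas the script passes flat string lists), and assumes `evaluate`, `jiwer`, `sacrebleu` and `nltk` are installed.

import evaluate

cer = evaluate.load("cer")
wer = evaluate.load("wer")
bleu = evaluate.load("bleu")
chrf = evaluate.load("chrf")

# Toy predictions/references (placeholder strings, not data from this repo)
preds = ["this is a tset sentence", "another example"]
refs = ["this is a test sentence", "another example"]

result = {
    "cer": cer.compute(predictions=preds, references=refs),
    "wer": wer.compute(predictions=preds, references=refs),
    "bleu": bleu.compute(predictions=preds, references=[[r] for r in refs])["bleu"],
    "chrF": chrf.compute(predictions=preds, references=[[r] for r in refs])["score"],
}
# Same scaling as compute_metrics: chrF keeps its 0-100 scale, the rest become percentages
result = {k: v if k == "chrF" else round(v * 100, 4) for k, v in result.items()}
print(result)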
train.sh ADDED
@@ -0,0 +1,59 @@
+# Train BART
+python run_summarization.py \
+    --model_name_or_path "facebook/bart-base" \
+    --config_name "facebook/bart-base" \
+    --tokenizer_name ./tokenizer \
+    --do_train \
+    --do_eval \
+    --evaluation_strategy="epoch" \
+    --group_by_length \
+    --num_train_epochs=10 \
+    --train_file train.csv \
+    --validation_file test.csv \
+    --preprocessing_num_workers="20" \
+    --output_dir ./bart-kurd-spell-base/ \
+    --overwrite_output_dir \
+    --per_device_train_batch_size=320 \
+    --per_device_eval_batch_size=256 \
+    --gradient_accumulation_steps=1 \
+    --predict_with_generate \
+    --logging_steps="100" \
+    --save_total_limit="1" \
+    --save_strategy="epoch" \
+    --report_to="wandb" \
+    --run_name="Bart Spell" \
+    --max_target_length=1024 \
+    --max_source_length=1024 \
+    --fp16 \
+    --save_safetensors \
+    --push_to_hub
+
+# Train T5
+# python3 run_summarization.py \
+#     --source_prefix "correct: " \
+#     --model_name_or_path "google/flan-t5-small" \
+#     --config_name "google/flan-t5-small" \
+#     --tokenizer_name ./tokenizer \
+#     --do_train \
+#     --do_eval \
+#     --evaluation_strategy="epoch" \
+#     --group_by_length \
+#     --num_train_epochs=5 \
+#     --train_file train.csv \
+#     --validation_file test.csv \
+#     --preprocessing_num_workers="12" \
+#     --output_dir ./t5-kurd-spell-base/ \
+#     --overwrite_output_dir \
+#     --per_device_train_batch_size=64 \
+#     --per_device_eval_batch_size=64 \
+#     --gradient_accumulation_steps=1 \
+#     --predict_with_generate \
+#     --logging_steps="100" \
+#     --save_total_limit="1" \
+#     --save_strategy="epoch" \
+#     --report_to="none" \
+#     --run_name="T5 Spell" \
+#     --max_target_length=1024 \
+#     --max_source_length=1024 \
+#     --push_to_hub
+# # --fp16 \
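
Note (not part of the committed files): once train.sh finishes, the fine-tuned BART checkpoint ends up in the --output_dir shown above. A hypothetical inference sketch is given below; the local path and the max_length value are taken from the flags in train.sh and are assumptions, not something the repo itself ships.

from transformers import pipeline

# Assumes training completed and the checkpoint directory from --output_dir exists locally
corrector = pipeline("text2text-generation", model="./bart-kurd-spell-base")

text = "a misspelled Central Kurdish sentence goes here"  # placeholder input
print(corrector(text, max_length=1024)[0]["generated_text"])  # corrected output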
train_tokenizer.py ADDED
@@ -0,0 +1,39 @@
+from datasets import load_dataset
+from transformers import AutoTokenizer
+import argparse
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--tokenizer_name", default="facebook/bart-base", help="Name or path of the pretrained tokenizer to train the new one from")
+parser.add_argument("--output_dir", default="tokenizer", type=str, help="Output directory for the trained tokenizer (also used as the repo id when pushing to the Hub)")
+parser.add_argument("--push_to_hub", default=False, action="store_true", help="Push the trained tokenizer to the Hub",)
+
+args = parser.parse_args()
+
+
+dataset = load_dataset("oscar-corpus/OSCAR-2301", "ckb", split="train", token=True)
+
+def get_training_corpus(batch_size=1000):
+    for start_idx in range(0, len(dataset), batch_size):
+        samples = dataset[start_idx : start_idx + batch_size]
+        yield samples["text"]
+
+training_corpus = get_training_corpus()
+
+tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
+
+tokenizer = tokenizer.train_new_from_iterator(
+    training_corpus, vocab_size=len(tokenizer),
+    special_tokens_map={
+        "eos_token": "</s>",
+        "bos_token": "<s>",
+        "unk_token": "<unk>",
+        "pad_token": "<pad>",
+        "mask_token": "<mask>",
+    },
+)
+
+
+tokenizer.save_pretrained(args.output_dir, push_to_hub=args.push_to_hub)
+
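
Note (not part of the committed files): train_tokenizer.py saves the retrained tokenizer to its default --output_dir of "tokenizer", which is the same ./tokenizer directory train.sh passes as --tokenizer_name. A hypothetical round-trip check of that output is sketched below; the sample string is a placeholder, not real data from this repo.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

sample = "a Central Kurdish sample sentence goes here"  # placeholder text
ids = tokenizer(sample)["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))              # inspect the learned subword pieces
print(tokenizer.decode(ids, skip_special_tokens=True))   # should reproduce the sample text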