Spaces:

dura-garage
/

nep-spell

Sleeping

App Files Files Community

duraad committed on Feb 23, 2024

Commit

1ddad36

0 Parent(s):

Initial Commit

Browse files

Files changed (7) hide show

.gitignore +9 -0
README.md +15 -0
requirments.txt +334 -0
src/Demo.py +66 -0
src/ModelMethods.py +166 -0
src/pages/LiteratureReview.py +8 -0
src/pages/References.py +14 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,9 @@

+# Folders to ignore
+model/
+model-local/
+__pycache__/
+src/__pycache__/
+# Files to ignore
+notes.md
+*.pyc

README.md ADDED Viewed

	@@ -0,0 +1,15 @@

+# Nepali Spelling Correction
+## Models used
+- `google/mt5-small`
+- `facebook/mbart-large-cc25`
+- `rahular/varta-t5`
+## How to set up?
+1. Clone this repo
+2. Install the dependencies
+3. Create a folder `models` inside the repo
+4. Inside the `models` folder, clone the models from Hugging Face
+5. Update the model paths in `ModelMethods.py`

requirments.txt ADDED Viewed

	@@ -0,0 +1,334 @@

+absl-py==2.0.0
+accelerate @ git+https://github.com/huggingface/accelerate.git@162a82164e9bdcc01a173cbee43b686437aaead8
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.0
+altgraph==0.17.3
+aniso8601==9.0.1
+annotated-types==0.6.0
+anyio==4.2.0
+appdirs==1.4.4
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asgiref==3.5.0
+asttokens==2.4.1
+astunparse==1.6.3
+async-lru==2.0.4
+async-timeout==4.0.2
+attrs==22.1.0
+autopep8==1.6.0
+Babel==2.14.0
+backports.csv==1.0.7
+backports.entry-points-selectable==1.1.1
+beautifulsoup4==4.10.0
+bitsandbytes==0.42.0
+bleach==6.1.0
+blinker==1.5
+blis==0.7.11
+boto3==1.34.19
+botocore==1.34.19
+branca==0.7.0
+cachetools==5.2.0
+catalogue==2.0.10
+certifi==2022.6.15
+cffi==1.15.1
+chardet==4.0.0
+charset-normalizer==2.1.1
+cheroot==8.6.0
+CherryPy==18.6.1
+click==8.1.3
+cloudpathlib==0.16.0
+colorama==0.4.5
+comm==0.2.0
+commonmark==0.9.1
+confection==0.1.4
+contextualSpellCheck==0.4.4
+contourpy==1.0.6
+cryptography==38.0.1
+cycler==0.11.0
+cymem==2.0.8
+Cython==3.0.6
+datasets==2.16.1
+dateparser==1.1.0
+debugpy==1.8.0
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.7
+distlib==0.3.4
+dj-database-url==0.5.0
+Django==4.0.4
+django-ckeditor==6.2.0
+django-crispy-forms==1.14.0
+django-heroku==0.3.1
+django-js-asset==1.2.2
+docker-pycreds==0.4.0
+docopt==0.6.2
+docutils==0.20.1
+docx2pdf==0.1.8
+editdistance==0.6.2
+einops==0.7.0
+entrypoints==0.4
+et-xmlfile==1.1.0
+evaluate==0.4.0
+exceptiongroup==1.2.0
+executing==2.0.1
+ez-setup==0.9
+fastjsonschema==2.19.1
+feedparser==6.0.8
+filelock==3.4.0
+Flask==2.2.2
+Flask-API==3.0.post1
+Flask-Cors==3.0.10
+Flask-RESTful==0.3.9
+Flask-SQLAlchemy==2.5.1
+flatbuffers==23.5.26
+fonttools==4.38.0
+fqdn==1.5.1
+frozenlist==1.3.3
+fsspec==2023.10.0
+future==0.18.2
+gast==0.5.4
+gitdb==4.0.10
+GitPython==3.1.29
+google==3.0.0
+google-auth==2.25.2
+google-auth-oauthlib==1.2.0
+google-pasta==0.2.0
+GoogleNews==1.6.0
+greenlet==1.1.3
+grpcio==1.60.0
+gunicorn==20.1.0
+h5py==3.10.0
+happytransformer==3.0.0
+heroku==0.1.4
+huggingface-hub==0.20.1
+idna==3.3
+imageio==2.19.3
+imageio-ffmpeg==0.4.7
+importlib-metadata==5.1.0
+instaloader==4.9.6
+ipykernel==6.28.0
+ipyleaflet==0.18.1
+ipython==8.19.0
+ipywidgets==8.1.1
+isoduration==20.11.0
+itsdangerous==2.1.2
+jaraco.classes==3.2.1
+jaraco.collections==3.5.1
+jaraco.context==4.1.1
+jaraco.functools==3.5.0
+jaraco.text==3.7.0
+jedi==0.19.1
+Jinja2==3.1.2
+jmespath==1.0.1
+joblib==1.3.2
+json5==0.9.14
+jsonlines==4.0.0
+jsonpointer==2.4
+jsonschema==4.17.3
+jsonschema-specifications==2023.12.1
+jupyter-events==0.9.0
+jupyter-lsp==2.2.1
+jupyter_client==8.6.0
+jupyter_core==5.5.1
+jupyter_server==2.12.1
+jupyter_server_terminals==0.5.1
+jupyterlab==4.0.10
+jupyterlab-widgets==3.0.9
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.25.2
+jwt==1.3.1
+keras==2.15.0
+keyring==24.3.0
+kiwisolver==1.4.4
+langcodes==3.3.0
+Levenshtein==0.23.0
+libclang==16.0.6
+loralib==0.1.2
+lxml==4.9.1
+Markdown==3.5.1
+markdown-it-py==3.0.0
+MarkupSafe==2.1.1
+matplotlib==3.7.1
+matplotlib-inline==0.1.6
+mdurl==0.1.2
+mistune==3.0.2
+ml-dtypes==0.2.0
+more-itertools==8.12.0
+moviepy==1.0.3
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.15
+multitasking==0.0.11
+murmurhash==1.0.10
+mysql-connector-python==8.0.31
+mysqlclient==2.1.0
+nbclient==0.9.0
+nbconvert==7.13.1
+nbformat==5.9.2
+nest-asyncio==1.5.8
+networkx==3.2.1
+news==1.0
+nh3==0.2.15
+nltk==3.7
+notebook_shim==0.2.3
+numpy==1.23.5
+oauthlib==3.2.2
+openai==0.27.2
+openpyxl==3.1.2
+opt-einsum==3.3.0
+overrides==7.4.0
+packaging==21.3
+pafy==0.5.5
+pandas==1.5.2
+pandocfilters==1.5.0
+parso==0.8.3
+Pattern==3.6
+pdfminer.six==20211012
+pefile==2023.2.7
+peft==0.6.0
+Pillow==9.3.0
+pipreqs==0.4.11
+pkginfo==1.9.6
+platformdirs==4.1.0
+portalocker==2.8.2
+portend==3.1.0
+preshed==3.0.9
+proglog==0.1.10
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+protobuf==3.20.1
+psutil==5.9.7
+psycopg2==2.9.3
+pure-eval==0.2.2
+pyarrow==10.0.1
+pyarrow-hotfix==0.6
+pyasn1==0.4.8
+pyasn1-modules==0.3.0
+pycodestyle==2.8.0
+pycparser==2.21
+pydantic==2.5.3
+pydantic_core==2.14.6
+pydeck==0.8.0
+Pygments==2.13.0
+pyinstaller==5.13.0
+pyinstaller-hooks-contrib==2023.6
+PyJWT==2.4.0
+Pympler==1.0.1
+PyMuPDF==1.23.12
+PyMuPDFb==1.23.9
+pyparsing==3.0.9
+PyQt5==5.15.10
+PyQt5-Qt5==5.15.2
+PyQt5-sip==12.13.0
+pyrsistent==0.19.2
+python-dateutil==2.8.2
+python-docx==0.8.11
+python-dotenv==1.0.0
+python-json-logger==2.0.7
+pytorch-pretrained-bert==0.6.2
+pytube==12.1.0
+pytz==2022.2.1
+pytz-deprecation-shim==0.1.0.post0
+pywin32==306
+pywin32-ctypes==0.2.2
+pywinpty==2.0.12
+PyYAML==6.0.1
+pyzmq==25.1.2
+rapidfuzz==3.6.1
+readme-renderer==42.0
+referencing==0.32.0
+regex==2021.11.10
+requests==2.28.1
+requests-oauthlib==1.3.1
+requests-toolbelt==1.0.0
+responses==0.18.0
+rfc3339-validator==0.1.4
+rfc3986==2.0.0
+rfc3986-validator==0.1.1
+rich==12.6.0
+rouge-score==0.1.2
+rpds-py==0.16.2
+rsa==4.8
+s3transfer==0.10.0
+safetensors==0.4.1
+scikit-learn==1.4.0
+scipy==1.8.0
+seaborn==0.13.0
+semver==2.13.0
+Send2Trash==1.8.2
+sentencepiece==0.1.99
+sentry-sdk==1.39.2
+setproctitle==1.3.3
+sgmllib3k==1.0.0
+six==1.16.0
+smart-open==6.4.0
+smmap==5.0.0
+sniffio==1.3.0
+soupsieve==2.3.1
+spacy==3.7.2
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+SQLAlchemy==1.4.41
+sqlparse==0.4.2
+srsly==2.4.8
+stack-data==0.6.3
+streamlit==1.15.1
+streamlit-menu==1.0.9
+streamlit-option-menu==0.3.12
+sympy==1.12
+tempora==5.0.1
+tenacity==8.2.3
+tensorboard==2.15.1
+tensorboard-data-server==0.7.2
+tensorflow==2.15.0
+tensorflow-estimator==2.15.0
+tensorflow-intel==2.15.0
+tensorflow-io-gcs-filesystem==0.31.0
+termcolor==2.4.0
+terminado==0.18.0
+test-nep-spell-synthetic-datautils==0.1.0
+thinc==8.2.2
+threadpoolctl==3.2.0
+tinycss2==1.2.1
+tokenizers==0.15.1
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.0
+torch==1.13.1
+torchdata==0.5.1
+tornado==6.2
+tqdm==4.63.0
+traitlets==5.14.0
+traittypes==0.2.1
+transformers @ git+https://github.com/huggingface/transformers.git@5b5e71dc41734a9798f3535bbd5039ab91883079
+twine==5.0.0
+typer==0.9.0
+types-python-dateutil==2.8.19.14
+typing_extensions==4.4.0
+tzdata==2022.7
+tzlocal==4.2
+uri-template==1.3.0
+urllib3==1.26.12
+validators==0.20.0
+virtualenv==20.10.0
+wandb==0.16.2
+wasabi==1.1.2
+watchdog==2.1.9
+wcwidth==0.2.12
+weasel==0.3.4
+webcolors==1.13
+webencodings==0.5.1
+websocket-client==1.7.0
+Werkzeug==2.2.2
+whitenoise==6.0.0
+widgetsnbextension==4.0.9
+wrapt==1.14.1
+xxhash==3.4.1
+xyzservices==2023.10.1
+yarg==0.1.9
+yarl==1.8.2
+yfinance==0.1.87
+zc.lockfile==2.0
+zipp==3.11.0

src/Demo.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import streamlit as st
+import pandas as pd
+from ModelMethods import generate
+st.set_page_config(page_title="DEMO", page_icon="👋", layout="wide")  # page title, icon and wide layout
+# Title text that main() passes to st.header
+appTitle = "Nepali Spell Correction"
+# Sample Nepali sentences used as demo inputs
+example = (  # standalone sample; not referenced by the rest of the code shown in this file
+    "अबको स्थायी कमिटी ओली सरकारलाई दीएको समर्थन फिर्ताको तयारि रहेको साहले जानकारी दिए।"
+)
+examples = {  # dropdown label -> text main() places in the input box
+    "Examples": "",  # first entry: selecting it leaves the input box empty
+    "अखिलेस झा धेरै दिनदेखि अनुपस्थीत थिए ।": "अखिलेस झा धेरै दिनदेखि अनुपस्थीत थिए ।",
+    "आठौँ तह उपनिर्देषक पदमा दुई जना उत्तीर्ण भएका छन्।": "आठौँ तह उपनिर्देषक पदमा दुई जना उत्तीर्ण भएका छन्।",
+    "उनीहरूमा रोगसँग लड्ने क्षमता मज्जाले बिकसित भइसकेको हुँदैन।": "उनीहरूमा रोगसँग लड्ने क्षमता मज्जाले बिकसित भइसकेको हुँदैन।",
+}
+def main():
+    st.header(appTitle)
+    left_column, right_column = st.columns(2)
+    correctedText= None
+    with left_column:
+        model_options = {"mT5", "mBART", "VartaT5"}
+        # Display the radio options in a single line
+        selected_model = st.radio("Select the model", model_options, index=0)
+        # Create a dropdown menu
+        selected_example_key = st.selectbox("Select an example", list(examples.keys()))
+        # Display the selected example text in a text area
+        selected_example_text = examples[selected_example_key]
+        # Get user input
+        user_input = st.text_area(
+            "Enter a Nepali Sentence: ",
+            selected_example_text,
+            max_chars=512,  # Set the maximum input length to 512 characters
+        )
+        if st.button("Check Spelling"):
+            if user_input:
+                correctedText = generate(selected_model, user_input)
+                # # Perfrom grammer correction
+                # st.subheader("Corrected Text:")
+                # st.write([f"{line['score']:.2f}: {line['sequence']}"for line in correctedText])
+            else:
+                st.warning("Please enter some text to check.")
+    with right_column:
+        if correctedText is not None:
+            st.write("Corrected Text:")
+            # st.write([f"{line['score']:.2f}: {line['sequence']}" for line in correctedText])
+            df = pd.DataFrame(correctedText, columns=["score","sequence"])
+            st.table(df)
+if __name__ == "__main__":
+    main()

src/ModelMethods.py ADDED Viewed

	@@ -0,0 +1,166 @@

+# import HappyTextToText from Happy Transformer
+from happytransformer import HappyTextToText, TTSettings
+# Huggingface Transformers
+from transformers import (
+    MT5ForConditionalGeneration,
+    MT5Tokenizer,
+    MBartForConditionalGeneration,
+    MBartTokenizer,
+    T5ForConditionalGeneration,
+    T5TokenizerFast,
+    GenerationConfig,
+)
+import torch
+import re
+"""
+    Some global variables
+    Add path to the models here
+"""
+mt5ModelPath = "../models/nep-spell-hft-23epochs"
+mbartModelPath = "../models/happytt_mBART_plus_10"
+vartat5ModelPath = "../models/vartat5-using-100K-plus-1"
+"""
+    Function: generate
+    This function takes a model name and input text as parameters and
+    returns the output text generated by the specified model.
+    It supports multiple models such as mT5, mBART, and VartaT5.
+    If the specified model is not available,
+    it returns a message indicating the unavailability of the model.
+    Parameters:
+    - model (str): Name of the model to use for text generation.
+    - input (str): Input text for the model to generate output from.
+    Returns:
+    - str: Output text generated by the specified model or a message indicating model unavailability.
+"""
+def generate(model, input):
+    if model == "mT5":
+        return mt5Inference(input)
+    elif model == "mBART":
+        return mbartInference(input)
+    elif model == "VartaT5":
+        return vartat5Inference(input)
+    else:
+        return f"Model: {model} not available"
+    # काकाले काकिलाइ माया गर्नू हुन्छ।
+"""
+    Below are the 3 different models for inference
+"""
+def mt5Inference(input):
+    print("Processing mt5")
+    model = MT5ForConditionalGeneration.from_pretrained(mt5ModelPath)
+    tokenizer = MT5Tokenizer.from_pretrained(mt5ModelPath)
+    input_ids = tokenizer("grammar: " + input, return_tensors="pt").input_ids
+    outputs = model.generate(
+        input_ids=input_ids,
+        max_length=512,
+        num_beams=5,
+        num_return_sequences=5,
+        return_dict_in_generate=True,
+        output_scores=True,
+    )
+    sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
+    return postProcessOutput(sequences,outputs["sequences_scores"])
+def mbartInference(input):
+    print("Processing mbart")
+    tokenizer = MBartTokenizer.from_pretrained(
+        mbartModelPath, src_lang="ne_NP", tgt_lang="ne_NP"
+    )
+    model = MBartForConditionalGeneration.from_pretrained(mbartModelPath)
+    inputs = tokenizer("grammar: " + input, return_tensors="pt")
+    outputs = model.generate(
+        **inputs,
+        decoder_start_token_id=tokenizer.lang_code_to_id["ne_NP"],
+        max_length=512,
+        num_beams=5,
+        num_return_sequences=5,
+        return_dict_in_generate=True,
+        output_scores=True,
+    )
+    sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
+    return postProcessOutput(sequences, outputs["sequences_scores"])
+    # return outputs
+def vartat5Inference(input):
+    print("Processing varta")
+    model = T5ForConditionalGeneration.from_pretrained(vartat5ModelPath)
+    # return "model ready"
+    tokenizer = T5TokenizerFast.from_pretrained(vartat5ModelPath)
+    input_ids = tokenizer("grammar: " + input, return_tensors="pt")
+    outputs = model.generate(
+        **input_ids,
+        max_length=512,
+        num_beams=5,
+        num_return_sequences=5,
+        return_dict_in_generate=True,
+        output_scores=True,
+    )
+    sequences = tokenizer.batch_decode(outputs["sequences"], skip_special_tokens=True)
+    return postProcessOutput(sequences,outputs["sequences_scores"])
+"""
+    Post processing the model output
+"""
+def postProcessOutput(sequences, sequences_scores):
+    probabilities = torch.exp(sequences_scores)
+    unique_sequences = set()
+    # Initialize the list to store filtered items
+    filtered_outputs = []
+    # Iterate through sequences and formatted_scores
+    for sequence, score in zip(sequences, probabilities):
+        # Check if the sequence is not in the set of unique sequences
+        if sequence not in unique_sequences:
+            # Add the sequence to the set of unique sequences
+            unique_sequences.add(sequence)
+            # Append the sequence and score to the filtered_outputs list
+            filtered_outputs.append({"sequence": sequence, "score": score.item()})
+    return filtered_outputs
+"""
+    For working with paragraph processing
+"""
+def split_nepali_paragraph_into_sentences(nepali_text):
+    # Define a regex pattern to split sentences
+    # We'll split on periods, question marks, and exclamation marks
+    sentence_pattern = r"(?<=[।?!\n])\s+"
+    # Split the Nepali text into sentences
+    sentences = re.split(sentence_pattern, nepali_text)
+    return sentences
+def process_paragraph(model, paragraph):
+    sentenceList = split_nepali_paragraph_into_sentences(paragraph)
+    out_sentence = []
+    for s in sentenceList:
+        out_sentence.append(generate(model, s))
+    nepali_paragraph = " ".join(out_sentence)
+    return nepali_paragraph

src/pages/LiteratureReview.py ADDED Viewed

	@@ -0,0 +1,8 @@

+import streamlit as st
+st.set_page_config(
+    page_title="Literature Review",
+    page_icon="👋",
+)
+st.write("LiteratureReview")

src/pages/References.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import streamlit as st
+st.set_page_config(
+    page_title="References",
+    page_icon="👋",
+    layout="wide"
+)
+st.sidebar.header("Plotting Demo")
+st.write("References Here")