duraad committed
Commit a28390b
1 Parent(s): 1ddad36

Dataset pages added
.gitignore CHANGED
@@ -1,5 +1,6 @@
 # Folders to ignore
-model/
+models/
+datafiles/
 model-local/
 __pycache__/
 src/__pycache__/
fonts/TiroDevanagariHindi-Regular.ttf ADDED
Binary file (415 kB)
 
requirments.txt CHANGED
@@ -1,334 +1,5 @@
-absl-py==2.0.0
-accelerate @ git+https://github.com/huggingface/accelerate.git@162a82164e9bdcc01a173cbee43b686437aaead8
-aiohttp==3.8.4
-aiosignal==1.3.1
-altair==4.2.0
-altgraph==0.17.3
-aniso8601==9.0.1
-annotated-types==0.6.0
-anyio==4.2.0
-appdirs==1.4.4
-argon2-cffi==23.1.0
-argon2-cffi-bindings==21.2.0
-arrow==1.3.0
-asgiref==3.5.0
-asttokens==2.4.1
-astunparse==1.6.3
-async-lru==2.0.4
-async-timeout==4.0.2
-attrs==22.1.0
-autopep8==1.6.0
-Babel==2.14.0
-backports.csv==1.0.7
-backports.entry-points-selectable==1.1.1
-beautifulsoup4==4.10.0
-bitsandbytes==0.42.0
-bleach==6.1.0
-blinker==1.5
-blis==0.7.11
-boto3==1.34.19
-botocore==1.34.19
-branca==0.7.0
-cachetools==5.2.0
-catalogue==2.0.10
-certifi==2022.6.15
-cffi==1.15.1
-chardet==4.0.0
-charset-normalizer==2.1.1
-cheroot==8.6.0
-CherryPy==18.6.1
-click==8.1.3
-cloudpathlib==0.16.0
-colorama==0.4.5
-comm==0.2.0
-commonmark==0.9.1
-confection==0.1.4
-contextualSpellCheck==0.4.4
-contourpy==1.0.6
-cryptography==38.0.1
-cycler==0.11.0
-cymem==2.0.8
-Cython==3.0.6
-datasets==2.16.1
-dateparser==1.1.0
-debugpy==1.8.0
-decorator==5.1.1
-defusedxml==0.7.1
-dill==0.3.7
-distlib==0.3.4
-dj-database-url==0.5.0
-Django==4.0.4
-django-ckeditor==6.2.0
-django-crispy-forms==1.14.0
-django-heroku==0.3.1
-django-js-asset==1.2.2
-docker-pycreds==0.4.0
-docopt==0.6.2
-docutils==0.20.1
-docx2pdf==0.1.8
-editdistance==0.6.2
-einops==0.7.0
-entrypoints==0.4
-et-xmlfile==1.1.0
-evaluate==0.4.0
-exceptiongroup==1.2.0
-executing==2.0.1
-ez-setup==0.9
-fastjsonschema==2.19.1
-feedparser==6.0.8
-filelock==3.4.0
-Flask==2.2.2
-Flask-API==3.0.post1
-Flask-Cors==3.0.10
-Flask-RESTful==0.3.9
-Flask-SQLAlchemy==2.5.1
-flatbuffers==23.5.26
-fonttools==4.38.0
-fqdn==1.5.1
-frozenlist==1.3.3
-fsspec==2023.10.0
-future==0.18.2
-gast==0.5.4
-gitdb==4.0.10
-GitPython==3.1.29
-google==3.0.0
-google-auth==2.25.2
-google-auth-oauthlib==1.2.0
-google-pasta==0.2.0
-GoogleNews==1.6.0
-greenlet==1.1.3
-grpcio==1.60.0
-gunicorn==20.1.0
-h5py==3.10.0
-happytransformer==3.0.0
-heroku==0.1.4
-huggingface-hub==0.20.1
-idna==3.3
-imageio==2.19.3
-imageio-ffmpeg==0.4.7
-importlib-metadata==5.1.0
-instaloader==4.9.6
-ipykernel==6.28.0
-ipyleaflet==0.18.1
-ipython==8.19.0
-ipywidgets==8.1.1
-isoduration==20.11.0
-itsdangerous==2.1.2
-jaraco.classes==3.2.1
-jaraco.collections==3.5.1
-jaraco.context==4.1.1
-jaraco.functools==3.5.0
-jaraco.text==3.7.0
-jedi==0.19.1
-Jinja2==3.1.2
-jmespath==1.0.1
-joblib==1.3.2
-json5==0.9.14
-jsonlines==4.0.0
-jsonpointer==2.4
-jsonschema==4.17.3
-jsonschema-specifications==2023.12.1
-jupyter-events==0.9.0
-jupyter-lsp==2.2.1
-jupyter_client==8.6.0
-jupyter_core==5.5.1
-jupyter_server==2.12.1
-jupyter_server_terminals==0.5.1
-jupyterlab==4.0.10
-jupyterlab-widgets==3.0.9
-jupyterlab_pygments==0.3.0
-jupyterlab_server==2.25.2
-jwt==1.3.1
-keras==2.15.0
-keyring==24.3.0
-kiwisolver==1.4.4
-langcodes==3.3.0
-Levenshtein==0.23.0
-libclang==16.0.6
-loralib==0.1.2
-lxml==4.9.1
-Markdown==3.5.1
-markdown-it-py==3.0.0
-MarkupSafe==2.1.1
-matplotlib==3.7.1
-matplotlib-inline==0.1.6
-mdurl==0.1.2
-mistune==3.0.2
-ml-dtypes==0.2.0
-more-itertools==8.12.0
-moviepy==1.0.3
-mpmath==1.3.0
-multidict==6.0.4
-multiprocess==0.70.15
-multitasking==0.0.11
-murmurhash==1.0.10
-mysql-connector-python==8.0.31
-mysqlclient==2.1.0
-nbclient==0.9.0
-nbconvert==7.13.1
-nbformat==5.9.2
-nest-asyncio==1.5.8
-networkx==3.2.1
-news==1.0
-nh3==0.2.15
-nltk==3.7
-notebook_shim==0.2.3
-numpy==1.23.5
-oauthlib==3.2.2
-openai==0.27.2
-openpyxl==3.1.2
-opt-einsum==3.3.0
-overrides==7.4.0
-packaging==21.3
-pafy==0.5.5
-pandas==1.5.2
-pandocfilters==1.5.0
-parso==0.8.3
-Pattern==3.6
-pdfminer.six==20211012
-pefile==2023.2.7
-peft==0.6.0
-Pillow==9.3.0
-pipreqs==0.4.11
-pkginfo==1.9.6
-platformdirs==4.1.0
-portalocker==2.8.2
-portend==3.1.0
-preshed==3.0.9
-proglog==0.1.10
-prometheus-client==0.19.0
-prompt-toolkit==3.0.43
-protobuf==3.20.1
-psutil==5.9.7
-psycopg2==2.9.3
-pure-eval==0.2.2
-pyarrow==10.0.1
-pyarrow-hotfix==0.6
-pyasn1==0.4.8
-pyasn1-modules==0.3.0
-pycodestyle==2.8.0
-pycparser==2.21
-pydantic==2.5.3
-pydantic_core==2.14.6
-pydeck==0.8.0
-Pygments==2.13.0
-pyinstaller==5.13.0
-pyinstaller-hooks-contrib==2023.6
-PyJWT==2.4.0
-Pympler==1.0.1
-PyMuPDF==1.23.12
-PyMuPDFb==1.23.9
-pyparsing==3.0.9
-PyQt5==5.15.10
-PyQt5-Qt5==5.15.2
-PyQt5-sip==12.13.0
-pyrsistent==0.19.2
-python-dateutil==2.8.2
-python-docx==0.8.11
-python-dotenv==1.0.0
-python-json-logger==2.0.7
-pytorch-pretrained-bert==0.6.2
-pytube==12.1.0
-pytz==2022.2.1
-pytz-deprecation-shim==0.1.0.post0
-pywin32==306
-pywin32-ctypes==0.2.2
-pywinpty==2.0.12
-PyYAML==6.0.1
-pyzmq==25.1.2
-rapidfuzz==3.6.1
-readme-renderer==42.0
-referencing==0.32.0
-regex==2021.11.10
-requests==2.28.1
-requests-oauthlib==1.3.1
-requests-toolbelt==1.0.0
-responses==0.18.0
-rfc3339-validator==0.1.4
-rfc3986==2.0.0
-rfc3986-validator==0.1.1
-rich==12.6.0
-rouge-score==0.1.2
-rpds-py==0.16.2
-rsa==4.8
-s3transfer==0.10.0
-safetensors==0.4.1
-scikit-learn==1.4.0
-scipy==1.8.0
-seaborn==0.13.0
-semver==2.13.0
-Send2Trash==1.8.2
-sentencepiece==0.1.99
-sentry-sdk==1.39.2
-setproctitle==1.3.3
-sgmllib3k==1.0.0
-six==1.16.0
-smart-open==6.4.0
-smmap==5.0.0
-sniffio==1.3.0
-soupsieve==2.3.1
-spacy==3.7.2
-spacy-legacy==3.0.12
-spacy-loggers==1.0.5
-SQLAlchemy==1.4.41
-sqlparse==0.4.2
-srsly==2.4.8
-stack-data==0.6.3
-streamlit==1.15.1
-streamlit-menu==1.0.9
-streamlit-option-menu==0.3.12
-sympy==1.12
-tempora==5.0.1
-tenacity==8.2.3
-tensorboard==2.15.1
-tensorboard-data-server==0.7.2
-tensorflow==2.15.0
-tensorflow-estimator==2.15.0
-tensorflow-intel==2.15.0
-tensorflow-io-gcs-filesystem==0.31.0
-termcolor==2.4.0
-terminado==0.18.0
-test-nep-spell-synthetic-datautils==0.1.0
-thinc==8.2.2
-threadpoolctl==3.2.0
-tinycss2==1.2.1
-tokenizers==0.15.1
-toml==0.10.2
-tomli==2.0.1
-toolz==0.12.0
-torch==1.13.1
-torchdata==0.5.1
-tornado==6.2
-tqdm==4.63.0
-traitlets==5.14.0
-traittypes==0.2.1
-transformers @ git+https://github.com/huggingface/transformers.git@5b5e71dc41734a9798f3535bbd5039ab91883079
-twine==5.0.0
-typer==0.9.0
-types-python-dateutil==2.8.19.14
-typing_extensions==4.4.0
-tzdata==2022.7
-tzlocal==4.2
-uri-template==1.3.0
-urllib3==1.26.12
-validators==0.20.0
-virtualenv==20.10.0
-wandb==0.16.2
-wasabi==1.1.2
-watchdog==2.1.9
-wcwidth==0.2.12
-weasel==0.3.4
-webcolors==1.13
-webencodings==0.5.1
-websocket-client==1.7.0
-Werkzeug==2.2.2
-whitenoise==6.0.0
-widgetsnbextension==4.0.9
-wrapt==1.14.1
-xxhash==3.4.1
-xyzservices==2023.10.1
-yarg==0.1.9
-yarl==1.8.2
-yfinance==0.1.87
-zc.lockfile==2.0
-zipp==3.11.0
+transformers
+streamlit
+wordcloud
+matplotlib
+pandas
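Note: the rewrite drops the 334-line environment freeze in favor of the five unpinned packages the app actually imports. A quick sanity check (an editorial sketch, not part of the commit) that the slimmed list imports cleanly:

import importlib

# Each module name here matches its PyPI distribution name, so this loop
# doubles as an install check for the slimmed requirments.txt.
for module in ("transformers", "streamlit", "wordcloud", "matplotlib", "pandas"):
    importlib.import_module(module)
    print(f"ok: {module}")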
 
src/Demo.py CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 
-from ModelMethods import generate
+from api.ModelMethods import generate
 
 
 st.set_page_config(page_title="DEMO", page_icon="👋", layout="wide")
@@ -27,7 +27,7 @@ def main():
 
     st.header(appTitle)
     left_column, right_column = st.columns(2)
-    correctedText= None
+    correctedText = None
 
     with left_column:
         model_options = {"mT5", "mBART", "VartaT5"}
@@ -41,11 +41,7 @@ def main():
         selected_example_text = examples[selected_example_key]
 
         # Get user input
-        user_input = st.text_area(
-            "Enter a Nepali Sentence: ",
-            selected_example_text,
-            max_chars=512,  # Set the maximum input length to 512 characters
-        )
+        user_input = st.text_area("Enter a Nepali Sentence: ", selected_example_text)
         if st.button("Check Spelling"):
             if user_input:
                 correctedText = generate(selected_model, user_input)
@@ -58,7 +54,7 @@ def main():
         if correctedText is not None:
            st.write("Corrected Text:")
            # st.write([f"{line['score']:.2f}: {line['sequence']}" for line in correctedText])
-           df = pd.DataFrame(correctedText, columns=["score","sequence"])
+           df = pd.DataFrame(correctedText, columns=["score", "sequence"])
            st.table(df)
 
 
src/{ModelMethods.py → api/ModelMethods.py} RENAMED
File without changes
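Note: Demo.py now imports generate from the relocated src/api/ModelMethods.py and tabulates its return value as (score, sequence) rows. The module body is unchanged by this commit and not shown here; the sketch below is a hypothetical reconstruction of the interface Demo.py relies on — the checkpoint paths and beam settings are assumptions, not the repo's actual code:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hypothetical checkpoint locations; the real ones presumably live under models/.
CHECKPOINTS = {"mT5": "models/mt5", "mBART": "models/mbart", "VartaT5": "models/vartat5"}

def generate(model_name: str, text: str, num_beams: int = 3):
    """Return (score, sequence) pairs for the corrected text, best beam first."""
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINTS[model_name])
    model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINTS[model_name])
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(
        **inputs,
        num_beams=num_beams,
        num_return_sequences=num_beams,
        output_scores=True,
        return_dict_in_generate=True,
        max_new_tokens=512,
    )
    sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
    scores = outputs.sequences_scores.exp().tolist()  # length-normalized beam scores
    # Rows of (score, sequence), matching pd.DataFrame(..., columns=["score", "sequence"])
    return list(zip(scores, sequences))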
src/pages/LiteratureReview.py DELETED
@@ -1,8 +0,0 @@
-import streamlit as st
-
-st.set_page_config(
-    page_title="Literature Review",
-    page_icon="👋",
-)
-
-st.write("LiteratureReview")
src/pages/References.py DELETED
@@ -1,14 +0,0 @@
-import streamlit as st
-
-st.set_page_config(
-    page_title="References",
-    page_icon="👋",
-    layout="wide"
-)
-
-
-
-
-st.sidebar.header("Plotting Demo")
-
-st.write("References Here")
src/pages/📈DatasetAnalysis.py ADDED
@@ -0,0 +1,540 @@
+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+from collections import Counter
+from wordcloud import WordCloud
+from matplotlib.font_manager import FontProperties
+
+st.set_page_config(page_title="Datasets Analysis", page_icon="👋", layout="wide")
+
+
+data100k = "../datafiles/nep_spell_100k.csv"
+
+# Prepare the dataframe
+df = pd.read_csv(data100k)
+
+# Count words
+df["num_words"] = df["Correct"].apply(lambda x: len(x.split()))
+
+# Count the number of sentences for each number of words
+word_counts = df["num_words"].value_counts().sort_index()
+
+# Create a Streamlit app
+st.title("Dataset Analysis")
+
+st.subheader("Word Count Analysis")
+# Display the DataFrame (optional)
+st.write(df)
+st.write("---")
+# Plot the data
+plt.figure(figsize=(10, 6))
+plt.bar(word_counts.index, word_counts.values, color="skyblue")
+plt.xlabel("Number of Words in Sentence")
+plt.ylabel("Number of Sentences")
+plt.title("Number of Words vs. Number of Sentences")
+plt.grid(True)
+
+# Limit the x-axis range to 70 words
+plt.xlim(0, 70)
+
+# Save the plot as an image file (optional)
+# plt.savefig("word_count_plot.png", dpi=300)
+
+# Display the plot in Streamlit
+st.pyplot(plt)
+
+st.write("---")
+
+#########################
+# Concatenate all sentences into a single string
+all_sentences = " ".join(df["Correct"])
+
+# Tokenize the sentences and calculate word frequencies
+words = all_sentences.split()
+word_freq = Counter(words)
+
+# Consider the top 1000 most common words
+top_words = word_freq.most_common(1000)
+
+# Generate the corpus for the word cloud
+corpus = {}
+for word, frequency in top_words:
+    corpus[word] = frequency
+
+# Define the font file path
+font1 = "../fonts/TiroDevanagariHindi-Regular.ttf"
+
+# Generate the word cloud
+wordcloud_most_common = WordCloud(
+    width=1000,
+    height=500,
+    background_color="white",
+    min_font_size=10,
+    regexp=r"[\u0900-\u097F]+",
+    font_path=font1,
+).generate_from_frequencies(corpus)
+
+
+# Display the word cloud using Streamlit
+st.subheader("Word Cloud of Most Frequent Words in Correct Sentences")
+st.image(wordcloud_most_common.to_array(), use_column_width=True)
+############################################
+# Word cloud of the least common words
+st.write("---")
+
+
+# Reuse the word frequencies computed above
+
+# Consider the 1000 least frequent words
+least_common_words = word_freq.most_common()[: -1000 - 1 : -1]
+
+# Generate the corpus for the word cloud
+corpus = {}
+for word, frequency in least_common_words:
+    corpus[word] = frequency
+
+# Generate the word cloud for least frequent words
+wordcloud_least_frequent = WordCloud(
+    width=1000,
+    height=500,
+    background_color="white",
+    min_font_size=10,
+    regexp=r"[\u0900-\u097F]+",
+    font_path=font1,
+).generate_from_frequencies(corpus)
+
+# Display the word cloud using Streamlit
+st.header("Word Cloud of Least Frequent Words in Correct Sentences")
+st.image(wordcloud_least_frequent.to_array(), use_column_width=True)
+
+
+########################################
+st.write("---")
+
+# Data
+char_seq_in = [
+    "ि",
+    "ी",
+    "ु",
+    "ू",
+    "इ",
+    "ई",
+    "उ",
+    "ऊ",
+    "श",
+    "श",
+    "स",
+    "स",
+    "ष",
+    "ष",
+    "ब",
+    "व",
+    "त",
+    "ट",
+    "द",
+    "ध",
+    "ं",
+    "ँ",
+]
+char_seq_out = [  # intended replacement for each entry of char_seq_in (for reference; not plotted)
+    "ी",
+    "ि",
+    "ू",
+    "ु",
+    "ई",
+    "इ",
+    "ऊ",
+    "उ",
+    "स",
+    "ष",
+    "श",
+    "ष",
+    "श",
+    "स",
+    "व",
+    "ब",
+    "ट",
+    "त",
+    "ध",
+    "द",
+    "ँ",
+    "ं",
+]
+datapoints_in_percentage = [
+    5,
+    5,
+    5,
+    5,
+    2.5,
+    2.5,
+    2.5,
+    2.5,
+    1.5,
+    0.5,
+    1.5,
+    0.5,
+    0.5,
+    0.5,
+    1,
+    1,
+    1,
+    0.6,
+    0.5,
+    0.5,
+    1,
+    1,
+]
+
+# Plot
+plt.figure(figsize=(10, 6))
+plt.bar(char_seq_in, datapoints_in_percentage, color="skyblue")
+plt.xlabel("Character Sequence (Input)")
+plt.ylabel("Percentage of Datapoints")
+plt.title("Distribution of Character Substitution Errors")
+# Specify font properties
+font_prop = FontProperties(fname=font1)
+plt.xticks(char_seq_in, char_seq_in, fontproperties=font_prop)
+
+plt.grid(axis="y")
+
+# Save the image
+# plt.savefig("character_substitution.png", dpi=300, bbox_inches="tight")
+# Show plot
+plt.tight_layout()
+# Display the plot in Streamlit
+st.subheader("Character Substitution Error")
+st.pyplot(plt)
+
+##################################
+
+st.write("---")
+
+# Existing data
+characters = [
+    " ",
+    "ा",
+    "ि",
+    "ी",
+    "ु",
+    "ू",
+    "े",
+    "ै",
+    "ो",
+    "ौ",
+    "ृ",
+    "्",
+    "ः",
+    "क",
+    "ख",
+    "ग",
+    "घ",
+    "ङ",
+    "च",
+    "छ",
+    "ज",
+    "झ",
+    "ञ",
+    "ट",
+    "ठ",
+    "ड",
+    "ढ",
+    "ण",
+    "त",
+    "थ",
+    "द",
+    "ध",
+    "न",
+    "प",
+    "फ",
+    "ब",
+    "भ",
+    "म",
+    "य",
+    "र",
+    "ल",
+    "व",
+    "श",
+    "स",
+    "ष",
+    "ह",
+    "अ",
+    "आ",
+    "इ",
+    "ई",
+    "उ",
+    "ऊ",
+    "ऋ",
+    "ए",
+    "ऐ",
+    "ओ",
+    "औ",
+]
+datapoints_in_percentage = [
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1,
+    1,
+    1,
+    1,
+    1.2,
+    1,
+    0.5,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+]
+
+# Additional data
+additional_characters = ["क्ष", "त्र", "ज्ञ", "अं", "अः"]
+additional_datapoints_in_percentage = [0.15, 0.15, 0.15, 0.15, 0.15]
+
+# Combine the existing and additional data
+characters += additional_characters
+datapoints_in_percentage += additional_datapoints_in_percentage
+
+# Plot
+plt.figure(figsize=(12, 6))
+plt.bar(characters, datapoints_in_percentage, color="skyblue")
+plt.xlabel("Character")
+plt.ylabel("Percentage of Datapoints")
+plt.title("Distribution of Character Addition Errors")
+plt.xticks(rotation=90)
+
+# Specify font properties
+font_prop = FontProperties(fname=font1)
+plt.xticks(characters, characters, fontproperties=font_prop)
+
+plt.grid(axis="y")
+
+# Save the image
+# plt.savefig("character_addition.png", dpi=300, bbox_inches="tight")
+
+# Show plot
+plt.tight_layout()
+st.subheader("Character Addition Error")
+st.pyplot(plt)
+############################################################
+
+st.write("---")
+
+# Data
+characters = [
+    " ",
+    "ा",
+    "ि",
+    "ी",
+    "ु",
+    "ू",
+    "े",
+    "ै",
+    "ो",
+    "ौ",
+    "ृ",
+    "्",
+    "ः",
+    "क",
+    "ख",
+    "ग",
+    "घ",
+    "ङ",
+    "च",
+    "छ",
+    "ज",
+    "झ",
+    "ञ",
+    "ट",
+    "ठ",
+    "ड",
+    "ढ",
+    "ण",
+    "त",
+    "थ",
+    "द",
+    "ध",
+    "न",
+    "प",
+    "फ",
+    "ब",
+    "भ",
+    "म",
+    "य",
+    "र",
+    "ल",
+    "व",
+    "श",
+    "स",
+    "ष",
+    "ह",
+    "अ",
+    "आ",
+    "इ",
+    "ई",
+    "उ",
+    "ऊ",
+    "ऋ",
+    "ए",
+    "ऐ",
+    "ओ",
+    "औ",
+    "क्ष",
+    "त्र",
+    "ज्ञ",
+    "अं",
+    "अः",
+]
+datapoints_in_percentage = [
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1.25,
+    0.5,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.15,
+    0.15,
+    0.15,
+    0.15,
+    0.15,
+]
+
+# Plot
+plt.figure(figsize=(10, 6))
+plt.bar(characters, datapoints_in_percentage, color="skyblue")
+plt.xlabel("Character")
+plt.ylabel("Percentage of Datapoints")
+plt.title("Distribution of Character Deletion Errors")
+plt.xticks(rotation=90)
+
+# Specify font properties
+font_prop = FontProperties(fname=font1)
+plt.xticks(characters, characters, fontproperties=font_prop)
+
+plt.grid(axis="y")
+
+# Save the image
+# plt.savefig("character_deletion.png", dpi=300, bbox_inches="tight")
+
+# Show plot
+plt.tight_layout()
+
+st.subheader("Character Deletion Error")
+st.pyplot(plt)
+############################################
+
+
+st.write("---")
+
+# Data
+error_types = ["Deletion", "Addition", "Substitution", "Double Substitution"]
+error_percentages = [28.5, 28.45, 40.1, 2.95]
+
+# Create horizontal bar graph
+plt.figure(figsize=(10, 6))
+plt.barh(error_types, error_percentages)
+
+# Add labels and title
+plt.xlabel("Error Percentage")
+plt.ylabel("Error Type")
+plt.title("Error Types Distribution")
+
+# Save the image
+# plt.savefig("error_type_distribution.png", dpi=300, bbox_inches="tight")
+
+# Show plot
+st.subheader("Distribution of Error Types")
+st.pyplot(plt)
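Note: this page draws every chart on matplotlib's implicit global figure and passes the plt module to st.pyplot, which works but lets state leak between Streamlit reruns. A common alternative (an editorial sketch with a stand-in Series, not the repo's code) gives each chart its own Figure:

import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st

# Illustrative stand-in for the page's word_counts Series (sentence-length histogram).
word_counts = pd.Series({5: 1200, 10: 3400, 15: 2100})

# One explicit Figure per chart, so reruns cannot inherit stale global state.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(word_counts.index, word_counts.values, color="skyblue")
ax.set_xlabel("Number of Words in Sentence")
ax.set_ylabel("Number of Sentences")
ax.set_xlim(0, 70)
st.pyplot(fig)  # pass the Figure explicitly rather than the plt module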
src/pages/📊DatasetsPreparation.py ADDED
@@ -0,0 +1,59 @@
+import streamlit as st
+import pandas as pd
+
+st.set_page_config(page_title="Datasets Preparation", page_icon="👋", layout="wide")
+
+##########################################
+
+# Read lines from the text file
+with open("../datafiles/sample_nep_corpus.txt") as file:
+    items = file.readlines()
+
+# Put each line into a single "Content" column
+datacorpus = pd.DataFrame(items, columns=["Content"])
+# datacorpus.columns = ["Content"]
+
+
+# st.write(f"{datacorpus}")
+
+datasentences = pd.read_csv("../datafiles/sample_nep_sentences.csv")
+
+data100k = pd.read_csv(
+    r"../datafiles/sample_nep_spell_100k.csv",
+    nrows=50,
+)
+
+
+###########################################
+
+
+st.title("Dataset Preparation")
+
+st.write("---")
+st.header(
+    """
+    A Large Nepali Text Corpus
+    """
+)
+
+st.caption("**Table 1.** A Large Nepali Text Corpus")
+
+st.dataframe(datacorpus, use_container_width=True)
+
+st.write("---")
+st.header(
+    """
+    Sentences extracted from A Large Nepali Text Corpus
+    """
+)
+st.caption("**Table 2.** Extracted sentences")
+st.dataframe(datasentences, use_container_width=True)
+
+st.write("---")
+st.header(
+    """
+    Parallel dataset built from the extracted sentences
+    """
+)
+st.caption("**Table 3.** 100k Dataset used for training")
+st.dataframe(data100k, use_container_width=True)
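Note: both new pages open data with relative paths like "../datafiles/...", which resolve only when Streamlit is launched from inside src/. A hedged hardening sketch (the helper below is hypothetical, not in the repo) anchors the paths to the script location instead:

from pathlib import Path

# Hypothetical helper: resolve the repo root from src/pages/<page>.py
# (two levels below the root), so data loads regardless of the working
# directory Streamlit starts in.
REPO_ROOT = Path(__file__).resolve().parents[2]

def datafile(name: str) -> Path:
    return REPO_ROOT / "datafiles" / name

# Usage: pd.read_csv(datafile("sample_nep_spell_100k.csv"))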