duraad committed
Commit a28390b
1 Parent(s): 1ddad36

Dataset pages added
.gitignore CHANGED
@@ -1,5 +1,6 @@
 # Folders to ignore
-model/
+models/
+datafiles/
 model-local/
 __pycache__/
 src/__pycache__/
fonts/TiroDevanagariHindi-Regular.ttf ADDED
Binary file (415 kB)
 
requirments.txt CHANGED
@@ -1,334 +1,5 @@
-absl-py==2.0.0
-accelerate @ git+https://github.com/huggingface/accelerate.git@162a82164e9bdcc01a173cbee43b686437aaead8
-aiohttp==3.8.4
-aiosignal==1.3.1
-altair==4.2.0
-altgraph==0.17.3
-aniso8601==9.0.1
-annotated-types==0.6.0
-anyio==4.2.0
-appdirs==1.4.4
-argon2-cffi==23.1.0
-argon2-cffi-bindings==21.2.0
-arrow==1.3.0
-asgiref==3.5.0
-asttokens==2.4.1
-astunparse==1.6.3
-async-lru==2.0.4
-async-timeout==4.0.2
-attrs==22.1.0
-autopep8==1.6.0
-Babel==2.14.0
-backports.csv==1.0.7
-backports.entry-points-selectable==1.1.1
-beautifulsoup4==4.10.0
-bitsandbytes==0.42.0
-bleach==6.1.0
-blinker==1.5
-blis==0.7.11
-boto3==1.34.19
-botocore==1.34.19
-branca==0.7.0
-cachetools==5.2.0
-catalogue==2.0.10
-certifi==2022.6.15
-cffi==1.15.1
-chardet==4.0.0
-charset-normalizer==2.1.1
-cheroot==8.6.0
-CherryPy==18.6.1
-click==8.1.3
-cloudpathlib==0.16.0
-colorama==0.4.5
-comm==0.2.0
-commonmark==0.9.1
-confection==0.1.4
-contextualSpellCheck==0.4.4
-contourpy==1.0.6
-cryptography==38.0.1
-cycler==0.11.0
-cymem==2.0.8
-Cython==3.0.6
-datasets==2.16.1
-dateparser==1.1.0
-debugpy==1.8.0
-decorator==5.1.1
-defusedxml==0.7.1
-dill==0.3.7
-distlib==0.3.4
-dj-database-url==0.5.0
-Django==4.0.4
-django-ckeditor==6.2.0
-django-crispy-forms==1.14.0
-django-heroku==0.3.1
-django-js-asset==1.2.2
-docker-pycreds==0.4.0
-docopt==0.6.2
-docutils==0.20.1
-docx2pdf==0.1.8
-editdistance==0.6.2
-einops==0.7.0
-entrypoints==0.4
-et-xmlfile==1.1.0
-evaluate==0.4.0
-exceptiongroup==1.2.0
-executing==2.0.1
-ez-setup==0.9
-fastjsonschema==2.19.1
-feedparser==6.0.8
-filelock==3.4.0
-Flask==2.2.2
-Flask-API==3.0.post1
-Flask-Cors==3.0.10
-Flask-RESTful==0.3.9
-Flask-SQLAlchemy==2.5.1
-flatbuffers==23.5.26
-fonttools==4.38.0
-fqdn==1.5.1
-frozenlist==1.3.3
-fsspec==2023.10.0
-future==0.18.2
-gast==0.5.4
-gitdb==4.0.10
-GitPython==3.1.29
-google==3.0.0
-google-auth==2.25.2
-google-auth-oauthlib==1.2.0
-google-pasta==0.2.0
-GoogleNews==1.6.0
-greenlet==1.1.3
-grpcio==1.60.0
-gunicorn==20.1.0
-h5py==3.10.0
-happytransformer==3.0.0
-heroku==0.1.4
-huggingface-hub==0.20.1
-idna==3.3
-imageio==2.19.3
-imageio-ffmpeg==0.4.7
-importlib-metadata==5.1.0
-instaloader==4.9.6
-ipykernel==6.28.0
-ipyleaflet==0.18.1
-ipython==8.19.0
-ipywidgets==8.1.1
-isoduration==20.11.0
-itsdangerous==2.1.2
-jaraco.classes==3.2.1
-jaraco.collections==3.5.1
-jaraco.context==4.1.1
-jaraco.functools==3.5.0
-jaraco.text==3.7.0
-jedi==0.19.1
-Jinja2==3.1.2
-jmespath==1.0.1
-joblib==1.3.2
-json5==0.9.14
-jsonlines==4.0.0
-jsonpointer==2.4
-jsonschema==4.17.3
-jsonschema-specifications==2023.12.1
-jupyter-events==0.9.0
-jupyter-lsp==2.2.1
-jupyter_client==8.6.0
-jupyter_core==5.5.1
-jupyter_server==2.12.1
-jupyter_server_terminals==0.5.1
-jupyterlab==4.0.10
-jupyterlab-widgets==3.0.9
-jupyterlab_pygments==0.3.0
-jupyterlab_server==2.25.2
-jwt==1.3.1
-keras==2.15.0
-keyring==24.3.0
-kiwisolver==1.4.4
-langcodes==3.3.0
-Levenshtein==0.23.0
-libclang==16.0.6
-loralib==0.1.2
-lxml==4.9.1
-Markdown==3.5.1
-markdown-it-py==3.0.0
-MarkupSafe==2.1.1
-matplotlib==3.7.1
-matplotlib-inline==0.1.6
-mdurl==0.1.2
-mistune==3.0.2
-ml-dtypes==0.2.0
-more-itertools==8.12.0
-moviepy==1.0.3
-mpmath==1.3.0
-multidict==6.0.4
-multiprocess==0.70.15
-multitasking==0.0.11
-murmurhash==1.0.10
-mysql-connector-python==8.0.31
-mysqlclient==2.1.0
-nbclient==0.9.0
-nbconvert==7.13.1
-nbformat==5.9.2
-nest-asyncio==1.5.8
-networkx==3.2.1
-news==1.0
-nh3==0.2.15
-nltk==3.7
-notebook_shim==0.2.3
-numpy==1.23.5
-oauthlib==3.2.2
-openai==0.27.2
-openpyxl==3.1.2
-opt-einsum==3.3.0
-overrides==7.4.0
-packaging==21.3
-pafy==0.5.5
-pandas==1.5.2
-pandocfilters==1.5.0
-parso==0.8.3
-Pattern==3.6
-pdfminer.six==20211012
-pefile==2023.2.7
-peft==0.6.0
-Pillow==9.3.0
-pipreqs==0.4.11
-pkginfo==1.9.6
-platformdirs==4.1.0
-portalocker==2.8.2
-portend==3.1.0
-preshed==3.0.9
-proglog==0.1.10
-prometheus-client==0.19.0
-prompt-toolkit==3.0.43
-protobuf==3.20.1
-psutil==5.9.7
-psycopg2==2.9.3
-pure-eval==0.2.2
-pyarrow==10.0.1
-pyarrow-hotfix==0.6
-pyasn1==0.4.8
-pyasn1-modules==0.3.0
-pycodestyle==2.8.0
-pycparser==2.21
-pydantic==2.5.3
-pydantic_core==2.14.6
-pydeck==0.8.0
-Pygments==2.13.0
-pyinstaller==5.13.0
-pyinstaller-hooks-contrib==2023.6
-PyJWT==2.4.0
-Pympler==1.0.1
-PyMuPDF==1.23.12
-PyMuPDFb==1.23.9
-pyparsing==3.0.9
-PyQt5==5.15.10
-PyQt5-Qt5==5.15.2
-PyQt5-sip==12.13.0
-pyrsistent==0.19.2
-python-dateutil==2.8.2
-python-docx==0.8.11
-python-dotenv==1.0.0
-python-json-logger==2.0.7
-pytorch-pretrained-bert==0.6.2
-pytube==12.1.0
-pytz==2022.2.1
-pytz-deprecation-shim==0.1.0.post0
-pywin32==306
-pywin32-ctypes==0.2.2
-pywinpty==2.0.12
-PyYAML==6.0.1
-pyzmq==25.1.2
-rapidfuzz==3.6.1
-readme-renderer==42.0
-referencing==0.32.0
-regex==2021.11.10
-requests==2.28.1
-requests-oauthlib==1.3.1
-requests-toolbelt==1.0.0
-responses==0.18.0
-rfc3339-validator==0.1.4
-rfc3986==2.0.0
-rfc3986-validator==0.1.1
-rich==12.6.0
-rouge-score==0.1.2
-rpds-py==0.16.2
-rsa==4.8
-s3transfer==0.10.0
-safetensors==0.4.1
-scikit-learn==1.4.0
-scipy==1.8.0
-seaborn==0.13.0
-semver==2.13.0
-Send2Trash==1.8.2
-sentencepiece==0.1.99
-sentry-sdk==1.39.2
-setproctitle==1.3.3
-sgmllib3k==1.0.0
-six==1.16.0
-smart-open==6.4.0
-smmap==5.0.0
-sniffio==1.3.0
-soupsieve==2.3.1
-spacy==3.7.2
-spacy-legacy==3.0.12
-spacy-loggers==1.0.5
-SQLAlchemy==1.4.41
-sqlparse==0.4.2
-srsly==2.4.8
-stack-data==0.6.3
-streamlit==1.15.1
-streamlit-menu==1.0.9
-streamlit-option-menu==0.3.12
-sympy==1.12
-tempora==5.0.1
-tenacity==8.2.3
-tensorboard==2.15.1
-tensorboard-data-server==0.7.2
-tensorflow==2.15.0
-tensorflow-estimator==2.15.0
-tensorflow-intel==2.15.0
-tensorflow-io-gcs-filesystem==0.31.0
-termcolor==2.4.0
-terminado==0.18.0
-test-nep-spell-synthetic-datautils==0.1.0
-thinc==8.2.2
-threadpoolctl==3.2.0
-tinycss2==1.2.1
-tokenizers==0.15.1
-toml==0.10.2
-tomli==2.0.1
-toolz==0.12.0
-torch==1.13.1
-torchdata==0.5.1
-tornado==6.2
-tqdm==4.63.0
-traitlets==5.14.0
-traittypes==0.2.1
-transformers @ git+https://github.com/huggingface/transformers.git@5b5e71dc41734a9798f3535bbd5039ab91883079
-twine==5.0.0
-typer==0.9.0
-types-python-dateutil==2.8.19.14
-typing_extensions==4.4.0
-tzdata==2022.7
-tzlocal==4.2
-uri-template==1.3.0
-urllib3==1.26.12
-validators==0.20.0
-virtualenv==20.10.0
-wandb==0.16.2
-wasabi==1.1.2
-watchdog==2.1.9
-wcwidth==0.2.12
-weasel==0.3.4
-webcolors==1.13
-webencodings==0.5.1
-websocket-client==1.7.0
-Werkzeug==2.2.2
-whitenoise==6.0.0
-widgetsnbextension==4.0.9
-wrapt==1.14.1
-xxhash==3.4.1
-xyzservices==2023.10.1
-yarg==0.1.9
-yarl==1.8.2
-yfinance==0.1.87
-zc.lockfile==2.0
-zipp==3.11.0
+transformers
+streamlit
+wordcloud
+matplotlib
+pandas
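Note: the rewrite drops the 334-line environment freeze in favor of the five unpinned packages the app actually imports. A quick sanity check (an editorial sketch, not part of the commit) that the slimmed list imports cleanly:

import importlib

# Each module name here matches its PyPI distribution name, so this loop
# doubles as an install check for the slimmed requirments.txt.
for module in ("transformers", "streamlit", "wordcloud", "matplotlib", "pandas"):
    importlib.import_module(module)
    print(f"ok: {module}")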
 
src/Demo.py CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 
-from ModelMethods import generate
+from api.ModelMethods import generate
 
 
 st.set_page_config(page_title="DEMO", page_icon="👋", layout="wide")
@@ -27,7 +27,7 @@ def main():
 
     st.header(appTitle)
     left_column, right_column = st.columns(2)
-    correctedText= None
+    correctedText = None
 
     with left_column:
         model_options = {"mT5", "mBART", "VartaT5"}
@@ -41,11 +41,7 @@ def main():
         selected_example_text = examples[selected_example_key]
 
         # Get user input
-        user_input = st.text_area(
-            "Enter a Nepali Sentence: ",
-            selected_example_text,
-            max_chars=512,  # Set the maximum input length to 512 characters
-        )
+        user_input = st.text_area("Enter a Nepali Sentence: ", selected_example_text)
         if st.button("Check Spelling"):
             if user_input:
                 correctedText = generate(selected_model, user_input)
@@ -58,7 +54,7 @@ def main():
         if correctedText is not None:
            st.write("Corrected Text:")
            # st.write([f"{line['score']:.2f}: {line['sequence']}" for line in correctedText])
-           df = pd.DataFrame(correctedText, columns=["score","sequence"])
+           df = pd.DataFrame(correctedText, columns=["score", "sequence"])
            st.table(df)
 
 
src/{ModelMethods.py → api/ModelMethods.py} RENAMED
File without changes
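Note: Demo.py now imports generate from the relocated src/api/ModelMethods.py and tabulates its return value as (score, sequence) rows. The module body is unchanged by this commit and not shown here; the sketch below is a hypothetical reconstruction of the interface Demo.py relies on — the checkpoint paths and beam settings are assumptions, not the repo's actual code:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hypothetical checkpoint locations; the real ones presumably live under models/.
CHECKPOINTS = {"mT5": "models/mt5", "mBART": "models/mbart", "VartaT5": "models/vartat5"}

def generate(model_name: str, text: str, num_beams: int = 3):
    """Return (score, sequence) pairs for the corrected text, best beam first."""
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINTS[model_name])
    model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINTS[model_name])
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(
        **inputs,
        num_beams=num_beams,
        num_return_sequences=num_beams,
        output_scores=True,
        return_dict_in_generate=True,
        max_new_tokens=512,
    )
    sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
    scores = outputs.sequences_scores.exp().tolist()  # length-normalized beam scores
    # Rows of (score, sequence), matching pd.DataFrame(..., columns=["score", "sequence"])
    return list(zip(scores, sequences))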
src/pages/LiteratureReview.py DELETED
@@ -1,8 +0,0 @@
-import streamlit as st
-
-st.set_page_config(
-    page_title="Literature Review",
-    page_icon="👋",
-)
-
-st.write("LiteratureReview")
src/pages/References.py DELETED
@@ -1,14 +0,0 @@
-import streamlit as st
-
-st.set_page_config(
-    page_title="References",
-    page_icon="👋",
-    layout="wide"
-)
-
-
-
-
-st.sidebar.header("Plotting Demo")
-
-st.write("References Here")
src/pages/📈DatasetAnalysis.py ADDED
@@ -0,0 +1,540 @@
+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+from collections import Counter
+from wordcloud import WordCloud
+from matplotlib.font_manager import FontProperties
+
+st.set_page_config(page_title="Datasets Analysis", page_icon="👋", layout="wide")
+
+
+data100k = "../datafiles/nep_spell_100k.csv"
+
+# Prepare the dataframe
+df = pd.read_csv(data100k)
+
+# Count words
+df["num_words"] = df["Correct"].apply(lambda x: len(x.split()))
+
+# Count the number of sentences for each number of words
+word_counts = df["num_words"].value_counts().sort_index()
+
+# Create a Streamlit app
+st.title("Dataset Analysis")
+
+st.subheader("Word Count Analysis")
+# Display the DataFrame (optional)
+st.write(df)
+st.write("---")
+# Plot the data
+plt.figure(figsize=(10, 6))
+plt.bar(word_counts.index, word_counts.values, color="skyblue")
+plt.xlabel("Number of Words in Sentence")
+plt.ylabel("Number of Sentences")
+plt.title("Number of Words vs. Number of Sentences")
+plt.grid(True)
+
+# Limit the x-axis range to 70 words
+plt.xlim(0, 70)
+
+# Save the plot as an image file (optional)
+# plt.savefig("word_count_plot.png", dpi=300)
+
+# Display the plot in Streamlit
+st.pyplot(plt)
+
+st.write("---")
+
+#########################
+# Concatenate all sentences into a single string
+all_sentences = " ".join(df["Correct"])
+
+# Tokenize the sentences and calculate word frequencies
+words = all_sentences.split()
+word_freq = Counter(words)
+
+# Consider the top 1000 most common words
+top_words = word_freq.most_common(1000)
+
+# Generate the corpus for the word cloud
+corpus = {}
+for word, frequency in top_words:
+    corpus[word] = frequency
+
+# Define the font file path
+font1 = "../fonts/TiroDevanagariHindi-Regular.ttf"
+
+# Generate the word cloud
+wordcloud_most_common = WordCloud(
+    width=1000,
+    height=500,
+    background_color="white",
+    min_font_size=10,
+    regexp=r"[\u0900-\u097F]+",
+    font_path=font1,
+).generate_from_frequencies(corpus)
+
+
+# Display the word cloud using Streamlit
+st.subheader("Word Cloud of Most Frequent Words in Correct Sentences")
+st.image(wordcloud_most_common.to_array(), use_column_width=True)
+############################################
+# Word cloud of the least common words
+st.write("---")
+
+
+# Reuse the word frequencies computed above
+
+# Consider the 1000 least frequent words
+least_common_words = word_freq.most_common()[: -1000 - 1 : -1]
+
+# Generate the corpus for the word cloud
+corpus = {}
+for word, frequency in least_common_words:
+    corpus[word] = frequency
+
+# Generate the word cloud for least frequent words
+wordcloud_least_frequent = WordCloud(
+    width=1000,
+    height=500,
+    background_color="white",
+    min_font_size=10,
+    regexp=r"[\u0900-\u097F]+",
+    font_path=font1,
+).generate_from_frequencies(corpus)
+
+# Display the word cloud using Streamlit
+st.header("Word Cloud of Least Frequent Words in Correct Sentences")
+st.image(wordcloud_least_frequent.to_array(), use_column_width=True)
+
+
+########################################
+st.write("---")
+
+# Data
+char_seq_in = [
+    "ि",
+    "ी",
+    "ु",
+    "ू",
+    "इ",
+    "ई",
+    "उ",
+    "ऊ",
+    "श",
+    "श",
+    "स",
+    "स",
+    "ष",
+    "ष",
+    "ब",
+    "व",
+    "त",
+    "ट",
+    "द",
+    "ध",
+    "ं",
+    "ँ",
+]
+char_seq_out = [  # intended replacement for each entry of char_seq_in (for reference; not plotted)
+    "ी",
+    "ि",
+    "ू",
+    "ु",
+    "ई",
+    "इ",
+    "ऊ",
+    "उ",
+    "स",
+    "ष",
+    "श",
+    "ष",
+    "श",
+    "स",
+    "व",
+    "ब",
+    "ट",
+    "त",
+    "ध",
+    "द",
+    "ँ",
+    "ं",
+]
+datapoints_in_percentage = [
+    5,
+    5,
+    5,
+    5,
+    2.5,
+    2.5,
+    2.5,
+    2.5,
+    1.5,
+    0.5,
+    1.5,
+    0.5,
+    0.5,
+    0.5,
+    1,
+    1,
+    1,
+    0.6,
+    0.5,
+    0.5,
+    1,
+    1,
+]
+
+# Plot
+plt.figure(figsize=(10, 6))
+plt.bar(char_seq_in, datapoints_in_percentage, color="skyblue")
+plt.xlabel("Character Sequence (Input)")
+plt.ylabel("Percentage of Datapoints")
+plt.title("Distribution of Character Substitution Errors")
+# Specify font properties
+font_prop = FontProperties(fname=font1)
+plt.xticks(char_seq_in, char_seq_in, fontproperties=font_prop)
+
+plt.grid(axis="y")
+
+# Save the image
+# plt.savefig("character_substitution.png", dpi=300, bbox_inches="tight")
+# Show plot
+plt.tight_layout()
+# Display the plot in Streamlit
+st.subheader("Character Substitution Error")
+st.pyplot(plt)
+
+##################################
+
+st.write("---")
+
+# Existing data
+characters = [
+    " ",
+    "ा",
+    "ि",
+    "ी",
+    "ु",
+    "ू",
+    "े",
+    "ै",
+    "ो",
+    "ौ",
+    "ृ",
+    "्",
+    "ः",
+    "क",
+    "ख",
+    "ग",
+    "घ",
+    "ङ",
+    "च",
+    "छ",
+    "ज",
+    "झ",
+    "ञ",
+    "ट",
+    "ठ",
+    "ड",
+    "ढ",
+    "ण",
+    "त",
+    "थ",
+    "द",
+    "ध",
+    "न",
+    "प",
+    "फ",
+    "ब",
+    "भ",
+    "म",
+    "य",
+    "र",
+    "ल",
+    "व",
+    "श",
+    "स",
+    "ष",
+    "ह",
+    "अ",
+    "आ",
+    "इ",
+    "ई",
+    "उ",
+    "ऊ",
+    "ऋ",
+    "ए",
+    "ऐ",
+    "ओ",
+    "औ",
+]
+datapoints_in_percentage = [
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1,
+    1,
+    1,
+    1,
+    1.2,
+    1,
+    0.5,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+]
+
+# Additional data
+additional_characters = ["क्ष", "त्र", "ज्ञ", "अं", "अः"]
+additional_datapoints_in_percentage = [0.15, 0.15, 0.15, 0.15, 0.15]
+
+# Combine the existing and additional data
+characters += additional_characters
+datapoints_in_percentage += additional_datapoints_in_percentage
+
+# Plot
+plt.figure(figsize=(12, 6))
+plt.bar(characters, datapoints_in_percentage, color="skyblue")
+plt.xlabel("Character")
+plt.ylabel("Percentage of Datapoints")
+plt.title("Distribution of Character Addition Errors")
+plt.xticks(rotation=90)
+
+# Specify font properties
+font_prop = FontProperties(fname=font1)
+plt.xticks(characters, characters, fontproperties=font_prop)
+
+plt.grid(axis="y")
+
+# Save the image
+# plt.savefig("character_addition.png", dpi=300, bbox_inches="tight")
+
+# Show plot
+plt.tight_layout()
+st.subheader("Character Addition Error")
+st.pyplot(plt)
+############################################################
+
+st.write("---")
+
+# Data
+characters = [
+    " ",
+    "ा",
+    "ि",
+    "ी",
+    "ु",
+    "ू",
+    "े",
+    "ै",
+    "ो",
+    "ौ",
+    "ृ",
+    "्",
+    "ः",
+    "क",
+    "ख",
+    "ग",
+    "घ",
+    "ङ",
+    "च",
+    "छ",
+    "ज",
+    "झ",
+    "ञ",
+    "ट",
+    "ठ",
+    "ड",
+    "ढ",
+    "ण",
+    "त",
+    "थ",
+    "द",
+    "ध",
+    "न",
+    "प",
+    "फ",
+    "ब",
+    "भ",
+    "म",
+    "य",
+    "र",
+    "ल",
+    "व",
+    "श",
+    "स",
+    "ष",
+    "ह",
+    "अ",
+    "आ",
+    "इ",
+    "ई",
+    "उ",
+    "ऊ",
+    "ऋ",
+    "ए",
+    "ऐ",
+    "ओ",
+    "औ",
+    "क्ष",
+    "त्र",
+    "ज्ञ",
+    "अं",
+    "अः",
+]
+datapoints_in_percentage = [
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1.5,
+    1,
+    1,
+    1,
+    1,
+    1,
+    1.25,
+    0.5,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.25,
+    0.15,
+    0.15,
+    0.15,
+    0.15,
+    0.15,
+]
+
+# Plot
+plt.figure(figsize=(10, 6))
+plt.bar(characters, datapoints_in_percentage, color="skyblue")
+plt.xlabel("Character")
+plt.ylabel("Percentage of Datapoints")
+plt.title("Distribution of Character Deletion Errors")
+plt.xticks(rotation=90)
+
+# Specify font properties
+font_prop = FontProperties(fname=font1)
+plt.xticks(characters, characters, fontproperties=font_prop)
+
+plt.grid(axis="y")
+
+# Save the image
+# plt.savefig("character_deletion.png", dpi=300, bbox_inches="tight")
+
+# Show plot
+plt.tight_layout()
+
+st.subheader("Character Deletion Error")
+st.pyplot(plt)
+############################################
+
+
+st.write("---")
+
+# Data
+error_types = ["Deletion", "Addition", "Substitution", "Double Substitution"]
+error_percentages = [28.5, 28.45, 40.1, 2.95]
+
+# Create horizontal bar graph
+plt.figure(figsize=(10, 6))
+plt.barh(error_types, error_percentages)
+
+# Add labels and title
+plt.xlabel("Error Percentage")
+plt.ylabel("Error Type")
+plt.title("Error Types Distribution")
+
+# Save the image
+# plt.savefig("error_type_distribution.png", dpi=300, bbox_inches="tight")
+
+# Show plot
+st.subheader("Distribution of Error Types")
+st.pyplot(plt)
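Note: this page draws every chart on matplotlib's implicit global figure and passes the plt module to st.pyplot, which works but lets state leak between Streamlit reruns. A common alternative (an editorial sketch with a stand-in Series, not the repo's code) gives each chart its own Figure:

import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st

# Illustrative stand-in for the page's word_counts Series (sentence-length histogram).
word_counts = pd.Series({5: 1200, 10: 3400, 15: 2100})

# One explicit Figure per chart, so reruns cannot inherit stale global state.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(word_counts.index, word_counts.values, color="skyblue")
ax.set_xlabel("Number of Words in Sentence")
ax.set_ylabel("Number of Sentences")
ax.set_xlim(0, 70)
st.pyplot(fig)  # pass the Figure explicitly rather than the plt module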
src/pages/📊DatasetsPreparation.py ADDED
@@ -0,0 +1,59 @@
+import streamlit as st
+import pandas as pd
+
+st.set_page_config(page_title="Datasets Preparation", page_icon="👋", layout="wide")
+
+##########################################
+
+# Read lines from the text file
+with open("../datafiles/sample_nep_corpus.txt") as file:
+    items = file.readlines()
+
+# Put each line into a single "Content" column
+datacorpus = pd.DataFrame(items, columns=["Content"])
+# datacorpus.columns = ["Content"]
+
+
+# st.write(f"{datacorpus}")
+
+datasentences = pd.read_csv("../datafiles/sample_nep_sentences.csv")
+
+data100k = pd.read_csv(
+    r"../datafiles/sample_nep_spell_100k.csv",
+    nrows=50,
+)
+
+
+###########################################
+
+
+st.title("Dataset Preparation")
+
+st.write("---")
+st.header(
+    """
+    A Large Nepali Text Corpus
+    """
+)
+
+st.caption("**Table 1.** A Large Nepali Text Corpus")
+
+st.dataframe(datacorpus, use_container_width=True)
+
+st.write("---")
+st.header(
+    """
+    Sentences extracted from A Large Nepali Text Corpus
+    """
+)
+st.caption("**Table 2.** Extracted sentences")
+st.dataframe(datasentences, use_container_width=True)
+
+st.write("---")
+st.header(
+    """
+    Parallel dataset built from the extracted sentences
+    """
+)
+st.caption("**Table 3.** 100k Dataset used for training")
+st.dataframe(data100k, use_container_width=True)
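Note: both new pages open data with relative paths like "../datafiles/...", which resolve only when Streamlit is launched from inside src/. A hedged hardening sketch (the helper below is hypothetical, not in the repo) anchors the paths to the script location instead:

from pathlib import Path

# Hypothetical helper: resolve the repo root from src/pages/<page>.py
# (two levels below the root), so data loads regardless of the working
# directory Streamlit starts in.
REPO_ROOT = Path(__file__).resolve().parents[2]

def datafile(name: str) -> Path:
    return REPO_ROOT / "datafiles" / name

# Usage: pd.read_csv(datafile("sample_nep_spell_100k.csv"))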