duraad committed
Commit 1ddad36
0 Parent(s):

Initial Commit
.gitignore ADDED
@@ -0,0 +1,9 @@
+ # Folders to ignore
+ model/
+ model-local/
+ __pycache__/
+ src/__pycache__/
+
+ # Files to ignore
+ notes.md
+ *.pyc
README.md ADDED
@@ -0,0 +1,15 @@
+ # Nepali Spelling Correction
+
+ ## Models used
+
+ - `google/mt5-small`
+ - `facebook/mbart-large-cc25`
+ - `rahular/varta-t5`
+
+
+ ## How to set up
+ 1. Clone this repo
+ 2. Install the dependencies from `requirments.txt`
+ 3. Create a folder `models` inside the repo
+ 4. Inside the `models` folder, clone the models from Hugging Face (see the sketch below)
+ 5. Update the model paths in `ModelMethods.py`
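
As a concrete illustration of steps 2 to 5: after `pip install -r requirments.txt`, the checkpoints can also be fetched programmatically with `huggingface_hub` (pinned in `requirments.txt`). A minimal sketch, assuming the fine-tuned models are published on the Hugging Face Hub; the repo IDs below are placeholders, not real repositories:

```python
# Minimal setup sketch (illustration only): download the three checkpoints
# into the models/ folder that src/ModelMethods.py expects.
from huggingface_hub import snapshot_download

# Placeholder repo IDs: replace with wherever your fine-tuned models live.
MODEL_REPOS = {
    "nep-spell-hft-23epochs": "<user>/nep-spell-hft-23epochs",        # mT5
    "happytt_mBART_plus_10": "<user>/happytt_mBART_plus_10",          # mBART
    "vartat5-using-100K-plus-1": "<user>/vartat5-using-100K-plus-1",  # VartaT5
}

for folder, repo_id in MODEL_REPOS.items():
    snapshot_download(repo_id=repo_id, local_dir=f"models/{folder}")
```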
requirments.txt ADDED
@@ -0,0 +1,334 @@
+ absl-py==2.0.0
+ accelerate @ git+https://github.com/huggingface/accelerate.git@162a82164e9bdcc01a173cbee43b686437aaead8
+ aiohttp==3.8.4
+ aiosignal==1.3.1
+ altair==4.2.0
+ altgraph==0.17.3
+ aniso8601==9.0.1
+ annotated-types==0.6.0
+ anyio==4.2.0
+ appdirs==1.4.4
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asgiref==3.5.0
+ asttokens==2.4.1
+ astunparse==1.6.3
+ async-lru==2.0.4
+ async-timeout==4.0.2
+ attrs==22.1.0
+ autopep8==1.6.0
+ Babel==2.14.0
+ backports.csv==1.0.7
+ backports.entry-points-selectable==1.1.1
+ beautifulsoup4==4.10.0
+ bitsandbytes==0.42.0
+ bleach==6.1.0
+ blinker==1.5
+ blis==0.7.11
+ boto3==1.34.19
+ botocore==1.34.19
+ branca==0.7.0
+ cachetools==5.2.0
+ catalogue==2.0.10
+ certifi==2022.6.15
+ cffi==1.15.1
+ chardet==4.0.0
+ charset-normalizer==2.1.1
+ cheroot==8.6.0
+ CherryPy==18.6.1
+ click==8.1.3
+ cloudpathlib==0.16.0
+ colorama==0.4.5
+ comm==0.2.0
+ commonmark==0.9.1
+ confection==0.1.4
+ contextualSpellCheck==0.4.4
+ contourpy==1.0.6
+ cryptography==38.0.1
+ cycler==0.11.0
+ cymem==2.0.8
+ Cython==3.0.6
+ datasets==2.16.1
+ dateparser==1.1.0
+ debugpy==1.8.0
+ decorator==5.1.1
+ defusedxml==0.7.1
+ dill==0.3.7
+ distlib==0.3.4
+ dj-database-url==0.5.0
+ Django==4.0.4
+ django-ckeditor==6.2.0
+ django-crispy-forms==1.14.0
+ django-heroku==0.3.1
+ django-js-asset==1.2.2
+ docker-pycreds==0.4.0
+ docopt==0.6.2
+ docutils==0.20.1
+ docx2pdf==0.1.8
+ editdistance==0.6.2
+ einops==0.7.0
+ entrypoints==0.4
+ et-xmlfile==1.1.0
+ evaluate==0.4.0
+ exceptiongroup==1.2.0
+ executing==2.0.1
+ ez-setup==0.9
+ fastjsonschema==2.19.1
+ feedparser==6.0.8
+ filelock==3.4.0
+ Flask==2.2.2
+ Flask-API==3.0.post1
+ Flask-Cors==3.0.10
+ Flask-RESTful==0.3.9
+ Flask-SQLAlchemy==2.5.1
+ flatbuffers==23.5.26
+ fonttools==4.38.0
+ fqdn==1.5.1
+ frozenlist==1.3.3
+ fsspec==2023.10.0
+ future==0.18.2
+ gast==0.5.4
+ gitdb==4.0.10
+ GitPython==3.1.29
+ google==3.0.0
+ google-auth==2.25.2
+ google-auth-oauthlib==1.2.0
+ google-pasta==0.2.0
+ GoogleNews==1.6.0
+ greenlet==1.1.3
+ grpcio==1.60.0
+ gunicorn==20.1.0
+ h5py==3.10.0
+ happytransformer==3.0.0
+ heroku==0.1.4
+ huggingface-hub==0.20.1
+ idna==3.3
+ imageio==2.19.3
+ imageio-ffmpeg==0.4.7
+ importlib-metadata==5.1.0
+ instaloader==4.9.6
+ ipykernel==6.28.0
+ ipyleaflet==0.18.1
+ ipython==8.19.0
+ ipywidgets==8.1.1
+ isoduration==20.11.0
+ itsdangerous==2.1.2
+ jaraco.classes==3.2.1
+ jaraco.collections==3.5.1
+ jaraco.context==4.1.1
+ jaraco.functools==3.5.0
+ jaraco.text==3.7.0
+ jedi==0.19.1
+ Jinja2==3.1.2
+ jmespath==1.0.1
+ joblib==1.3.2
+ json5==0.9.14
+ jsonlines==4.0.0
+ jsonpointer==2.4
+ jsonschema==4.17.3
+ jsonschema-specifications==2023.12.1
+ jupyter-events==0.9.0
+ jupyter-lsp==2.2.1
+ jupyter_client==8.6.0
+ jupyter_core==5.5.1
+ jupyter_server==2.12.1
+ jupyter_server_terminals==0.5.1
+ jupyterlab==4.0.10
+ jupyterlab-widgets==3.0.9
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.25.2
+ jwt==1.3.1
+ keras==2.15.0
+ keyring==24.3.0
+ kiwisolver==1.4.4
+ langcodes==3.3.0
+ Levenshtein==0.23.0
+ libclang==16.0.6
+ loralib==0.1.2
+ lxml==4.9.1
+ Markdown==3.5.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.1
+ matplotlib==3.7.1
+ matplotlib-inline==0.1.6
+ mdurl==0.1.2
+ mistune==3.0.2
+ ml-dtypes==0.2.0
+ more-itertools==8.12.0
+ moviepy==1.0.3
+ mpmath==1.3.0
+ multidict==6.0.4
+ multiprocess==0.70.15
+ multitasking==0.0.11
+ murmurhash==1.0.10
+ mysql-connector-python==8.0.31
+ mysqlclient==2.1.0
+ nbclient==0.9.0
+ nbconvert==7.13.1
+ nbformat==5.9.2
+ nest-asyncio==1.5.8
+ networkx==3.2.1
+ news==1.0
+ nh3==0.2.15
+ nltk==3.7
+ notebook_shim==0.2.3
+ numpy==1.23.5
+ oauthlib==3.2.2
+ openai==0.27.2
+ openpyxl==3.1.2
+ opt-einsum==3.3.0
+ overrides==7.4.0
+ packaging==21.3
+ pafy==0.5.5
+ pandas==1.5.2
+ pandocfilters==1.5.0
+ parso==0.8.3
+ Pattern==3.6
+ pdfminer.six==20211012
+ pefile==2023.2.7
+ peft==0.6.0
+ Pillow==9.3.0
+ pipreqs==0.4.11
+ pkginfo==1.9.6
+ platformdirs==4.1.0
+ portalocker==2.8.2
+ portend==3.1.0
+ preshed==3.0.9
+ proglog==0.1.10
+ prometheus-client==0.19.0
+ prompt-toolkit==3.0.43
+ protobuf==3.20.1
+ psutil==5.9.7
+ psycopg2==2.9.3
+ pure-eval==0.2.2
+ pyarrow==10.0.1
+ pyarrow-hotfix==0.6
+ pyasn1==0.4.8
+ pyasn1-modules==0.3.0
+ pycodestyle==2.8.0
+ pycparser==2.21
+ pydantic==2.5.3
+ pydantic_core==2.14.6
+ pydeck==0.8.0
+ Pygments==2.13.0
+ pyinstaller==5.13.0
+ pyinstaller-hooks-contrib==2023.6
+ PyJWT==2.4.0
+ Pympler==1.0.1
+ PyMuPDF==1.23.12
+ PyMuPDFb==1.23.9
+ pyparsing==3.0.9
+ PyQt5==5.15.10
+ PyQt5-Qt5==5.15.2
+ PyQt5-sip==12.13.0
+ pyrsistent==0.19.2
+ python-dateutil==2.8.2
+ python-docx==0.8.11
+ python-dotenv==1.0.0
+ python-json-logger==2.0.7
+ pytorch-pretrained-bert==0.6.2
+ pytube==12.1.0
+ pytz==2022.2.1
+ pytz-deprecation-shim==0.1.0.post0
+ pywin32==306
+ pywin32-ctypes==0.2.2
+ pywinpty==2.0.12
+ PyYAML==6.0.1
+ pyzmq==25.1.2
+ rapidfuzz==3.6.1
+ readme-renderer==42.0
+ referencing==0.32.0
+ regex==2021.11.10
+ requests==2.28.1
+ requests-oauthlib==1.3.1
+ requests-toolbelt==1.0.0
+ responses==0.18.0
+ rfc3339-validator==0.1.4
+ rfc3986==2.0.0
+ rfc3986-validator==0.1.1
+ rich==12.6.0
+ rouge-score==0.1.2
+ rpds-py==0.16.2
+ rsa==4.8
+ s3transfer==0.10.0
+ safetensors==0.4.1
+ scikit-learn==1.4.0
+ scipy==1.8.0
+ seaborn==0.13.0
+ semver==2.13.0
+ Send2Trash==1.8.2
+ sentencepiece==0.1.99
+ sentry-sdk==1.39.2
+ setproctitle==1.3.3
+ sgmllib3k==1.0.0
+ six==1.16.0
+ smart-open==6.4.0
+ smmap==5.0.0
+ sniffio==1.3.0
+ soupsieve==2.3.1
+ spacy==3.7.2
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ SQLAlchemy==1.4.41
+ sqlparse==0.4.2
+ srsly==2.4.8
+ stack-data==0.6.3
+ streamlit==1.15.1
+ streamlit-menu==1.0.9
+ streamlit-option-menu==0.3.12
+ sympy==1.12
+ tempora==5.0.1
+ tenacity==8.2.3
+ tensorboard==2.15.1
+ tensorboard-data-server==0.7.2
+ tensorflow==2.15.0
+ tensorflow-estimator==2.15.0
+ tensorflow-intel==2.15.0
+ tensorflow-io-gcs-filesystem==0.31.0
+ termcolor==2.4.0
+ terminado==0.18.0
+ test-nep-spell-synthetic-datautils==0.1.0
+ thinc==8.2.2
+ threadpoolctl==3.2.0
+ tinycss2==1.2.1
+ tokenizers==0.15.1
+ toml==0.10.2
+ tomli==2.0.1
+ toolz==0.12.0
+ torch==1.13.1
+ torchdata==0.5.1
+ tornado==6.2
+ tqdm==4.63.0
+ traitlets==5.14.0
+ traittypes==0.2.1
+ transformers @ git+https://github.com/huggingface/transformers.git@5b5e71dc41734a9798f3535bbd5039ab91883079
+ twine==5.0.0
+ typer==0.9.0
+ types-python-dateutil==2.8.19.14
+ typing_extensions==4.4.0
+ tzdata==2022.7
+ tzlocal==4.2
+ uri-template==1.3.0
+ urllib3==1.26.12
+ validators==0.20.0
+ virtualenv==20.10.0
+ wandb==0.16.2
+ wasabi==1.1.2
+ watchdog==2.1.9
+ wcwidth==0.2.12
+ weasel==0.3.4
+ webcolors==1.13
+ webencodings==0.5.1
+ websocket-client==1.7.0
+ Werkzeug==2.2.2
+ whitenoise==6.0.0
+ widgetsnbextension==4.0.9
+ wrapt==1.14.1
+ xxhash==3.4.1
+ xyzservices==2023.10.1
+ yarg==0.1.9
+ yarl==1.8.2
+ yfinance==0.1.87
+ zc.lockfile==2.0
+ zipp==3.11.0
src/Demo.py ADDED
@@ -0,0 +1,66 @@
+ import streamlit as st
+ import pandas as pd
+
+ from ModelMethods import generate
+
+
+ st.set_page_config(page_title="DEMO", page_icon="👋", layout="wide")
+
+
+ # Basic configuration for the app
+ appTitle = "Nepali Spell Correction"
+
+
+ # Some test examples
+ example = (
+     "अबको स्थायी कमिटी ओली सरकारलाई दीएको समर्थन फिर्ताको तयारि रहेको साहले जानकारी दिए।"
+ )
+ examples = {
+     "Examples": "",
+     "अखिलेस झा धेरै दिनदेखि अनुपस्थीत थिए ।": "अखिलेस झा धेरै दिनदेखि अनुपस्थीत थिए ।",
+     "आठौँ तह उपनिर्देषक पदमा दुई जना उत्तीर्ण भएका छन्।": "आठौँ तह उपनिर्देषक पदमा दुई जना उत्तीर्ण भएका छन्।",
+     "उनीहरूमा रोगसँग लड्ने क्षमता मज्जाले बिकसित भइसकेको हुँदैन।": "उनीहरूमा रोगसँग लड्ने क्षमता मज्जाले बिकसित भइसकेको हुँदैन।",
+ }
+
+
+ def main():
+     st.header(appTitle)
+     left_column, right_column = st.columns(2)
+     correctedText = None
+
+     with left_column:
+         # A list keeps the radio options in a stable order (a set would not)
+         model_options = ["mT5", "mBART", "VartaT5"]
+
+         # Display the radio options in a single line
+         selected_model = st.radio("Select the model", model_options, index=0)
+
+         # Create a dropdown menu
+         selected_example_key = st.selectbox("Select an example", list(examples.keys()))
+         # Display the selected example text in a text area
+         selected_example_text = examples[selected_example_key]
+
+         # Get user input
+         user_input = st.text_area(
+             "Enter a Nepali Sentence: ",
+             selected_example_text,
+             max_chars=512,  # Set the maximum input length to 512 characters
+         )
+         if st.button("Check Spelling"):
+             if user_input:
+                 correctedText = generate(selected_model, user_input)
+                 # # Perform grammar correction
+                 # st.subheader("Corrected Text:")
+                 # st.write([f"{line['score']:.2f}: {line['sequence']}" for line in correctedText])
+             else:
+                 st.warning("Please enter some text to check.")
+     with right_column:
+         if correctedText is not None:
+             st.write("Corrected Text:")
+             # st.write([f"{line['score']:.2f}: {line['sequence']}" for line in correctedText])
+             df = pd.DataFrame(correctedText, columns=["score", "sequence"])
+             st.table(df)
+
+
+ if __name__ == "__main__":
+     main()
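
For reference, the right-hand table assumes `generate` returns a list of score/sequence dicts (see `postProcessOutput` in `ModelMethods.py`). A minimal sketch of that contract, with placeholder values rather than real model output:

```python
# Illustration only: the shape of the data Demo.py renders with st.table().
import pandas as pd

rows = [
    {"sequence": "corrected sentence (beam 1)", "score": 0.87},  # placeholder
    {"sequence": "corrected sentence (beam 2)", "score": 0.09},  # placeholder
]
df = pd.DataFrame(rows, columns=["score", "sequence"])
print(df)
```

Once the dependencies and models are in place, the demo itself starts with `streamlit run Demo.py` from inside `src/` (the model paths in `ModelMethods.py` are relative to `src/`).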
src/ModelMethods.py ADDED
@@ -0,0 +1,166 @@
+ # Import HappyTextToText from Happy Transformer
+ from happytransformer import HappyTextToText, TTSettings
+
+ # Hugging Face Transformers
+ from transformers import (
+     MT5ForConditionalGeneration,
+     MT5Tokenizer,
+     MBartForConditionalGeneration,
+     MBartTokenizer,
+     T5ForConditionalGeneration,
+     T5TokenizerFast,
+     GenerationConfig,
+ )
+
+ import torch
+ import re
+
+
+ """
+ Some global variables.
+ Add the paths to the models here.
+ """
+ mt5ModelPath = "../models/nep-spell-hft-23epochs"
+ mbartModelPath = "../models/happytt_mBART_plus_10"
+ vartat5ModelPath = "../models/vartat5-using-100K-plus-1"
+
+
+ """
+ Function: generate
+
+ This function takes a model name and input text as parameters and
+ returns the output generated by the specified model.
+ It supports multiple models such as mT5, mBART, and VartaT5.
+ If the specified model is not available,
+ it returns a message indicating the unavailability of the model.
+
+ Parameters:
+ - model (str): Name of the model to use for text generation.
+ - text (str): Input text for the model to generate output from.
+
+ Returns:
+ - list | str: Scored output sequences from the specified model, or a message indicating model unavailability.
+ """
+
+
+ def generate(model, text):
+
+     if model == "mT5":
+         return mt5Inference(text)
+     elif model == "mBART":
+         return mbartInference(text)
+     elif model == "VartaT5":
+         return vartat5Inference(text)
+     else:
+         return f"Model: {model} not available"
+
+ # Sample test input: काकाले काकिलाइ माया गर्नू हुन्छ।
+
+
+
+ """
+ Below are the 3 different models for inference
+ """
+ def mt5Inference(text):
+     print("Processing mt5")
+
+     model = MT5ForConditionalGeneration.from_pretrained(mt5ModelPath)
+     tokenizer = MT5Tokenizer.from_pretrained(mt5ModelPath)
+     input_ids = tokenizer("grammar: " + text, return_tensors="pt").input_ids
+     outputs = model.generate(
+         input_ids=input_ids,
+         max_length=512,
+         num_beams=5,
+         num_return_sequences=5,
+         return_dict_in_generate=True,
+         output_scores=True,
+     )
+     sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
+     return postProcessOutput(sequences, outputs["sequences_scores"])
+
+
+ def mbartInference(text):
+     print("Processing mbart")
+     tokenizer = MBartTokenizer.from_pretrained(
+         mbartModelPath, src_lang="ne_NP", tgt_lang="ne_NP"
+     )
+     model = MBartForConditionalGeneration.from_pretrained(mbartModelPath)
+     inputs = tokenizer("grammar: " + text, return_tensors="pt")
+     outputs = model.generate(
+         **inputs,
+         decoder_start_token_id=tokenizer.lang_code_to_id["ne_NP"],
+         max_length=512,
+         num_beams=5,
+         num_return_sequences=5,
+         return_dict_in_generate=True,
+         output_scores=True,
+     )
+     sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
+     return postProcessOutput(sequences, outputs["sequences_scores"])
+     # return outputs
+
+
+ def vartat5Inference(text):
+     print("Processing varta")
+     model = T5ForConditionalGeneration.from_pretrained(vartat5ModelPath)
+     # return "model ready"
+     tokenizer = T5TokenizerFast.from_pretrained(vartat5ModelPath)
+     inputs = tokenizer("grammar: " + text, return_tensors="pt")
+     outputs = model.generate(
+         **inputs,
+         max_length=512,
+         num_beams=5,
+         num_return_sequences=5,
+         return_dict_in_generate=True,
+         output_scores=True,
+     )
+     sequences = tokenizer.batch_decode(outputs["sequences"], skip_special_tokens=True)
+     return postProcessOutput(sequences, outputs["sequences_scores"])
+
+
+
+ """
+ Post-processing of the model output
+ """
+
+ def postProcessOutput(sequences, sequences_scores):
+     probabilities = torch.exp(sequences_scores)  # beam scores are log-probabilities
+     unique_sequences = set()
+     # Initialize the list to store filtered items
+     filtered_outputs = []
+
+     # Iterate through sequences and their probabilities
+     for sequence, score in zip(sequences, probabilities):
+         # Check if the sequence is not in the set of unique sequences
+         if sequence not in unique_sequences:
+             # Add the sequence to the set of unique sequences
+             unique_sequences.add(sequence)
+             # Append the sequence and score to the filtered_outputs list
+             filtered_outputs.append({"sequence": sequence, "score": score.item()})
+
+     return filtered_outputs
+
+
+ """
+ For working with paragraph processing
+ """
+
+ def split_nepali_paragraph_into_sentences(nepali_text):
+
+     # Define a regex pattern to split sentences:
+     # split after the Nepali danda (।), question marks, exclamation marks, and newlines
+     sentence_pattern = r"(?<=[।?!\n])\s+"
+
+     # Split the Nepali text into sentences
+     sentences = re.split(sentence_pattern, nepali_text)
+
+     return sentences
+
+
+ def process_paragraph(model, paragraph):
+     sentenceList = split_nepali_paragraph_into_sentences(paragraph)
+     out_sentence = []
+     for s in sentenceList:
+         out_sentence.append(generate(model, s)[0]["sequence"])  # keep the top-scoring beam
+     nepali_paragraph = " ".join(out_sentence)
+     return nepali_paragraph
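
A minimal usage sketch for this module outside Streamlit, assuming the three checkpoints have been downloaded as described in the README (run from inside `src/`, since the `*ModelPath` globals are relative paths):

```python
# Usage sketch (illustration only): exercise generate() and process_paragraph()
# directly. The input sentence is one of the examples from src/Demo.py.
from ModelMethods import generate, process_paragraph

candidates = generate("mT5", "अखिलेस झा धेरै दिनदेखि अनुपस्थीत थिए ।")
for item in candidates:  # list of {"sequence", "score"} dicts, best beam first
    print(f"{item['score']:.2f}  {item['sequence']}")

# Paragraphs are split on danda/?/!/newline boundaries, each sentence is
# corrected independently, and the top candidates are rejoined.
print(process_paragraph("mT5", "पहिलो वाक्य । दोस्रो वाक्य ?"))
```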
src/pages/LiteratureReview.py ADDED
@@ -0,0 +1,8 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="Literature Review",
+     page_icon="👋",
+ )
+
+ st.write("Literature Review")
src/pages/References.py ADDED
@@ -0,0 +1,14 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="References",
+     page_icon="👋",
+     layout="wide"
+ )
+
+
+
+
+ st.sidebar.header("References")
+
+ st.write("References Here")