Harsh502s committed
Commit ade606e • 1 Parent(s): 94d512b
Models/bin.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e0a4fbec22e1a6e06396e8b1c384d5d541b6c0dfd2cec61a8c9a4f7e1179db0c
-size 756
Models/stackexchange_topic_model.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:81f72e9da968496087c2dbe77cc06e1937789099c7b69380e9cebd5ab0a357f8
-size 438242069
Models/tag_model.h5 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8b304a86faeda3cd1ff63af09e57bec8ba0c98d5bdb30613e7fcb08ee1f57b9c
-size 77937800
Models/token.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:72c3d62f1cc157956c2131619934ced13e82052e23fa4efe60f104a6632d2a5c
-size 1961509
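The four Models/* hunks above delete Git LFS pointer files rather than the binaries themselves: each pointer is three lines recording the spec version, a sha256 object id, and the payload size in bytes (e.g. 438242069 bytes for the topic model). A minimal sketch of reading such a pointer, where parse_lfs_pointer is a hypothetical helper and not part of git or git-lfs:

```python
# Sketch: split a Git LFS pointer file into its three key/value fields.
# parse_lfs_pointer is a hypothetical helper, not a git or git-lfs API.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:e0a4fbec22e1a6e06396e8b1c384d5d541b6c0dfd2cec61a8c9a4f7e1179db0c
size 756
"""
print(parse_lfs_pointer(pointer)["size"])  # -> 756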
app.py DELETED
@@ -1,37 +0,0 @@
-import streamlit as st
-from st_pages import Page, show_pages
-
-# Register the app's pages in the sidebar
-show_pages(
-    [
-        Page(r"app.py", "Home", "🏠"),
-        Page(r"Pages/1_📊_Topic Model Results.py", "Topic Model Result", "📊"),
-        Page(r"Pages/2_🤖_Models.py", "Models", "🤖"),
-        Page(r"Pages/3_👋_About.py", "About", "👋"),
-    ]
-)
-
-st.set_page_config(
-    page_title="Autonomous Text Tagging App",
-    page_icon="📝",
-    layout="wide",
-    initial_sidebar_state="expanded",
-)
-
-
-# Display the main page of the app with instructions on how to use it
-def main():
-    st.title("Autonomous Text Tagging App")
-    st.subheader(
-        "This application shows a demo of different supervised and unsupervised approaches taken in the field of NLP to give relevant tags to the text."
-    )
-    st.subheader("This is a multi-page app.")
-    st.write("1. You can navigate between pages by clicking on the sidebar.")
-    st.write("2. The Topic Modeling Results page shows the results of BERTopic.")
-    st.write("3. The Models page gives a demo of all the models used in this app.")
-    st.write("4. The About page gives information about the creator, code, and data.")
-    st.divider()
-
-
-if __name__ == "__main__":
-    main()
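One detail worth flagging in the deleted app.py: show_pages() runs before st.set_page_config(), while Streamlit expects set_page_config to be the first Streamlit command in a script (it raises an exception otherwise), and st_pages may already count as one. A minimal sketch of the safer ordering, assuming the same st_pages API the file already uses:

```python
# Sketch: st.set_page_config first, then page registration.
import streamlit as st
from st_pages import Page, show_pages

# Must come before any other Streamlit call in the script
st.set_page_config(
    page_title="Autonomous Text Tagging App",
    page_icon="📝",
    layout="wide",
    initial_sidebar_state="expanded",
)

show_pages(
    [
        Page(r"app.py", "Home", "🏠"),
        Page(r"Pages/1_📊_Topic Model Results.py", "Topic Model Result", "📊"),
        Page(r"Pages/2_🤖_Models.py", "Models", "🤖"),
        Page(r"Pages/3_👋_About.py", "About", "👋"),
    ]
)
```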
pages/1_📊_Topic Model Results.py DELETED
@@ -1,41 +0,0 @@
-import streamlit as st
-from bertopic import BERTopic
-
-
-@st.cache_resource
-def load_model():
-    return BERTopic.load(r"Models/stackexchange_topic_model.pkl")
-
-
-bertopic_model = load_model()
-
-
-def topic_model_results():
-    st.title("Topic Model Results")
-    tab1, tab2, tab3, tab4, tab5 = st.tabs(
-        [
-            "Topic Word Score",
-            "Intertopic Distance Map",
-            "Topic Probability Distribution",
-            "Visualize Hierarchical Topics",
-            "Visualize Topics Heatmap",
-        ]
-    )
-    with tab1:
-        st.write(bertopic_model.visualize_barchart(top_n_topics=20))
-    with tab2:
-        st.write(bertopic_model.visualize_topics())
-    with tab3:
-        st.write(
-            bertopic_model.visualize_distribution(
-                bertopic_model.probabilities_[0], min_probability=0.015
-            )
-        )
-    with tab4:
-        st.write(bertopic_model.visualize_hierarchy())
-    with tab5:
-        st.write(bertopic_model.visualize_heatmap())
-
-
-if __name__ == "__main__":
-    topic_model_results()
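A side note on rendering in the page above: BERTopic's visualize_* helpers return Plotly figures, so st.plotly_chart is the more idiomatic call than st.write (st.write does delegate Plotly figures to it, but exposes no display options). A small sketch under that assumption:

```python
# Sketch: render a BERTopic visualization via st.plotly_chart.
import streamlit as st
from bertopic import BERTopic

model = BERTopic.load(r"Models/stackexchange_topic_model.pkl")
fig = model.visualize_barchart(top_n_topics=20)  # returns a Plotly Figure
st.plotly_chart(fig, use_container_width=True)   # stretch to column width
```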
pages/2_🤖_Models.py DELETED
@@ -1,191 +0,0 @@
-import streamlit as st
-from streamlit_extras.tags import tagger_component
-import re
-import pickle
-from keybert import KeyBERT
-from bertopic import BERTopic
-from keras.models import load_model
-from keras.preprocessing.sequence import pad_sequences
-
-
-# Load the BERTopic, KeyBERT, and CNN models plus the tokenizer and label binarizer
-@st.cache_resource
-def load_models():
-    return (
-        BERTopic.load(r"Models/stackexchange_topic_model.pkl"),
-        KeyBERT("all-MiniLM-L6-v2"),
-        load_model(r"Models/tag_model.h5"),
-        pickle.load(open(r"Models/token.pkl", "rb")),
-        pickle.load(open(r"Models/bin.pkl", "rb")),
-    )
-
-
-# Load the models into memory
-bertopic_model, keybert_model, cnn_model, tokenizer, binarizer = load_models()
-
-
-# Clean the input text
-def clean_text(text):
-    text = re.sub(r"<.*?>", "", text)
-    text = re.sub(r"[^A-Za-z']", " ", text)
-    text = re.sub(r"\s+", " ", text)
-    return text
-
-
-# Assign tags to the input text using the CNN model
-def tag_cnn_model(text):
-    text = clean_text(text)
-    text = tokenizer.texts_to_sequences([text])
-    text_padded = pad_sequences(text, maxlen=512)
-    q_pred = cnn_model.predict(text_padded)
-    q_pred = (q_pred >= 0.25).astype(int)
-    return binarizer.inverse_transform(q_pred)
-
-
-# Retrieve the keyphrases from the input text using the KeyBERT model
-def retrieve_keyphrases(text, n, ngram_range):
-    keywords = keybert_model.extract_keywords(
-        text,
-        keyphrase_ngram_range=ngram_range,
-        top_n=n,
-        diversity=0.5,
-        use_maxsum=True,
-        use_mmr=True,
-        seed_keywords=[
-            "machine-learning",
-            "r",
-            "regression",
-            "deep-learning",
-            "neural-networks",
-            "data-request",
-            "python",
-            "reinforcement-learning",
-            "classification",
-            "time-series",
-            "probability",
-            "neural-network",
-            "distributions",
-            "bayesian",
-            "hypothesis-testing",
-            "keras",
-            "mathematical-statistics",
-            "scikit-learn",
-            "logistic",
-            "convolutional-neural-networks",
-            "clustering",
-            "tensorflow",
-            "terminology",
-            "nlp",
-            "correlation",
-            "self-study",
-            "normal-distribution",
-            "geospatial",
-            "cross-validation",
-            "optimization",
-            "random-forest",
-            "mixed-model",
-            "data-mining",
-            "feature-selection",
-            "pca",
-            "references",
-            "computer-vision",
-            "data-visualization",
-            "confidence-interval",
-            "generalized-linear-model",
-            "variance",
-            "natural-language-processing",
-            "dataset",
-            "svm",
-            "training",
-            "maximum-likelihood",
-            "statistical-significance",
-            "gradient-descent",
-            "multiple-regression",
-            "estimation",
-        ],
-    )
-    return sorted(keywords, key=lambda x: x[1], reverse=True)
-
-
-# Find the most similar topics for the input text using the BERTopic model
-def output_unsupervised(text, n):
-    new_review = text
-    similar_topics, similarity = bertopic_model.find_topics(new_review, top_n=n)
-    similar_topics = sorted(similar_topics)
-    for i in range(n):
-        tags = bertopic_model.get_topic(similar_topics[i])
-        tags = [tag[0] for tag in tags]
-        tagger_component(f"Tags from cluster {i+1}:", tags, color_name="red")
-
-
-# Display the supervised model page of the app
-def supervised_page():
-    st.header("Supervised Model")
-    text = st.text_area("Enter text to assign tags", height=200, key="supervised_text")
-    text = clean_text(text)
-    if st.button("Assign tags", key="supervised_button"):
-        tags = tag_cnn_model(text)[0]
-        tagger_component("Tags:", tags, color_name="red")
-
-
-# Display the unsupervised (BERTopic) model page of the app
-def unsupervised_page_bertopic():
-    st.header("Unsupervised Model Using BERTopic Model")
-    text = st.text_area(
-        "Enter text to assign tags", height=200, key="unsupervised_text_bertopic"
-    )
-    text = clean_text(text)
-    n = st.number_input(
-        "Enter number of tags to assign", value=5, key="unsupervised_n_bertopic"
-    )
-    if st.button("Assign tags", key="unsupervised_button_bertopic"):
-        output_unsupervised(text, n)
-
-
-# Display the unsupervised (KeyBERT) model page of the app
-def unsupervised_page_keybert():
-    st.header("Unsupervised Model Using KeyBERT Model")
-    text = st.text_area(
-        "Enter text to assign tags", height=200, key="unsupervised_text_keybert"
-    )
-    text = clean_text(text)
-    n = st.number_input(
-        "Enter number of tags to assign", value=10, key="unsupervised_n_keybert"
-    )
-    ngram_range_lower = st.number_input(
-        "Enter lower limit of ngram range",
-        value=1,
-        min_value=1,
-        max_value=6,
-        key="unsupervised_ngram_lower",
-    )
-    ngram_range_upper = st.number_input(
-        "Enter upper limit of ngram range",
-        value=3,
-        min_value=1,
-        max_value=6,
-        key="unsupervised_ngram_upper",
-    )
-    ngram_range = (ngram_range_lower, ngram_range_upper)
-    if st.button("Assign tags", key="unsupervised_button_keybert"):
-        topics = retrieve_keyphrases(text, n, ngram_range)
-        topics = [topic[0] for topic in topics]
-        tagger_component("Tags:", topics, color_name="red")
-
-
-# Display the model selection page of the app
-def model_page():
-    st.title("Select a model to use:")
-    tab1, tab2, tab3 = st.tabs(
-        ["Supervised Using CNN", "Unsupervised-BERTopic", "Unsupervised-KeyBERT"]
-    )
-    with tab1:
-        supervised_page()
-    with tab2:
-        unsupervised_page_bertopic()
-    with tab3:
-        unsupervised_page_keybert()
-
-
-if __name__ == "__main__":
-    model_page()
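For reference, the supervised path above is a standard multi-label setup: Keras tokenizer → pad_sequences(maxlen=512) → per-tag CNN probabilities → a fixed 0.25 cutoff → binarizer.inverse_transform to map the indicator row back to tag names. The commit does not say what bin.pkl contains, so as a sketch assume a fitted scikit-learn MultiLabelBinarizer; the thresholding step then looks like this on toy data:

```python
# Sketch of the thresholding step in tag_cnn_model(), on toy data.
# Assumption: bin.pkl holds a fitted sklearn MultiLabelBinarizer.
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

binarizer = MultiLabelBinarizer()
binarizer.fit([["keras", "nlp", "python"]])  # toy 3-tag label space

q_pred = np.array([[0.91, 0.12, 0.44]])      # fake sigmoid outputs for one document
q_pred = (q_pred >= 0.25).astype(int)        # per-label cutoff, as in the app -> [[1, 0, 1]]
print(binarizer.inverse_transform(q_pred))   # [('keras', 'python')]
```

Also worth noting: use_maxsum and use_mmr select alternative diversification strategies in KeyBERT, so at most one of them takes effect when both flags are set, as in retrieve_keyphrases above.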
pages/3_👋_About.py DELETED
@@ -1,28 +0,0 @@
-import streamlit as st
-
-
-# Display the about page of the app with information about the creator, code, and data
-def about_page():
-    st.header("About")
-    st.write(
-        "This app was created by [Harshit Singh](https://harsh502s.github.io), Poorvi Singh, and Samruddhi Raskar as a part of their MSc Data Science 3rd semester project."
-    )
-    st.write("The code for this app can be found [here]( ).")
-    st.write(
-        "The data on which these models are trained can be found [here](https://www.kaggle.com/datasets/harsh502s/stackexchange-tag-dataset)."
-    )
-    st.subheader("Models used in this app are:")
-    st.write(
-        "1. [BERTopic](https://maartengr.github.io/BERTopic/api/bertopic.html#:~:text=BERTopic%20is%20a%20topic%20modeling,words%20in%20the%20topic%20descriptions.)"
-    )
-    st.write(
-        "2. [KeyBERT](https://maartengr.github.io/KeyBERT/#:~:text=KeyBERT%20is%20a%20minimal%20and,most%20similar%20to%20a%20document.)"
-    )
-    st.write(
-        "3. [CNN](https://www.tensorflow.org/tutorials/text/text_classification_rnn)"
-    )
-
-
-if __name__ == "__main__":
-    about_page()
requirements.txt DELETED
@@ -1,160 +0,0 @@
-absl-py==2.0.0
-altair==5.1.2
-asttokens==2.4.0
-astunparse==1.6.3
-attrs==23.1.0
-backcall==0.2.0
-beautifulsoup4==4.12.2
-bertopic==0.15.0
-blinker==1.6.3
-cachetools==5.3.1
-certifi==2023.7.22
-charset-normalizer==3.3.0
-click==8.1.7
-colorama==0.4.6
-comm==0.1.4
-contourpy==1.1.1
-cycler==0.12.1
-Cython==0.29.36
-debugpy==1.8.0
-decorator==5.1.1
-entrypoints==0.4
-exceptiongroup==1.1.3
-executing==2.0.0
-Faker==19.10.0
-fastjsonschema==2.18.1
-favicon==0.7.0
-filelock==3.12.4
-flatbuffers==23.5.26
-fonttools==4.43.1
-fsspec==2023.9.2
-gast==0.5.4
-gitdb==4.0.10
-GitPython==3.1.37
-google-auth==2.23.3
-google-auth-oauthlib==1.0.0
-google-pasta==0.2.0
-grpcio==1.59.0
-h5py==3.10.0
-hdbscan==0.8.33
-htbuilder==0.6.2
-huggingface-hub==0.17.3
-idna==3.4
-importlib-metadata==6.8.0
-ipykernel==6.25.2
-ipython==8.16.1
-jedi==0.19.1
-Jinja2==3.1.2
-joblib==1.3.2
-jsonschema==4.19.1
-jsonschema-specifications==2023.7.1
-jupyter_client==8.3.1
-jupyter_core==5.3.2
-keras==2.14.0
-keybert==0.8.3
-kiwisolver==1.4.5
-libclang==16.0.6
-llvmlite==0.41.0
-lxml==4.9.3
-Markdown==3.5
-markdown-it-py==3.0.0
-markdownlit==0.0.7
-MarkupSafe==2.1.3
-matplotlib==3.8.0
-matplotlib-inline==0.1.6
-mdurl==0.1.2
-ml-dtypes==0.2.0
-more-itertools==10.1.0
-mpmath==1.3.0
-nbformat==5.9.2
-nest-asyncio==1.5.8
-networkx==3.1
-nltk==3.8.1
-numba==0.58.0
-numpy==1.25.2
-oauthlib==3.2.2
-opt-einsum==3.3.0
-packaging==23.2
-pandas==2.1.1
-parso==0.8.3
-pickleshare==0.7.5
-Pillow==10.0.1
-platformdirs==3.11.0
-plotly==5.17.0
-prompt-toolkit==3.0.39
-protobuf==4.24.4
-psutil==5.9.5
-pure-eval==0.2.2
-pyarrow==13.0.0
-pyasn1==0.5.0
-pyasn1-modules==0.3.0
-pydeck==0.8.1b0
-Pygments==2.16.1
-pymdown-extensions==10.3
-pynndescent==0.5.10
-pyparsing==3.1.1
-python-dateutil==2.8.2
-pytz==2023.3.post1
-pywin32==306
-PyYAML==6.0.1
-pyzmq==25.1.1
-referencing==0.30.2
-regex==2023.10.3
-requests==2.31.0
-requests-oauthlib==1.3.1
-rich==13.6.0
-rpds-py==0.10.4
-rsa==4.9
-safetensors==0.4.0
-scikit-learn==1.2.2
-scipy==1.11.3
-seaborn==0.13.0
-sentence-transformers==2.2.2
-sentencepiece==0.1.99
-six==1.16.0
-smmap==5.0.1
-soupsieve==2.5
-st-annotated-text==4.0.1
-st-pages==0.4.5
-stack-data==0.6.3
-streamlit==1.27.2
-streamlit-camera-input-live==0.2.0
-streamlit-card==0.0.61
-streamlit-embedcode==0.1.2
-streamlit-extras==0.3.4
-streamlit-faker==0.0.2
-streamlit-image-coordinates==0.1.6
-streamlit-keyup==0.2.0
-streamlit-tags==1.2.8
-streamlit-toggle-switch==1.0.2
-streamlit-vertical-slider==1.0.2
-sympy==1.12
-tenacity==8.2.3
-tensorboard==2.14.1
-tensorboard-data-server==0.7.1
-tensorflow==2.14.0
-tensorflow-estimator==2.14.0
-tensorflow-intel==2.14.0
-tensorflow-io-gcs-filesystem==0.31.0
-termcolor==2.3.0
-threadpoolctl==3.2.0
-tokenizers==0.14.1
-toml==0.10.2
-toolz==0.12.0
-torch==2.1.0
-torchvision==0.16.0
-tornado==6.3.3
-tqdm==4.66.1
-traitlets==5.11.2
-transformers==4.34.0
-typing_extensions==4.8.0
-tzdata==2023.3
-tzlocal==5.1
-umap-learn==0.5.4
-urllib3==2.0.6
-validators==0.22.0
-watchdog==3.0.0
-wcwidth==0.2.8
-Werkzeug==3.0.0
-wrapt==1.14.1
-zipp==3.17.0