Harsh502s committed
Commit ade606e • 1 Parent(s): 94d512b
Models/bin.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e0a4fbec22e1a6e06396e8b1c384d5d541b6c0dfd2cec61a8c9a4f7e1179db0c
-size 756
Models/stackexchange_topic_model.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:81f72e9da968496087c2dbe77cc06e1937789099c7b69380e9cebd5ab0a357f8
-size 438242069
Models/tag_model.h5 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8b304a86faeda3cd1ff63af09e57bec8ba0c98d5bdb30613e7fcb08ee1f57b9c
-size 77937800
Models/token.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:72c3d62f1cc157956c2131619934ced13e82052e23fa4efe60f104a6632d2a5c
-size 1961509
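The four Models/* hunks above delete Git LFS pointer files rather than the binaries themselves: each pointer is three lines recording the spec version, a sha256 object id, and the payload size in bytes (e.g. 438242069 bytes for the topic model). A minimal sketch of reading such a pointer, where parse_lfs_pointer is a hypothetical helper and not part of git or git-lfs:

```python
# Sketch: split a Git LFS pointer file into its three key/value fields.
# parse_lfs_pointer is a hypothetical helper, not a git or git-lfs API.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:e0a4fbec22e1a6e06396e8b1c384d5d541b6c0dfd2cec61a8c9a4f7e1179db0c
size 756
"""
print(parse_lfs_pointer(pointer)["size"])  # -> 756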
app.py DELETED
@@ -1,37 +0,0 @@
-import streamlit as st
-from st_pages import Page, show_pages
-
-# Register the app's pages in the sidebar
-show_pages(
-    [
-        Page(r"app.py", "Home", "🏠"),
-        Page(r"Pages/1_📊_Topic Model Results.py", "Topic Model Result", "📊"),
-        Page(r"Pages/2_🤖_Models.py", "Models", "🤖"),
-        Page(r"Pages/3_👋_About.py", "About", "👋"),
-    ]
-)
-
-st.set_page_config(
-    page_title="Autonomous Text Tagging App",
-    page_icon="📝",
-    layout="wide",
-    initial_sidebar_state="expanded",
-)
-
-
-# Display the main page of the app with instructions on how to use it
-def main():
-    st.title("Autonomous Text Tagging App")
-    st.subheader(
-        "This application shows a demo of different supervised and unsupervised approaches taken in the field of NLP to give relevant tags to the text."
-    )
-    st.subheader("This is a multi-page app.")
-    st.write("1. You can navigate between pages by clicking on the sidebar.")
-    st.write("2. The Topic Modeling Results page shows the results of BERTopic.")
-    st.write("3. The Models page gives a demo of all the models used in this app.")
-    st.write("4. The About page gives information about the creator, code, and data.")
-    st.divider()
-
-
-if __name__ == "__main__":
-    main()
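One detail worth flagging in the deleted app.py: show_pages() runs before st.set_page_config(), while Streamlit expects set_page_config to be the first Streamlit command in a script (it raises an exception otherwise), and st_pages may already count as one. A minimal sketch of the safer ordering, assuming the same st_pages API the file already uses:

```python
# Sketch: st.set_page_config first, then page registration.
import streamlit as st
from st_pages import Page, show_pages

# Must come before any other Streamlit call in the script
st.set_page_config(
    page_title="Autonomous Text Tagging App",
    page_icon="📝",
    layout="wide",
    initial_sidebar_state="expanded",
)

show_pages(
    [
        Page(r"app.py", "Home", "🏠"),
        Page(r"Pages/1_📊_Topic Model Results.py", "Topic Model Result", "📊"),
        Page(r"Pages/2_🤖_Models.py", "Models", "🤖"),
        Page(r"Pages/3_👋_About.py", "About", "👋"),
    ]
)
```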
pages/1_📊_Topic Model Results.py DELETED
@@ -1,41 +0,0 @@
-import streamlit as st
-from bertopic import BERTopic
-
-
-@st.cache_resource
-def load_model():
-    return BERTopic.load(r"Models/stackexchange_topic_model.pkl")
-
-
-bertopic_model = load_model()
-
-
-def topic_model_results():
-    st.title("Topic Model Results")
-    tab1, tab2, tab3, tab4, tab5 = st.tabs(
-        [
-            "Topic Word Score",
-            "Intertopic Distance Map",
-            "Topic Probability Distribution",
-            "Visualize Hierarchical Topics",
-            "Visualize Topics Heatmap",
-        ]
-    )
-    with tab1:
-        st.write(bertopic_model.visualize_barchart(top_n_topics=20))
-    with tab2:
-        st.write(bertopic_model.visualize_topics())
-    with tab3:
-        st.write(
-            bertopic_model.visualize_distribution(
-                bertopic_model.probabilities_[0], min_probability=0.015
-            )
-        )
-    with tab4:
-        st.write(bertopic_model.visualize_hierarchy())
-    with tab5:
-        st.write(bertopic_model.visualize_heatmap())
-
-
-if __name__ == "__main__":
-    topic_model_results()
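A side note on rendering in the page above: BERTopic's visualize_* helpers return Plotly figures, so st.plotly_chart is the more idiomatic call than st.write (st.write does delegate Plotly figures to it, but exposes no display options). A small sketch under that assumption:

```python
# Sketch: render a BERTopic visualization via st.plotly_chart.
import streamlit as st
from bertopic import BERTopic

model = BERTopic.load(r"Models/stackexchange_topic_model.pkl")
fig = model.visualize_barchart(top_n_topics=20)  # returns a Plotly Figure
st.plotly_chart(fig, use_container_width=True)   # stretch to column width
```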
pages/2_🤖_Models.py DELETED
@@ -1,191 +0,0 @@
-import streamlit as st
-from streamlit_extras.tags import tagger_component
-import re
-import pickle
-from keybert import KeyBERT
-from bertopic import BERTopic
-from keras.models import load_model
-from keras.preprocessing.sequence import pad_sequences
-
-
-# Load the BERTopic, KeyBERT, and CNN models plus the tokenizer and label binarizer
-@st.cache_resource
-def load_models():
-    return (
-        BERTopic.load(r"Models/stackexchange_topic_model.pkl"),
-        KeyBERT("all-MiniLM-L6-v2"),
-        load_model(r"Models/tag_model.h5"),
-        pickle.load(open(r"Models/token.pkl", "rb")),
-        pickle.load(open(r"Models/bin.pkl", "rb")),
-    )
-
-
-# Load the models into memory
-bertopic_model, keybert_model, cnn_model, tokenizer, binarizer = load_models()
-
-
-# Clean the input text
-def clean_text(text):
-    text = re.sub(r"<.*?>", "", text)
-    text = re.sub(r"[^A-Za-z']", " ", text)
-    text = re.sub(r"\s+", " ", text)
-    return text
-
-
-# Assign tags to the input text using the CNN model
-def tag_cnn_model(text):
-    text = clean_text(text)
-    text = tokenizer.texts_to_sequences([text])
-    text_padded = pad_sequences(text, maxlen=512)
-    q_pred = cnn_model.predict(text_padded)
-    q_pred = (q_pred >= 0.25).astype(int)
-    return binarizer.inverse_transform(q_pred)
-
-
-# Retrieve the keyphrases from the input text using the KeyBERT model
-def retrieve_keyphrases(text, n, ngram_range):
-    keywords = keybert_model.extract_keywords(
-        text,
-        keyphrase_ngram_range=ngram_range,
-        top_n=n,
-        diversity=0.5,
-        use_maxsum=True,
-        use_mmr=True,
-        seed_keywords=[
-            "machine-learning",
-            "r",
-            "regression",
-            "deep-learning",
-            "neural-networks",
-            "data-request",
-            "python",
-            "reinforcement-learning",
-            "classification",
-            "time-series",
-            "probability",
-            "neural-network",
-            "distributions",
-            "bayesian",
-            "hypothesis-testing",
-            "keras",
-            "mathematical-statistics",
-            "scikit-learn",
-            "logistic",
-            "convolutional-neural-networks",
-            "clustering",
-            "tensorflow",
-            "terminology",
-            "nlp",
-            "correlation",
-            "self-study",
-            "normal-distribution",
-            "geospatial",
-            "cross-validation",
-            "optimization",
-            "random-forest",
-            "mixed-model",
-            "data-mining",
-            "feature-selection",
-            "pca",
-            "references",
-            "computer-vision",
-            "data-visualization",
-            "confidence-interval",
-            "generalized-linear-model",
-            "variance",
-            "natural-language-processing",
-            "dataset",
-            "svm",
-            "training",
-            "maximum-likelihood",
-            "statistical-significance",
-            "gradient-descent",
-            "multiple-regression",
-            "estimation",
-        ],
-    )
-    return sorted(keywords, key=lambda x: x[1], reverse=True)
-
-
-# Find the most similar topics for the input text using the BERTopic model
-def output_unsupervised(text, n):
-    new_review = text
-    similar_topics, similarity = bertopic_model.find_topics(new_review, top_n=n)
-    similar_topics = sorted(similar_topics)
-    for i in range(n):
-        tags = bertopic_model.get_topic(similar_topics[i])
-        tags = [tag[0] for tag in tags]
-        tagger_component(f"Tags from cluster {i+1}:", tags, color_name="red")
-
-
-# Display the supervised model page of the app
-def supervised_page():
-    st.header("Supervised Model")
-    text = st.text_area("Enter text to assign tags", height=200, key="supervised_text")
-    text = clean_text(text)
-    if st.button("Assign tags", key="supervised_button"):
-        tags = tag_cnn_model(text)[0]
-        tagger_component("Tags:", tags, color_name="red")
-
-
-# Display the unsupervised (BERTopic) model page of the app
-def unsupervised_page_bertopic():
-    st.header("Unsupervised Model Using BERTopic Model")
-    text = st.text_area(
-        "Enter text to assign tags", height=200, key="unsupervised_text_bertopic"
-    )
-    text = clean_text(text)
-    n = st.number_input(
-        "Enter number of tags to assign", value=5, key="unsupervised_n_bertopic"
-    )
-    if st.button("Assign tags", key="unsupervised_button_bertopic"):
-        output_unsupervised(text, n)
-
-
-# Display the unsupervised (KeyBERT) model page of the app
-def unsupervised_page_keybert():
-    st.header("Unsupervised Model Using KeyBERT Model")
-    text = st.text_area(
-        "Enter text to assign tags", height=200, key="unsupervised_text_keybert"
-    )
-    text = clean_text(text)
-    n = st.number_input(
-        "Enter number of tags to assign", value=10, key="unsupervised_n_keybert"
-    )
-    ngram_range_lower = st.number_input(
-        "Enter lower limit of ngram range",
-        value=1,
-        min_value=1,
-        max_value=6,
-        key="unsupervised_ngram_lower",
-    )
-    ngram_range_upper = st.number_input(
-        "Enter upper limit of ngram range",
-        value=3,
-        min_value=1,
-        max_value=6,
-        key="unsupervised_ngram_upper",
-    )
-    ngram_range = (ngram_range_lower, ngram_range_upper)
-    if st.button("Assign tags", key="unsupervised_button_keybert"):
-        topics = retrieve_keyphrases(text, n, ngram_range)
-        topics = [topic[0] for topic in topics]
-        tagger_component("Tags:", topics, color_name="red")
-
-
-# Display the model selection page of the app
-def model_page():
-    st.title("Select a model to use:")
-    tab1, tab2, tab3 = st.tabs(
-        ["Supervised Using CNN", "Unsupervised-BERTopic", "Unsupervised-KeyBERT"]
-    )
-    with tab1:
-        supervised_page()
-    with tab2:
-        unsupervised_page_bertopic()
-    with tab3:
-        unsupervised_page_keybert()
-
-
-if __name__ == "__main__":
-    model_page()
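For reference, the supervised path above is a standard multi-label setup: Keras tokenizer → pad_sequences(maxlen=512) → per-tag CNN probabilities → a fixed 0.25 cutoff → binarizer.inverse_transform to map the indicator row back to tag names. The commit does not say what bin.pkl contains, so as a sketch assume a fitted scikit-learn MultiLabelBinarizer; the thresholding step then looks like this on toy data:

```python
# Sketch of the thresholding step in tag_cnn_model(), on toy data.
# Assumption: bin.pkl holds a fitted sklearn MultiLabelBinarizer.
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

binarizer = MultiLabelBinarizer()
binarizer.fit([["keras", "nlp", "python"]])  # toy 3-tag label space

q_pred = np.array([[0.91, 0.12, 0.44]])      # fake sigmoid outputs for one document
q_pred = (q_pred >= 0.25).astype(int)        # per-label cutoff, as in the app -> [[1, 0, 1]]
print(binarizer.inverse_transform(q_pred))   # [('keras', 'python')]
```

Also worth noting: use_maxsum and use_mmr select alternative diversification strategies in KeyBERT, so at most one of them takes effect when both flags are set, as in retrieve_keyphrases above.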
pages/3_👋_About.py DELETED
@@ -1,28 +0,0 @@
-import streamlit as st
-
-
-# Display the about page of the app with information about the creator, code, and data
-def about_page():
-    st.header("About")
-    st.write(
-        "This app was created by [Harshit Singh](https://harsh502s.github.io), Poorvi Singh, and Samruddhi Raskar as a part of their MSc Data Science 3rd semester project."
-    )
-    st.write("The code for this app can be found [here]( ).")
-    st.write(
-        "The data on which these models are trained can be found [here](https://www.kaggle.com/datasets/harsh502s/stackexchange-tag-dataset)."
-    )
-    st.subheader("Models used in this app are:")
-    st.write(
-        "1. [BERTopic](https://maartengr.github.io/BERTopic/api/bertopic.html#:~:text=BERTopic%20is%20a%20topic%20modeling,words%20in%20the%20topic%20descriptions.)"
-    )
-    st.write(
-        "2. [KeyBERT](https://maartengr.github.io/KeyBERT/#:~:text=KeyBERT%20is%20a%20minimal%20and,most%20similar%20to%20a%20document.)"
-    )
-    st.write(
-        "3. [CNN](https://www.tensorflow.org/tutorials/text/text_classification_rnn)"
-    )
-
-
-if __name__ == "__main__":
-    about_page()
requirements.txt DELETED
@@ -1,160 +0,0 @@
-absl-py==2.0.0
-altair==5.1.2
-asttokens==2.4.0
-astunparse==1.6.3
-attrs==23.1.0
-backcall==0.2.0
-beautifulsoup4==4.12.2
-bertopic==0.15.0
-blinker==1.6.3
-cachetools==5.3.1
-certifi==2023.7.22
-charset-normalizer==3.3.0
-click==8.1.7
-colorama==0.4.6
-comm==0.1.4
-contourpy==1.1.1
-cycler==0.12.1
-Cython==0.29.36
-debugpy==1.8.0
-decorator==5.1.1
-entrypoints==0.4
-exceptiongroup==1.1.3
-executing==2.0.0
-Faker==19.10.0
-fastjsonschema==2.18.1
-favicon==0.7.0
-filelock==3.12.4
-flatbuffers==23.5.26
-fonttools==4.43.1
-fsspec==2023.9.2
-gast==0.5.4
-gitdb==4.0.10
-GitPython==3.1.37
-google-auth==2.23.3
-google-auth-oauthlib==1.0.0
-google-pasta==0.2.0
-grpcio==1.59.0
-h5py==3.10.0
-hdbscan==0.8.33
-htbuilder==0.6.2
-huggingface-hub==0.17.3
-idna==3.4
-importlib-metadata==6.8.0
-ipykernel==6.25.2
-ipython==8.16.1
-jedi==0.19.1
-Jinja2==3.1.2
-joblib==1.3.2
-jsonschema==4.19.1
-jsonschema-specifications==2023.7.1
-jupyter_client==8.3.1
-jupyter_core==5.3.2
-keras==2.14.0
-keybert==0.8.3
-kiwisolver==1.4.5
-libclang==16.0.6
-llvmlite==0.41.0
-lxml==4.9.3
-Markdown==3.5
-markdown-it-py==3.0.0
-markdownlit==0.0.7
-MarkupSafe==2.1.3
-matplotlib==3.8.0
-matplotlib-inline==0.1.6
-mdurl==0.1.2
-ml-dtypes==0.2.0
-more-itertools==10.1.0
-mpmath==1.3.0
-nbformat==5.9.2
-nest-asyncio==1.5.8
-networkx==3.1
-nltk==3.8.1
-numba==0.58.0
-numpy==1.25.2
-oauthlib==3.2.2
-opt-einsum==3.3.0
-packaging==23.2
-pandas==2.1.1
-parso==0.8.3
-pickleshare==0.7.5
-Pillow==10.0.1
-platformdirs==3.11.0
-plotly==5.17.0
-prompt-toolkit==3.0.39
-protobuf==4.24.4
-psutil==5.9.5
-pure-eval==0.2.2
-pyarrow==13.0.0
-pyasn1==0.5.0
-pyasn1-modules==0.3.0
-pydeck==0.8.1b0
-Pygments==2.16.1
-pymdown-extensions==10.3
-pynndescent==0.5.10
-pyparsing==3.1.1
-python-dateutil==2.8.2
-pytz==2023.3.post1
-pywin32==306
-PyYAML==6.0.1
-pyzmq==25.1.1
-referencing==0.30.2
-regex==2023.10.3
-requests==2.31.0
-requests-oauthlib==1.3.1
-rich==13.6.0
-rpds-py==0.10.4
-rsa==4.9
-safetensors==0.4.0
-scikit-learn==1.2.2
-scipy==1.11.3
-seaborn==0.13.0
-sentence-transformers==2.2.2
-sentencepiece==0.1.99
-six==1.16.0
-smmap==5.0.1
-soupsieve==2.5
-st-annotated-text==4.0.1
-st-pages==0.4.5
-stack-data==0.6.3
-streamlit==1.27.2
-streamlit-camera-input-live==0.2.0
-streamlit-card==0.0.61
-streamlit-embedcode==0.1.2
-streamlit-extras==0.3.4
-streamlit-faker==0.0.2
-streamlit-image-coordinates==0.1.6
-streamlit-keyup==0.2.0
-streamlit-tags==1.2.8
-streamlit-toggle-switch==1.0.2
-streamlit-vertical-slider==1.0.2
-sympy==1.12
-tenacity==8.2.3
-tensorboard==2.14.1
-tensorboard-data-server==0.7.1
-tensorflow==2.14.0
-tensorflow-estimator==2.14.0
-tensorflow-intel==2.14.0
-tensorflow-io-gcs-filesystem==0.31.0
-termcolor==2.3.0
-threadpoolctl==3.2.0
-tokenizers==0.14.1
-toml==0.10.2
-toolz==0.12.0
-torch==2.1.0
-torchvision==0.16.0
-tornado==6.3.3
-tqdm==4.66.1
-traitlets==5.11.2
-transformers==4.34.0
-typing_extensions==4.8.0
-tzdata==2023.3
-tzlocal==5.1
-umap-learn==0.5.4
-urllib3==2.0.6
-validators==0.22.0
-watchdog==3.0.0
-wcwidth==0.2.8
-Werkzeug==3.0.0
-wrapt==1.14.1
-zipp==3.17.0