ashishraics commited on
Commit
4a6b1d2
1 Parent(s): e2a29e2
.gitignore CHANGED
@@ -0,0 +1 @@
 
 
1
+ venv/
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/MCQ-Generator.iml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyInterpreterInspection" enabled="false" level="WARNING" enabled_by_default="false" />
5
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
6
+ <option name="ignoredPackages">
7
+ <value>
8
+ <list size="1">
9
+ <item index="0" class="java.lang.String" itemvalue="en-core-web-sm" />
10
+ </list>
11
+ </value>
12
+ </option>
13
+ </inspection_tool>
14
+ <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
15
+ <option name="ignoredErrors">
16
+ <list>
17
+ <option value="E302" />
18
+ <option value="E128" />
19
+ <option value="E303" />
20
+ <option value="E265" />
21
+ <option value="E305" />
22
+ <option value="E266" />
23
+ <option value="E501" />
24
+ </list>
25
+ </option>
26
+ </inspection_tool>
27
+ <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
28
+ <option name="ignoredIdentifiers">
29
+ <list>
30
+ <option value="NLP.app.*" />
31
+ <option value="streamlit" />
32
+ <option value="pandas" />
33
+ </list>
34
+ </option>
35
+ </inspection_tool>
36
+ </profile>
37
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/MCQ-Generator.iml" filepath="$PROJECT_DIR$/.idea/MCQ-Generator.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
.streamlit/config.toml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [theme]
2
+ #theme primary
3
+ base="light"
4
+ # Primary accent color for interactive elements.
5
+ #primaryColor =
6
+
7
+ # Background color for the main content area.
8
+ #backgroundColor =
9
+
10
+ # Background color used for the sidebar and most interactive widgets.
11
+ #secondaryBackgroundColor ='grey'
12
+
13
+ # Color used for almost all text.
14
+ #textColor ='blue'
15
+
16
+ # Font family for all text in the app, except code blocks. One of "sans serif", "serif", or "monospace".
17
+ # Default: "sans serif"
18
+ font = "sans serif"
19
+
20
+ # [logger]
21
+ # level='info'
22
+ # messageFormat = "%(message)s"
23
+ #messageFormat="%(asctime)s %(message)s"
24
+
25
+ [global]
26
+
27
+ # By default, Streamlit checks if the Python watchdog module is available and, if not, prints a warning asking for you to install it. The watchdog module is not required, but highly recommended. It improves Streamlit's ability to detect changes to files in your filesystem.
28
+ # If you'd like to turn off this warning, set this to True.
29
+ # Default: false
30
+ disableWatchdogWarning = false
31
+
32
+ # If True, will show a warning when you run a Streamlit-enabled script via "python my_script.py".
33
+ # Default: true
34
+ showWarningOnDirectExecution = false
app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer
3
+ from fastT5 import OnnxT5,get_onnx_runtime_sessions
4
+ from annotated_text import annotated_text
5
+ import nltk
6
+ nltk.download('stopwords')
7
+ nltk.download('wordnet')
8
+ nltk.download('punkt')
9
+ from nltk.corpus import stopwords,wordnet
10
+ from nltk.tokenize import sent_tokenize
11
+ from flashtext import KeywordProcessor
12
+ import regex as re
13
+ import string
14
+ import subprocess
15
+ from PIL import Image
16
+ import logging
17
+ import multiprocessing
18
total_threads = multiprocessing.cpu_count()

# pke is not published on PyPI under this name, so install it from GitHub
# on first use. Catch ImportError specifically: the original bare `except:`
# would also swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
try:
    import pke
    logging.error("importing pke info")
except ImportError:
    logging.error("installing pke info")
    subprocess.run(['pip3', 'install', 'git+https://github.com/boudinfl/pke.git'])
    # NOTE(review): the bare 'en' shortcut was removed in spaCy v3 — confirm
    # whether this should be 'en_core_web_sm' for the installed spaCy version.
    subprocess.run(['python3', '-m', 'spacy', 'download', 'en'])
    import pke
28
+
29
+ st.set_page_config( # Alternate names: setup_page, page, layout
30
+ layout="wide", # Can be "centered" or "wide". In the future also "dashboard", etc.
31
+ initial_sidebar_state="auto", # Can be "auto", "expanded", "collapsed"
32
+ page_title='None', # String or None. Strings get appended with "• Streamlit".
33
+ )
34
+
def set_page_title(title):
    """Set the browser tab title of the running Streamlit app.

    Streamlit has no direct API for changing the tab title at runtime, so
    this injects a zero-height <iframe> whose inline <script> reaches into
    the parent document, rewrites the <title> element, and installs a
    MutationObserver that keeps rewriting it if Streamlit resets it.
    The previous observer (if any) is disconnected first so repeated calls
    do not stack observers.
    """
    # unsafe_allow_html is required because we are injecting raw markup.
    st.sidebar.markdown(unsafe_allow_html=True, body=f"""
        <iframe height=0 srcdoc="<script>
            const title = window.parent.document.querySelector('title') \

            const oldObserver = window.parent.titleObserver
            if (oldObserver) {{
                oldObserver.disconnect()
            }} \

            const newObserver = new MutationObserver(function(mutations) {{
                const target = mutations[0].target
                if (target.text !== '{title}') {{
                    target.text = '{title}'
                }}
            }}) \

            newObserver.observe(title, {{ childList: true }})
            window.parent.titleObserver = newObserver \

            title.text = '{title}'
        </script>" />
    """)


set_page_title('MCQ Generator')
61
+
62
import yaml


def read_yaml(file_path):
    """Parse the YAML file at *file_path* and return its contents."""
    with open(file_path, "r") as handle:
        data = yaml.safe_load(handle)
    return data
66
+
67
# Load model locations from the project config.
config = read_yaml('config.yaml')

t5_chkpt = config['t5_normal']['chkpt']
t5_model_path = config['t5_normal']['model_path']
t5_tokenizer = config['t5_normal']['tokenizer']

# Paths to the three quantized ONNX graphs produced by getModel.py:
# encoder, decoder, and the first-pass "init" decoder.
# (Fixes the 'quanitzed' typo and hoists the repeated checkpoint-name split.)
_model_name = t5_chkpt.split("/")[1]
model_paths_quantized = (
    f'{t5_model_path}/{_model_name}-encoder-quantized.onnx',
    f'{t5_model_path}/{_model_name}-decoder-quantized.onnx',
    f'{t5_model_path}/{_model_name}-init-decoder-quantized.onnx',
)

model_session = get_onnx_runtime_sessions(model_paths=model_paths_quantized, n_threads=1, parallel_exe_mode=True)
model_t5 = OnnxT5(model_or_model_path=t5_chkpt, onnx_model_sessions=model_session)
tokenizer_t5 = AutoTokenizer.from_pretrained(t5_tokenizer)
81
+
82
def create_question_t5(model, tokenizer, context, answer, max_length=64):
    """Generate a question from *context* whose answer is *answer*.

    model: ONNX T5 model exposing .generate().
    tokenizer: matching tokenizer (encodes the prompt, decodes the output).
    max_length: maximum length in tokens of the generated question.
    Returns the decoded question string (special tokens stripped).
    """
    # Renamed from `input`, which shadowed the builtin.
    prompt = "context: %s answer: %s " % (context, answer)
    features = tokenizer([prompt], return_tensors='pt')
    output = model.generate(input_ids=features['input_ids'],
                            attention_mask=features['attention_mask'],
                            max_length=max_length,
                            num_beams=3)
    return tokenizer.decode(output.squeeze(), skip_special_tokens=True)
91
+
92
def create_answers_t5(model, tokenizer, context, question, max_length=128):
    """Generate an answer to *question* from *context*.

    model: ONNX T5 model exposing .generate().
    tokenizer: matching tokenizer (encodes the prompt, decodes the output).
    max_length: maximum length in tokens of the generated answer.
    Returns the decoded answer string (special tokens stripped).
    """
    # Renamed from `input`, which shadowed the builtin.
    prompt = "context: %s question: %s " % (context, question)
    features = tokenizer([prompt], return_tensors='pt')
    output = model.generate(input_ids=features['input_ids'],
                            attention_mask=features['attention_mask'],
                            max_length=max_length,
                            num_beams=3)
    return tokenizer.decode(output.squeeze(), skip_special_tokens=True)
101
+
102
# Sample paragraph pre-filled into the input text area on first load.
default_context = """Another important distinction is between companies that build enterprise products (B2B - business to business) and companies that build customer products (B2C - business to consumer).

B2B companies build products for organizations. Examples of enterprise products are Customer relationship management (CRM) software, project management tools, database management systems, cloud hosting services, etc.

B2C companies build products for individuals. Examples of consumer products are social networks, search engines, ride-sharing services, health trackers, etc.

Many companies do both -- their products can be used by individuals but they also offer plans for enterprise users. For example, Google Drive can be used by anyone but they also have Google Drive for Enterprise.

Even if a B2C company doesn’t create products for enterprises directly, they might still need to sell to enterprises. For example, Facebook’s main product is used by individuals but they sell ads to enterprises. Some might argue that this makes Facebook users products, as famously quipped: “If you’re not paying for it, you’re not the customer; you’re the product being sold.”"""

# Sample answer keyword matching the default context.
default_answer = "companies"
113
+
114
+
115
+
116
input_context = st.text_area(label='Input paragraph', height=300, max_chars=1000, value=default_context)

# Three columns: keyword-based generation, topic-modelling generation (stub), spare.
c1, c2, c3 = st.columns(3)

with c1:
    create_usingkeyword = st.button("Create Questions using Keywords")
    if create_usingkeyword:
        # Deferred import: keywords.py bootstraps pke/nltk, which is slow.
        from keywords import get_noun_adj_verb

        # Extract candidate answer keywords (nouns/adjectives/verbs) from the
        # paragraph. (The previous tokenize_sentence call was unused.)
        keywords_noun_adj_verb = get_noun_adj_verb(input_context)
        t5_questions = []

        # Spinner text typo fixed ("Questionsssss").
        with st.spinner("Creating Questions"):
            for keyword in keywords_noun_adj_verb:
                question = create_question_t5(model=model_t5,
                                              tokenizer=tokenizer_t5,
                                              context=input_context,
                                              answer=keyword)
                # The model emits "question: <text>"; keep only the text part.
                # partition() avoids the IndexError that split(...)[1] raised
                # when the prefix was missing; fall back to the raw output.
                t5_questions.append(question.partition('question:')[2] or question)

        for i, (quest, ans) in enumerate(zip(t5_questions, keywords_noun_adj_verb)):
            st.write(f"{i + 1}: {quest}")
            annotated_text("Answer is ", (ans, '', "#fea"))
            st.markdown("---")

with c2:
    create_usingtopics = st.button("Create Questions using Topic Modelling")
    if create_usingtopics:
        pass  # Not implemented yet.
config.yaml CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ t5_normal:
2
+ chkpt: "mrm8488/t5-base-finetuned-question-generation-ap"
3
+ model_path: "onnx_t5"
4
+ tokenizer: "tokenizer_t5"
getModel.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastT5 import generate_onnx_representation, quantize
import os

chkpt = "mrm8488/t5-base-finetuned-question-generation-ap"
model_path = 'onnx_t5'

# # Step 1. convert huggingfaces t5 model to onnx
# onnx_model_paths = generate_onnx_representation(chkpt,output_path=model_path)
#
# # Step 2. (recommended) quantize the converted model for fast inference and to reduce model size.
# quant_model_paths = quantize(model_path)

# Delete the non-quantized models to save space. Each file is removed
# independently so one missing file does not prevent the others from being
# deleted (the original single try block stopped at the first miss), and
# only OSError is suppressed instead of the bare `except:` that hid
# every failure.
_model_name = chkpt.split("/")[1]
for _part in ('encoder', 'decoder', 'init-decoder'):
    try:
        os.remove(f'{model_path}/{_model_name}-{_part}.onnx')
    except OSError:
        pass
hf_space1.png ADDED
keywords.py CHANGED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ nltk.download('stopwords')
3
+ nltk.download('wordnet')
4
+ nltk.download('punkt')
5
+ from nltk.corpus import stopwords,wordnet
6
+ from nltk.tokenize import sent_tokenize
7
+ from flashtext import KeywordProcessor
8
+ import regex as re
9
+ import string
10
+ import subprocess
11
+ import logging
12
+
13
# pke is not published on PyPI under this name, so install it from GitHub
# on first use. Catch ImportError specifically: the original bare `except:`
# would also swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
try:
    import pke
    logging.error("importing pke info")
except ImportError:
    logging.error("installing pke info")
    subprocess.run(['pip3', 'install', 'git+https://github.com/boudinfl/pke.git'])
    # NOTE(review): the bare 'en' shortcut was removed in spaCy v3 — confirm
    # whether this should be 'en_core_web_sm' for the installed spaCy version.
    subprocess.run(['python3', '-m', 'spacy', 'download', 'en'])
    import pke
21
+
22
+
23
def tokenize_sentence(text):
    """Split *text* into sentences, dropping very short ones.

    Sentences of 20 characters or fewer are discarded as too short to
    yield a meaningful question. Returns a list of stripped sentences.
    """
    sentences = sent_tokenize(text)
    # strip() already removes whitespace from both ends; the original
    # strip().lstrip().rstrip() chain was redundant.
    return [s.strip() for s in sentences if len(s) > 20]
27
+
28
def get_noun_adj_verb(text):
    """Extract up to 5 keyphrases (nouns/verbs/adjectives) from *text*.

    Uses pke's MultipartiteRank to select and weight candidate keyphrases.
    Returns a de-duplicated list of phrase strings (order not guaranteed);
    returns an empty list if extraction fails.
    """
    output = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=text, language='en', normalization=None)
        # Keyphrase candidate POS tags; other options:
        # 'ADP' 'ADV' 'AUX' 'DET' 'NUM' 'PART' 'PROPN' 'PUNCT'
        extractor.candidate_selection(pos={'NOUN', 'VERB', 'ADJ'})
        # Candidate weighting.
        extractor.candidate_weighting(threshold=0.9, method='average', alpha=1.1)
        # Keep the top-5 scored phrases; get_n_best returns (phrase, score) pairs.
        keyphrases = extractor.get_n_best(n=5)
        output = [phrase for phrase, _score in keyphrases]
    except Exception as e:
        # Log instead of print so failures appear in the app logs,
        # consistent with the rest of this module.
        logging.error("found exception %s", e)
    return list(set(output))
47
+
test.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
from transformers.onnx.features import FeaturesManager
import transformers

# List the ONNX export feature names transformers supports for T5 models.
# NOTE(review): get_supported_features_for_model_type is normally called with
# the lower-case model type ("t5") — confirm "T5" is accepted by the installed
# transformers version.
feat = list(FeaturesManager.get_supported_features_for_model_type("T5").keys())
print(feat)