ashishraics commited on
Commit
4a6b1d2
1 Parent(s): e2a29e2
.gitignore CHANGED
@@ -0,0 +1 @@
 
 
1
+ venv/
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/MCQ-Generator.iml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyInterpreterInspection" enabled="false" level="WARNING" enabled_by_default="false" />
5
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
6
+ <option name="ignoredPackages">
7
+ <value>
8
+ <list size="1">
9
+ <item index="0" class="java.lang.String" itemvalue="en-core-web-sm" />
10
+ </list>
11
+ </value>
12
+ </option>
13
+ </inspection_tool>
14
+ <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
15
+ <option name="ignoredErrors">
16
+ <list>
17
+ <option value="E302" />
18
+ <option value="E128" />
19
+ <option value="E303" />
20
+ <option value="E265" />
21
+ <option value="E305" />
22
+ <option value="E266" />
23
+ <option value="E501" />
24
+ </list>
25
+ </option>
26
+ </inspection_tool>
27
+ <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
28
+ <option name="ignoredIdentifiers">
29
+ <list>
30
+ <option value="NLP.app.*" />
31
+ <option value="streamlit" />
32
+ <option value="pandas" />
33
+ </list>
34
+ </option>
35
+ </inspection_tool>
36
+ </profile>
37
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/MCQ-Generator.iml" filepath="$PROJECT_DIR$/.idea/MCQ-Generator.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
.streamlit/config.toml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [theme]
2
+ #theme primary
3
+ base="light"
4
+ # Primary accent color for interactive elements.
5
+ #primaryColor =
6
+
7
+ # Background color for the main content area.
8
+ #backgroundColor =
9
+
10
+ # Background color used for the sidebar and most interactive widgets.
11
+ #secondaryBackgroundColor ='grey'
12
+
13
+ # Color used for almost all text.
14
+ #textColor ='blue'
15
+
16
+ # Font family for all text in the app, except code blocks. One of "sans serif", "serif", or "monospace".
17
+ # Default: "sans serif"
18
+ font = "sans serif"
19
+
20
+ # [logger]
21
+ # level='info'
22
+ # messageFormat = "%(message)s"
23
+ #messageFormat="%(asctime)s %(message)s"
24
+
25
+ [global]
26
+
27
+ # By default, Streamlit checks if the Python watchdog module is available and, if not, prints a warning asking for you to install it. The watchdog module is not required, but highly recommended. It improves Streamlit's ability to detect changes to files in your filesystem.
28
+ # If you'd like to turn off this warning, set this to True.
29
+ # Default: false
30
+ disableWatchdogWarning = false
31
+
32
+ # If True, will show a warning when you run a Streamlit-enabled script via "python my_script.py".
33
+ # Default: true
34
+ showWarningOnDirectExecution = false
app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer
3
+ from fastT5 import OnnxT5,get_onnx_runtime_sessions
4
+ from annotated_text import annotated_text
5
+ import nltk
6
+ nltk.download('stopwords')
7
+ nltk.download('wordnet')
8
+ nltk.download('punkt')
9
+ from nltk.corpus import stopwords,wordnet
10
+ from nltk.tokenize import sent_tokenize
11
+ from flashtext import KeywordProcessor
12
+ import regex as re
13
+ import string
14
+ import subprocess
15
+ from PIL import Image
16
+ import logging
17
+ import multiprocessing
18
total_threads = multiprocessing.cpu_count()

# pke is not published on PyPI under this name, so install it from GitHub
# on first use. Catch ImportError specifically: the original bare `except:`
# would also swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
try:
    import pke
    logging.error("importing pke info")
except ImportError:
    logging.error("installing pke info")
    subprocess.run(['pip3', 'install', 'git+https://github.com/boudinfl/pke.git'])
    # NOTE(review): the bare 'en' shortcut was removed in spaCy v3 — confirm
    # whether this should be 'en_core_web_sm' for the installed spaCy version.
    subprocess.run(['python3', '-m', 'spacy', 'download', 'en'])
    import pke
28
+
29
+ st.set_page_config( # Alternate names: setup_page, page, layout
30
+ layout="wide", # Can be "centered" or "wide". In the future also "dashboard", etc.
31
+ initial_sidebar_state="auto", # Can be "auto", "expanded", "collapsed"
32
+ page_title='None', # String or None. Strings get appended with "• Streamlit".
33
+ )
34
+
def set_page_title(title):
    """Set the browser tab title of the running Streamlit app.

    Streamlit has no direct API for changing the tab title at runtime, so
    this injects a zero-height <iframe> whose inline <script> reaches into
    the parent document, rewrites the <title> element, and installs a
    MutationObserver that keeps rewriting it if Streamlit resets it.
    The previous observer (if any) is disconnected first so repeated calls
    do not stack observers.
    """
    # unsafe_allow_html is required because we are injecting raw markup.
    st.sidebar.markdown(unsafe_allow_html=True, body=f"""
        <iframe height=0 srcdoc="<script>
            const title = window.parent.document.querySelector('title') \

            const oldObserver = window.parent.titleObserver
            if (oldObserver) {{
                oldObserver.disconnect()
            }} \

            const newObserver = new MutationObserver(function(mutations) {{
                const target = mutations[0].target
                if (target.text !== '{title}') {{
                    target.text = '{title}'
                }}
            }}) \

            newObserver.observe(title, {{ childList: true }})
            window.parent.titleObserver = newObserver \

            title.text = '{title}'
        </script>" />
    """)


set_page_title('MCQ Generator')
61
+
62
import yaml


def read_yaml(file_path):
    """Parse the YAML file at *file_path* and return its contents."""
    with open(file_path, "r") as handle:
        data = yaml.safe_load(handle)
    return data
66
+
67
# Load model locations from the project config.
config = read_yaml('config.yaml')

t5_chkpt = config['t5_normal']['chkpt']
t5_model_path = config['t5_normal']['model_path']
t5_tokenizer = config['t5_normal']['tokenizer']

# Paths to the three quantized ONNX graphs produced by getModel.py:
# encoder, decoder, and the first-pass "init" decoder.
# (Fixes the 'quanitzed' typo and hoists the repeated checkpoint-name split.)
_model_name = t5_chkpt.split("/")[1]
model_paths_quantized = (
    f'{t5_model_path}/{_model_name}-encoder-quantized.onnx',
    f'{t5_model_path}/{_model_name}-decoder-quantized.onnx',
    f'{t5_model_path}/{_model_name}-init-decoder-quantized.onnx',
)

model_session = get_onnx_runtime_sessions(model_paths=model_paths_quantized, n_threads=1, parallel_exe_mode=True)
model_t5 = OnnxT5(model_or_model_path=t5_chkpt, onnx_model_sessions=model_session)
tokenizer_t5 = AutoTokenizer.from_pretrained(t5_tokenizer)
81
+
82
def create_question_t5(model, tokenizer, context, answer, max_length=64):
    """Generate a question from *context* whose answer is *answer*.

    model: ONNX T5 model exposing .generate().
    tokenizer: matching tokenizer (encodes the prompt, decodes the output).
    max_length: maximum length in tokens of the generated question.
    Returns the decoded question string (special tokens stripped).
    """
    # Renamed from `input`, which shadowed the builtin.
    prompt = "context: %s answer: %s " % (context, answer)
    features = tokenizer([prompt], return_tensors='pt')
    output = model.generate(input_ids=features['input_ids'],
                            attention_mask=features['attention_mask'],
                            max_length=max_length,
                            num_beams=3)
    return tokenizer.decode(output.squeeze(), skip_special_tokens=True)
91
+
92
def create_answers_t5(model, tokenizer, context, question, max_length=128):
    """Generate an answer to *question* from *context*.

    model: ONNX T5 model exposing .generate().
    tokenizer: matching tokenizer (encodes the prompt, decodes the output).
    max_length: maximum length in tokens of the generated answer.
    Returns the decoded answer string (special tokens stripped).
    """
    # Renamed from `input`, which shadowed the builtin.
    prompt = "context: %s question: %s " % (context, question)
    features = tokenizer([prompt], return_tensors='pt')
    output = model.generate(input_ids=features['input_ids'],
                            attention_mask=features['attention_mask'],
                            max_length=max_length,
                            num_beams=3)
    return tokenizer.decode(output.squeeze(), skip_special_tokens=True)
101
+
102
# Sample paragraph pre-filled into the input text area on first load.
default_context = """Another important distinction is between companies that build enterprise products (B2B - business to business) and companies that build customer products (B2C - business to consumer).

B2B companies build products for organizations. Examples of enterprise products are Customer relationship management (CRM) software, project management tools, database management systems, cloud hosting services, etc.

B2C companies build products for individuals. Examples of consumer products are social networks, search engines, ride-sharing services, health trackers, etc.

Many companies do both -- their products can be used by individuals but they also offer plans for enterprise users. For example, Google Drive can be used by anyone but they also have Google Drive for Enterprise.

Even if a B2C company doesn’t create products for enterprises directly, they might still need to sell to enterprises. For example, Facebook’s main product is used by individuals but they sell ads to enterprises. Some might argue that this makes Facebook users products, as famously quipped: “If you’re not paying for it, you’re not the customer; you’re the product being sold.”"""

# Sample answer keyword matching the default context.
default_answer = "companies"
113
+
114
+
115
+
116
input_context = st.text_area(label='Input paragraph', height=300, max_chars=1000, value=default_context)

# Three columns: keyword-based generation, topic-modelling generation (stub), spare.
c1, c2, c3 = st.columns(3)

with c1:
    create_usingkeyword = st.button("Create Questions using Keywords")
    if create_usingkeyword:
        # Deferred import: keywords.py bootstraps pke/nltk, which is slow.
        from keywords import get_noun_adj_verb

        # Extract candidate answer keywords (nouns/adjectives/verbs) from the
        # paragraph. (The previous tokenize_sentence call was unused.)
        keywords_noun_adj_verb = get_noun_adj_verb(input_context)
        t5_questions = []

        # Spinner text typo fixed ("Questionsssss").
        with st.spinner("Creating Questions"):
            for keyword in keywords_noun_adj_verb:
                question = create_question_t5(model=model_t5,
                                              tokenizer=tokenizer_t5,
                                              context=input_context,
                                              answer=keyword)
                # The model emits "question: <text>"; keep only the text part.
                # partition() avoids the IndexError that split(...)[1] raised
                # when the prefix was missing; fall back to the raw output.
                t5_questions.append(question.partition('question:')[2] or question)

        for i, (quest, ans) in enumerate(zip(t5_questions, keywords_noun_adj_verb)):
            st.write(f"{i + 1}: {quest}")
            annotated_text("Answer is ", (ans, '', "#fea"))
            st.markdown("---")

with c2:
    create_usingtopics = st.button("Create Questions using Topic Modelling")
    if create_usingtopics:
        pass  # Not implemented yet.
config.yaml CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ t5_normal:
2
+ chkpt: "mrm8488/t5-base-finetuned-question-generation-ap"
3
+ model_path: "onnx_t5"
4
+ tokenizer: "tokenizer_t5"
getModel.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastT5 import generate_onnx_representation, quantize
import os

chkpt = "mrm8488/t5-base-finetuned-question-generation-ap"
model_path = 'onnx_t5'

# # Step 1. convert huggingfaces t5 model to onnx
# onnx_model_paths = generate_onnx_representation(chkpt,output_path=model_path)
#
# # Step 2. (recommended) quantize the converted model for fast inference and to reduce model size.
# quant_model_paths = quantize(model_path)

# Delete the non-quantized models to save space. Each file is removed
# independently so one missing file does not prevent the others from being
# deleted (the original single try block stopped at the first miss), and
# only OSError is suppressed instead of the bare `except:` that hid
# every failure.
_model_name = chkpt.split("/")[1]
for _part in ('encoder', 'decoder', 'init-decoder'):
    try:
        os.remove(f'{model_path}/{_model_name}-{_part}.onnx')
    except OSError:
        pass
hf_space1.png ADDED
keywords.py CHANGED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ nltk.download('stopwords')
3
+ nltk.download('wordnet')
4
+ nltk.download('punkt')
5
+ from nltk.corpus import stopwords,wordnet
6
+ from nltk.tokenize import sent_tokenize
7
+ from flashtext import KeywordProcessor
8
+ import regex as re
9
+ import string
10
+ import subprocess
11
+ import logging
12
+
13
# pke is not published on PyPI under this name, so install it from GitHub
# on first use. Catch ImportError specifically: the original bare `except:`
# would also swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
try:
    import pke
    logging.error("importing pke info")
except ImportError:
    logging.error("installing pke info")
    subprocess.run(['pip3', 'install', 'git+https://github.com/boudinfl/pke.git'])
    # NOTE(review): the bare 'en' shortcut was removed in spaCy v3 — confirm
    # whether this should be 'en_core_web_sm' for the installed spaCy version.
    subprocess.run(['python3', '-m', 'spacy', 'download', 'en'])
    import pke
21
+
22
+
23
def tokenize_sentence(text):
    """Split *text* into sentences, dropping very short ones.

    Sentences of 20 characters or fewer are discarded as too short to
    yield a meaningful question. Returns a list of stripped sentences.
    """
    sentences = sent_tokenize(text)
    # strip() already removes whitespace from both ends; the original
    # strip().lstrip().rstrip() chain was redundant.
    return [s.strip() for s in sentences if len(s) > 20]
27
+
28
def get_noun_adj_verb(text):
    """Extract up to 5 keyphrases (nouns/verbs/adjectives) from *text*.

    Uses pke's MultipartiteRank to select and weight candidate keyphrases.
    Returns a de-duplicated list of phrase strings (order not guaranteed);
    returns an empty list if extraction fails.
    """
    output = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=text, language='en', normalization=None)
        # Keyphrase candidate POS tags; other options:
        # 'ADP' 'ADV' 'AUX' 'DET' 'NUM' 'PART' 'PROPN' 'PUNCT'
        extractor.candidate_selection(pos={'NOUN', 'VERB', 'ADJ'})
        # Candidate weighting.
        extractor.candidate_weighting(threshold=0.9, method='average', alpha=1.1)
        # Keep the top-5 scored phrases; get_n_best returns (phrase, score) pairs.
        keyphrases = extractor.get_n_best(n=5)
        output = [phrase for phrase, _score in keyphrases]
    except Exception as e:
        # Log instead of print so failures appear in the app logs,
        # consistent with the rest of this module.
        logging.error("found exception %s", e)
    return list(set(output))
47
+
test.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
from transformers.onnx.features import FeaturesManager
import transformers

# List the ONNX export feature names transformers supports for T5 models.
# NOTE(review): get_supported_features_for_model_type is normally called with
# the lower-case model type ("t5") — confirm "T5" is accepted by the installed
# transformers version.
feat = list(FeaturesManager.get_supported_features_for_model_type("T5").keys())
print(feat)