import ujson  # Fast JSON parsing/serialization (used for all file I/O below)
import nltk  # NLTK for natural language processing tasks
from nltk.corpus import stopwords  # List of stop words
from nltk.tokenize import word_tokenize  # To tokenize each word
from nltk.stem import PorterStemmer  # Rules to reduce words to their stems
# Preprocessing data before indexing
with open('scraper_results.json', 'r') as doc:
    scraper_results = doc.read()
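# scraper_results.json is expected to be a JSON array of objects with the
# keys used in the loop below ("name", "pub_url", "cu_author", "date"), e.g.:
#   [{"name": "...", "pub_url": "...", "cu_author": "...", "date": "..."}, ...]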
# Initialize empty lists to store publication name, URL, author, and date
pubName = []
pubURL = []
pubCUAuthor = []
pubDate = []
# Load the scraped results using ujson
data_dict = ujson.loads(scraper_results)
# Get the length of data_dict (number of publications)
array_length = len(data_dict)
# Print the number of publications
print(array_length)
# Separate name, URL, author, and date into different files
for item in data_dict:
    pubName.append(item["name"])
    pubURL.append(item["pub_url"])
    pubCUAuthor.append(item["cu_author"])
    pubDate.append(item["date"])
with open('pub_name.json', 'w') as f:
    ujson.dump(pubName, f)
with open('pub_url.json', 'w') as f:
    ujson.dump(pubURL, f)
with open('pub_cu_author.json', 'w') as f:
    ujson.dump(pubCUAuthor, f)
with open('pub_date.json', 'w') as f:
    ujson.dump(pubDate, f)
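# Note: these four files are parallel lists; position i in each refers to the
# same publication, and the inverted index built below stores these positions,
# so search results can be mapped back to name/URL/author/date.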
# Open the file with publication names in read mode
with open('pub_name.json', 'r') as f:
    publication = f.read()
# Load the JSON file
pubName = ujson.loads(publication)
# Download the NLTK resources used below (stopword list and punkt tokenizer)
nltk.download('stopwords')
nltk.download('punkt')
# Use NLTK's predefined English stopwords
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
pub_list_first_stem = []
pub_list = []
pub_list_wo_sc = []
print(len(pubName))
for file in pubName:
    # Split each title string into tokens (words)
    words = word_tokenize(file)
    stem_word = ""
    for i in words:
        if i.lower() not in stop_words:
            stem_word += stemmer.stem(i) + " "
    pub_list_first_stem.append(stem_word)
    pub_list.append(file)
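# Illustration (assumed example title, not from the data): with stopword
# removal and Porter stemming, "The Evolution of Learning Systems" becomes
# "evolut learn system " in pub_list_first_stem.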
# Remove all of the characters below; the em dash and curly quotes are an
# assumed reconstruction of characters that were mis-encoded in the original
special_characters = '''!()-—[]{};:'"\\, <>./?@#$%^&*_~0123456789+=“”'''
for file in pub_list:
    word_wo_sc = ""
    # Single-word titles are kept as-is
    if len(file.split()) == 1:
        pub_list_wo_sc.append(file)
    else:
        # Replace each special character with a space
        for a in file:
            if a in special_characters:
                word_wo_sc += ' '
            else:
                word_wo_sc += a
        pub_list_wo_sc.append(word_wo_sc)
# Stemming process: remove stopwords and stem the cleaned titles
pub_list_stem_wo_sw = []
for name in pub_list_wo_sc:
    words = word_tokenize(name)
    stem_word = ""
    for a in words:
        if a.lower() not in stop_words:
            stem_word += stemmer.stem(a) + ' '
    pub_list_stem_wo_sw.append(stem_word.lower())
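# Two stemmed lists are kept: pub_list_first_stem is built from the raw titles
# (punctuation and digits intact) and is the version saved to
# publication_list_stemmed.json at the end, while pub_list_stem_wo_sw is built
# from the cleaned titles and feeds the inverted index below.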
data_dict = {}  # Inverted index: stemmed term -> list of publication indices
# Indexing process
for a in range(len(pub_list_stem_wo_sw)):
    for b in pub_list_stem_wo_sw[a].split():
        if b not in data_dict:
            data_dict[b] = [a]
        else:
            data_dict[b].append(a)
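# Resulting shape (hypothetical example): {"learn": [0, 4, 17], "network": [4]}
# An index is appended once per occurrence, so a term repeated within one
# title yields duplicate entries for that publication.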
# Print the lengths of the intermediate lists as a sanity check
print(len(pub_list_wo_sc))
print(len(pub_list_stem_wo_sw))
print(len(pub_list_first_stem))
print(len(pub_list))
# with open('publication_list.json', 'w') as f:
#     ujson.dump(pub_list, f)
with open('publication_list_stemmed.json', 'w') as f:
    ujson.dump(pub_list_first_stem, f)
with open('publication_indexed_dictionary.json', 'w') as f:
    ujson.dump(data_dict, f)
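# --- Usage sketch (illustrative; `query` is a hypothetical example term) ---
# A lookup applies the same preprocessing as indexing: lowercase and stem the
# query word, then map the matching indices back to the parallel lists.
query = "learning"
stemmed_query = stemmer.stem(query.lower())
matches = sorted(set(data_dict.get(stemmed_query, [])))  # dedupe repeated hits
for idx in matches:
    print(pubName[idx], pubURL[idx])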