# Pipeline: extract author names from scraped JSON, preprocess them with
# NLTK (stop-word removal + Porter stemming), and build an inverted index.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import ujson

# --- Load the scraped results and extract author names ---
# Parse straight from the file handle instead of reading the whole file
# into a string and calling loads() on it.
with open('scraper_results.json', 'r', encoding='utf-8') as doc:
    data_dict = ujson.load(doc)

# Every scraped record is expected to carry a "cu_author" key; a record
# without it raises KeyError here (fail fast on malformed input).
authors = [item["cu_author"] for item in data_dict]

# Persist the extracted names for the preprocessing stage below.
with open('author_names.json', 'w', encoding='utf-8') as f:
    ujson.dump(authors, f)
# --- Download the NLTK resources this script actually uses ---
# 'stopwords' feeds the stop-word filter and 'punkt' backs word_tokenize.
# The previous version additionally downloaded 'all', which fetches every
# NLTK corpus and model (gigabytes) and is redundant here.
nltk.download('stopwords')
nltk.download('punkt')
# --- Reload the author names and preprocess them ---
with open('author_names.json', 'r', encoding='utf-8') as f:
    authors = ujson.load(f)

# stopwords.words() returns a list; convert to a set so each membership
# test in the tokenizing loop is O(1) instead of O(n).
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

authors_list = []             # original author strings, order preserved
authors_list_first_stem = []  # stemmed, stop-word-free form of each name

for author in authors:
    # Tokenize, drop stop words (case-insensitive), stem the remainder.
    stems = [
        stemmer.stem(token)
        for token in word_tokenize(author)
        if token.lower() not in stop_words
    ]
    # join() yields the same tokens as the old `+= stem + " "` loop but
    # without the quadratic concatenation and the stray trailing space.
    authors_list_first_stem.append(" ".join(stems))
    authors_list.append(author)
# --- Build an inverted index: stemmed token -> positions of authors using it ---
data_dict = {}
for idx, stemmed_name in enumerate(authors_list_first_stem):
    for word in stemmed_name.split():
        # setdefault replaces the explicit "in dict" test plus two branches.
        data_dict.setdefault(word, []).append(idx)

# --- Persist the preprocessed names and the inverted index ---
with open('author_list_stemmed.json', 'w', encoding='utf-8') as f:
    ujson.dump(authors_list_first_stem, f)

with open('author_indexed_dictionary.json', 'w', encoding='utf-8') as f:
    ujson.dump(data_dict, f)