import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import ujson
# Load the JSON file containing scraped results
with open('scraper_results.json', 'r') as doc:
    scraper_results = doc.read()
# Extract author names from the JSON data
authors = []
records = ujson.loads(scraper_results)
for item in records:
    authors.append(item["cu_author"])
# Write the author names to a JSON file
with open('author_names.json', 'w') as f:
    ujson.dump(authors, f)
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
# Load the JSON file containing author names
with open('author_names.json', 'r') as f:
    author_data = f.read()
# Load JSON data
authors = ujson.loads(author_data)
# Preprocess the author names
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
authors_list_first_stem = []
authors_list = []
for author in authors:
    words = word_tokenize(author)
    stem_word = ""
    for word in words:
        if word.lower() not in stop_words:
            stem_word += stemmer.stem(word) + " "
    # Keep the stemmed form and the original name at the same position
    authors_list_first_stem.append(stem_word.strip())
    authors_list.append(author)
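# For example, an entry like "The Running Club" would come out of this step as
# "run club": "the" is dropped as a stopword and the remaining tokens are
# lowercased and stemmed by PorterStemmer. (Illustrative value, not taken from
# the scraped data.)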
# Build an inverted index: stemmed token -> positions of the names containing it
inverted_index = {}
for i, stemmed in enumerate(authors_list_first_stem):
    for word in stemmed.split():
        if word not in inverted_index:
            inverted_index[word] = [i]
        else:
            inverted_index[word].append(i)
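# The index ends up shaped like {"john": [0, 7], "smith": [0, 3]} (positions
# illustrative); each position lines up with the same slot in authors_list.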
# Write the preprocessed author names and indexed dictionary to JSON files
with open('author_list_stemmed.json', 'w') as f:
    ujson.dump(authors_list_first_stem, f)
with open('author_indexed_dictionary.json', 'w') as f:
    ujson.dump(inverted_index, f)
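# A minimal sketch of how the index built above could be queried. The search()
# helper and the sample query are illustrative additions, not part of the
# original pipeline; they reuse the in-memory objects defined above.
def search(query, index, names):
    """Return the original names whose stemmed form contains every
    non-stopword token of the query, using the inverted index."""
    matches = None
    for token in word_tokenize(query):
        if token.lower() in stop_words:
            continue
        positions = set(index.get(stemmer.stem(token), []))
        matches = positions if matches is None else matches & positions
    return [names[i] for i in sorted(matches)] if matches else []

# Example usage (illustrative):
# print(search("Smith", inverted_index, authors_list))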