import streamlit as st
from PIL import Image
import ujson
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

# Set up the NLTK components
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tfidf = TfidfVectorizer()

# Load the pre-built data: the *_list_stemmed.json files hold the stemmed
# documents, the *_indexed_dictionary.json files are inverted indexes
# (stemmed token -> list of document ids), and the rest hold display metadata.
with open('publication_list_stemmed.json', 'r') as f:
    pub_list_first_stem = ujson.load(f)
with open('publication_indexed_dictionary.json', 'r') as f:
    pub_index = ujson.load(f)
with open('author_list_stemmed.json', 'r') as f:
    author_list_first_stem = ujson.load(f)
with open('author_indexed_dictionary.json', 'r') as f:
    author_index = ujson.load(f)
with open('author_names.json', 'r') as f:
    author_name = ujson.load(f)
with open('pub_name.json', 'r') as f:
    pub_name = ujson.load(f)
with open('pub_url.json', 'r') as f:
    pub_url = ujson.load(f)
with open('pub_cu_author.json', 'r') as f:
    pub_cu_author = ujson.load(f)
with open('pub_date.json', 'r') as f:
    pub_date = ujson.load(f)


def search_data(input_text, operator_val, search_type):
    """Search the publication or author index.

    operator_val 1 = 'Exact' (documents should match several terms),
    operator_val 2 = 'Relevant' (documents may match any term).
    """
    output_data = {}
    input_text = input_text.lower().split()
    if len(input_text) < 2:
        st.warning("Please enter at least 2 words to apply the operator.")
        return output_data

    if operator_val == 2:
        # Relevant operator (OR): score each term independently and
        # accumulate the results across terms.
        pointer = []
        for token in input_text:
            stem_temp = ""
            stem_word_file = []
            temp_file = []
            word_list = word_tokenize(token)
            for x in word_list:
                if x not in stop_words:
                    stem_temp += stemmer.stem(x) + " "
            stem_word_file.append(stem_temp)

            # Look up the stemmed term in the inverted index
            if search_type == "publication" and pub_index.get(stem_word_file[0].strip()):
                pointer = pub_index.get(stem_word_file[0].strip())
            elif search_type == "author" and author_index.get(stem_word_file[0].strip()):
                pointer = author_index.get(stem_word_file[0].strip())

            if len(pointer) == 0:
                output_data = {}
            else:
                for j in pointer:
                    if search_type == "publication":
                        temp_file.append(pub_list_first_stem[j])
                    elif search_type == "author":
                        temp_file.append(author_list_first_stem[j])
                # Rank the candidate documents against the stemmed term
                temp_file = tfidf.fit_transform(temp_file)
                cosine_output = cosine_similarity(temp_file, tfidf.transform(stem_word_file))
                for j in pointer:
                    output_data[j] = cosine_output[pointer.index(j)]
    else:
        # Exact operator (AND): keep only the documents that are matched
        # by more than one of the query terms.
        pointer = []
        match_word = []
        for token in input_text:
            temp_file = []
            set2 = set()
            stem_word_file = []
            word_list = word_tokenize(token)
            stem_temp = ""
            for x in word_list:
                if x not in stop_words:
                    stem_temp += stemmer.stem(x) + " "
            stem_word_file.append(stem_temp)

            set1 = set()  # ids matching the current term
            if search_type == "publication" and pub_index.get(stem_word_file[0].strip()):
                set1 = set(pub_index.get(stem_word_file[0].strip()))
                pointer.extend(list(set1))
            elif search_type == "author" and author_index.get(stem_word_file[0].strip()):
                set1 = set(author_index.get(stem_word_file[0].strip()))
                pointer.extend(list(set1))

            # `z in set2` is only True on a repeat sighting (set2.add records
            # the first one), so the comprehension keeps the ids that occur
            # for more than one term.
            if not match_word:
                match_word = list({z for z in pointer if z in set2 or (set2.add(z) or False)})
            else:
                match_word.extend(list(set1))
                match_word = list({z for z in match_word if z in set2 or (set2.add(z) or False)})
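
        # After every term is processed, deduplicate the surviving ids and
        # rank them with TF-IDF cosine similarity against the stemmed query.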
        match_word = list(set(match_word))
        if len(match_word) == 0:
            output_data = {}
        else:
            for j in match_word:
                if search_type == "publication":
                    temp_file.append(pub_list_first_stem[j])
                elif search_type == "author":
                    temp_file.append(author_list_first_stem[j])
            temp_file = tfidf.fit_transform(temp_file)
            cosine_output = cosine_similarity(temp_file, tfidf.transform(stem_word_file))
            for j in match_word:
                output_data[j] = cosine_output[match_word.index(j)]

    return output_data


def app():
    # Load the banner image and display it
    image = Image.open('Understanding-the-Financial-Aid-Process-Banner.jpg')
    st.image(image)

    # Add a text description
    st.markdown(

        "Uncover the brilliance: Explore profiles, groundbreaking work, and "
        "cutting-edge research by the exceptional minds of Fordham University.",

", unsafe_allow_html=True) input_text = st.text_input("Search research:", key="query_input") operator_val = st.radio( "Search Filters", ['Exact', 'Relevant'], index=1, key="operator_input", horizontal=True, ) search_type = st.radio( "Search in:", ['Publications', 'Authors'], index=0, key="search_type_input", horizontal=True, ) if st.button("SEARCH"): if search_type == "Publications": output_data = search_data(input_text, 1 if operator_val == 'Exact' else 2, "publication") elif search_type == "Authors": output_data = search_data(input_text, 1 if operator_val == 'Exact' else 2, "author") else: output_data = {} # Display the search results show_results(output_data, search_type) st.markdown("

        "Brought to you by Boakye I Ababio | Data © Fordham University",

", unsafe_allow_html=True) def show_results(output_data, search_type): aa = 0 rank_sorting = sorted(output_data.items(), key=lambda z: z[1], reverse=True) # Show the total number of research results st.info(f"Showing results for: {len(rank_sorting)}") # Show the cards N_cards_per_row = 3 for n_row, (id_val, ranking) in enumerate(rank_sorting): i = n_row % N_cards_per_row if i == 0: st.write("---") cols = st.columns(N_cards_per_row, gap="large") # Draw the card with cols[n_row % N_cards_per_row]: if search_type == "Publications": st.caption(f"{pub_date[id_val].strip()}") st.markdown(f"**{pub_cu_author[id_val].strip()}**") st.markdown(f"*{pub_name[id_val].strip()}*") st.markdown(f"**{pub_url[id_val]}**") elif search_type == "Authors": st.caption(f"{pub_date[id_val].strip()}") st.markdown(f"**{author_name[id_val].strip()}**") st.markdown(f"*{pub_name[id_val].strip()}*") st.markdown(f"**{pub_url[id_val]}**") st.markdown(f"Ranking: {ranking[0]:.2f}") aa += 1 if aa == 0: st.info("No results found. Please try again.") else: st.info(f"Results shown for: {aa}") if __name__ == '__main__': app()