import json
import os
import pprint
import streamlit as st
import streamlit.components.v1 as components
import requests
from typing import Union
pp = pprint.PrettyPrinter(indent=2)
st.set_page_config(page_title="Gaia Search 🌖🌏", layout="wide")
os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
# Force a light theme by writing a minimal Streamlit config.
with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
    file.write('[theme]\nbase="light"')
# Map the display names shown in the UI to the corpus identifiers used by the backend.
corpus_name_map = {
    "LAION": "laion",
    "ROOTS": "roots",
    "The Pile": "pile",
    "C4": "c4",
}
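# Illustrative sketch only (not part of the original file): the display names above
# are presumably offered in a sidebar selector and mapped back to the backend
# corpus identifier. The widget label and variable names below are assumptions.
#
#   selected = st.sidebar.selectbox("Corpus", list(corpus_name_map.keys()))
#   corpus = corpus_name_map[selected]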
st.sidebar.markdown(
    """
    # Gaia Search 🌖🌏

    A search engine for large-scale textual corpora. Most of the datasets
    included in the tool are based on Common Crawl. By using the tool, you
    are also bound by the Common Crawl terms of use in respect of the
    content contained in the datasets.
    """
)
def process_results(corpus: str, hits: Union[list, dict], highlight_terms: list) -> str:
    """Render search hits for the given corpus as an HTML results page."""
    hit_list = []
    if corpus == "roots":
        # ROOTS hits are grouped by language, so render one block per language.
        result_page_html = ""
        for lang, results_for_lang in hits.items():
            print("Processing language", lang)
            if len(results_for_lang) == 0:
                result_page_html += """
                    No results for language: {}
                """.format(lang)
                continue
            results_for_lang_html = ""
            for result in results_for_lang:
                result_html = format_result(result, highlight_terms)
                results_for_lang_html += result_html
            results_for_lang_html = f"""
                Results for language: {lang}
                {results_for_lang_html}
            """
            result_page_html += results_for_lang_html
        return result_page_html
    # The remaining corpora return a flat list of hits.
    for hit in hits:
        res_head = f"""