# taishi-i's picture
# update awesome-japanese-nlp-resources-search.json
# 7aaa11c
import json
from datetime import datetime
import gradio as gr
import pandas as pd
def read_json(file_name):
    """Load a JSON file and return its decoded contents.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The decoded document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which would fail or garble non-ASCII text on non-UTF-8 locales.
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)
def truncate_text(text, max_length=40):
    """Return *text*, shortened with a trailing ellipsis when it is too long.

    Text whose length does not exceed *max_length* is returned unchanged.
    Longer text is cut to its first ``max_length - 1`` characters and "…" is
    appended.
    """
    fits = len(text) <= max_length
    return text if fits else text[: max_length - 1] + "…"
json_file = "awesome-japanese-nlp-resources-search.json"
json_data = read_json(json_file)

# Layout of the commit timestamps parsed below.
_COMMIT_FORMAT = "%Y-%m-%d %H:%M:%S"


def _commit_date(timestamp):
    """Convert a commit timestamp string to a date; falsy values pass through."""
    if not timestamp:
        return timestamp
    return datetime.strptime(timestamp, _COMMIT_FORMAT).date()


# One list per table column; the key order fixes the column order of the
# resulting DataFrame.
_columns = {
    name: []
    for name in (
        "project_name",
        "downloads",
        "stars",
        "description",
        "first_commit",
        "latest_commit",
        "source",
        "languages",
        "type",
    )
}

for entry in json_data:
    link_label = truncate_text(entry["project_name"])
    row = {
        # Rendered as a markdown link: shortened project name -> project URL.
        "project_name": f"[{link_label}]({entry['url']})",
        # These two counters are optional in the JSON; absent keys become None.
        "downloads": entry.get("downloads"),
        "stars": entry.get("stargazers_count"),
        # Stored lower-cased (the search lower-cases its terms as well).
        "description": entry["description"].lower(),
        "first_commit": _commit_date(entry["first_commit"]),
        "latest_commit": _commit_date(entry["latest_commit"]),
        "source": entry["source"],
        "languages": entry["languages"],
        "type": entry["model_or_dataset"],
    }
    for name, value in row.items():
        _columns[name].append(value)

data = pd.DataFrame(_columns)
def show_search_results(
    language_filter, queries, source_checkbox, show_checkbox
):
    """Filter the module-level ``data`` table for display.

    Args:
        language_filter: Language name that must be contained in a row's
            ``languages`` value. A falsy value disables this filter.
        queries: Whitespace-separated search terms. Every term must occur as
            a literal, case-insensitive substring of the row's
            ``description`` or ``project_name``.
        source_checkbox: Selected "Source" options. When "GitHub" is not
            selected, GitHub rows and the ``stars`` column are removed; when
            "Hugging Face" is not selected, Hugging Face rows and the
            ``downloads`` column are removed. "Dataset" / "Model" restrict
            rows to that ``type``; selecting both keeps either type.
        show_checkbox: Names of the columns to keep in the result.

    Returns:
        A ``pandas.DataFrame`` with the matching rows and the kept columns
        (in the table's original column order).
    """
    terms = (queries or "").lower().split()
    source_checkbox = source_checkbox or []
    show_checkbox = show_checkbox or []

    df_search = data

    if language_filter:

        def contains_language(language_list, filter_lang):
            # Rows without a language collection (e.g. None) simply do not
            # match instead of raising TypeError on the ``in`` test.
            return (
                isinstance(language_list, (list, tuple, set, dict, str))
                and filter_lang in language_list
            )

        matches = df_search["languages"].apply(
            contains_language, filter_lang=language_filter
        )
        # astype(bool) keeps the mask boolean even when no rows are left.
        df_search = df_search[matches.astype(bool)]

    # Source filters. The per-source metric columns are only recorded here
    # and removed once at the end, so a column is never dropped twice (the
    # second drop used to raise KeyError).
    hidden_columns = set()
    if "GitHub" not in source_checkbox:
        df_search = df_search[df_search["source"] != "GitHub"]
        hidden_columns.add("stars")
    if "Hugging Face" not in source_checkbox:
        df_search = df_search[df_search["source"] != "Hugging Face"]
        hidden_columns.add("downloads")

    # Type filters are combined as a union: applying "dataset" and "model"
    # one after the other would always leave an empty table.
    wanted_types = [
        type_name
        for option, type_name in (("Dataset", "dataset"), ("Model", "model"))
        if option in source_checkbox
    ]
    if wanted_types:
        df_search = df_search[df_search["type"].isin(wanted_types)]

    # Text search runs before any column is removed, so the masks are built
    # from (and aligned with) the rows that are still present.
    for term in terms:
        # regex=False: user input such as "c++" or "(" is plain text, not a
        # pattern. case=False: project names keep their original casing.
        in_description = df_search["description"].str.contains(
            term, case=False, regex=False, na=False
        )
        in_project_name = df_search["project_name"].str.contains(
            term, case=False, regex=False, na=False
        )
        df_search = df_search[in_description | in_project_name]

    # Column visibility.
    dropped = [
        column
        for column in df_search.columns
        if column in hidden_columns or column not in show_checkbox
    ]
    return df_search.drop(columns=dropped)
# Search UI: a query box, language/source filters, a column chooser and the
# result table. Every input re-runs show_search_results on change.
# NOTE(review): the indentation of this section was not recoverable from the
# reviewed copy; the language dropdown and the source checkboxes are assumed
# to share the row - confirm against the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# Awesome Japanese NLP resources search 🔎
You can search for open-source software from [1250+ Japanese NLP repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources).
"""
    )

    query = gr.Textbox(label="Search words", placeholder="llm")

    # Choices offered by the "Programming Language" dropdown.
    languages = [
        "Python",
        "Jupyter Notebook",
        "Java",
        "C++",
        "JavaScript",
        "TypeScript",
        "C#",
        "Rust",
        "Go",
        "C",
        "Kotlin",
        "Ruby",
        "Perl",
    ]
    default_sources = ["GitHub", "Hugging Face"]
    default_columns = [
        "project_name",
        "downloads",
        "stars",
        "description",
    ]

    with gr.Row():
        language_selector = gr.Dropdown(
            label="Programming Language",
            choices=languages,
        )

        source_checkbox = gr.CheckboxGroup(
            ["GitHub", "Hugging Face", "Dataset", "Model"],
            value=default_sources,
            label="Source",
        )

    show_checkbox = gr.CheckboxGroup(
        [
            "project_name",
            "downloads",
            "stars",
            "description",
            "first_commit",
            "latest_commit",
            "source",
            "type",
            "languages",
        ],
        value=default_columns,
        label="Display columns in a table",
    )

    df = gr.DataFrame(
        # Start from the same view the default selections produce, so the
        # table matches the checkboxes before the first change event.
        value=show_search_results(None, "", default_sources, default_columns),
        type="pandas",
        datatype="markdown",
        height=600,
    )

    # All four inputs feed the same callback with the same argument list.
    search_inputs = [
        language_selector,
        query,
        source_checkbox,
        show_checkbox,
    ]
    for component in (
        query,
        language_selector,
        source_checkbox,
        show_checkbox,
    ):
        component.change(
            fn=show_search_results,
            inputs=search_inputs,
            outputs=df,
        )

demo.launch()