|
import json |
|
from datetime import datetime |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
|
|
|
|
def read_json(file_name):
    """Load and return the parsed contents of a JSON file.

    The file is always decoded as UTF-8 (the encoding JSON documents are
    expected to use) instead of the platform's locale encoding, so
    non-ASCII text loads identically on every operating system.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The deserialized JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
def truncate_text(text, max_length=40):
    """Shorten ``text`` to at most ``max_length`` characters.

    Text that already fits is returned unchanged. Longer text is cut and
    its last visible character is replaced by a single ellipsis, so the
    result is exactly ``max_length`` characters long.

    Args:
        text: The string to shorten.
        max_length: Maximum length of the returned string.

    Returns:
        ``text`` itself, a truncated copy ending in an ellipsis, or an
        empty string when ``max_length`` is not positive.
    """
    # A non-positive limit cannot hold any character; without this guard the
    # slice below would count from the end of the string instead.
    if max_length <= 0:
        return ""
    if len(text) > max_length:
        # U+2026 HORIZONTAL ELLIPSIS, written as an escape so the marker
        # survives any re-encoding of this source file.
        return text[: max_length - 1] + "\u2026"
    return text
|
|
|
|
|
# Load the resource catalogue and reshape it into the table shown in the UI.
json_file = "awesome-japanese-nlp-resources-search.json"
json_data = read_json(json_file)

# Column-oriented accumulator; the key order here becomes the column order
# of the DataFrame built below.
data = {
    column: []
    for column in (
        "project_name",
        "downloads",
        "stars",
        "description",
        "first_commit",
        "latest_commit",
        "source",
        "languages",
        "type",
    )
}

for entry in json_data:
    link = entry["url"]
    # Descriptions are stored lower-cased; the search lower-cases its query.
    summary = entry["description"].lower()
    name = entry["project_name"]
    origin = entry["source"]
    langs = entry["languages"]
    kind = entry["model_or_dataset"]

    # Commit timestamps are reduced to plain dates; falsy values (missing
    # timestamps) are passed through untouched.
    commit_dates = []
    for key in ("first_commit", "latest_commit"):
        stamp = entry[key]
        if stamp:
            stamp = datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S").date()
        commit_dates.append(stamp)
    first_seen, last_seen = commit_dates

    # Either popularity metric may be absent from a record; store None then.
    data["stars"].append(entry.get("stargazers_count"))
    data["downloads"].append(entry.get("downloads"))

    # The name is rendered as a markdown link to the project URL.
    data["project_name"].append(f"[{truncate_text(name)}]({link})")
    data["source"].append(origin)
    data["description"].append(summary)
    data["languages"].append(langs)
    data["type"].append(kind)
    data["first_commit"].append(first_seen)
    data["latest_commit"].append(last_seen)

data = pd.DataFrame(data)
|
|
|
|
|
def show_search_results(
    language_filter, queries, source_checkbox, show_checkbox
):
    """Filter the module-level ``data`` table for display.

    All row filters are combined into one boolean mask over ``data``, and
    the visible columns are selected once at the end. Hiding a column
    therefore never affects searching, and no column is dropped twice.

    Args:
        language_filter: Language name a row's ``languages`` value must
            contain; a falsy value disables the filter.
        queries: Whitespace-separated search words. Every word must occur,
            case-insensitively and as a literal substring, in the row's
            description or project name.
        source_checkbox: Selected "Source" options. Leaving out "GitHub" or
            "Hugging Face" removes rows from that source together with its
            metric column (``stars`` / ``downloads``). "Dataset" and
            "Model" restrict rows to those types; selecting both keeps
            rows of either type.
        show_checkbox: Names of the columns to include in the result.

    Returns:
        A DataFrame with the matching rows and the requested columns, in
        the column order of ``data``.
    """
    terms = (queries or "").lower().split()
    sources = source_checkbox or []
    visible = show_checkbox or []

    keep = pd.Series(True, index=data.index)

    if language_filter:

        def contains_language(language_list):
            # NOTE(review): assumes ``languages`` holds a container of
            # language names per row; non-container values (e.g. None) are
            # treated as "no match" instead of raising - confirm schema.
            try:
                return language_filter in language_list
            except TypeError:
                return False

        keep &= data["languages"].apply(contains_language).astype(bool)

    # Columns that only make sense for a source that has been filtered out.
    hidden = set()
    if "GitHub" not in sources:
        keep &= data["source"] != "GitHub"
        hidden.add("stars")
    if "Hugging Face" not in sources:
        keep &= data["source"] != "Hugging Face"
        hidden.add("downloads")

    # Type options are alternatives: chaining equality filters would always
    # yield an empty table when both are ticked.
    wanted_types = [
        kind
        for label, kind in (("Dataset", "dataset"), ("Model", "model"))
        if label in sources
    ]
    if wanted_types:
        keep &= data["type"].isin(wanted_types)

    # Descriptions are already stored lower-cased; project names are not,
    # so lower them once here to match the lower-cased search words.
    names_lower = data["project_name"].str.lower()
    for term in terms:
        # regex=False: user input such as "c++" must match literally.
        in_description = data["description"].str.contains(
            term, regex=False, na=False
        )
        in_name = names_lower.str.contains(term, regex=False, na=False)
        keep &= in_description | in_name

    columns = [
        column
        for column in data.columns
        if column in visible and column not in hidden
    ]
    return data.loc[keep, columns]
|
|
|
|
|
# Build the Gradio page: a search box, three filter controls and the result
# table, all wired to show_search_results.
with gr.Blocks() as demo:
    # NOTE(review): the "π" after the title looks like a mis-decoded emoji;
    # it is kept verbatim here - confirm the intended character.
    gr.Markdown(
        """
        # Awesome Japanese NLP resources search π
        You can search for open-source software from [1250+ Japanese NLP repositories](https://github.com/taishi-i/awesome-japanese-nlp-resources).
        """
    )

    query = gr.Textbox(label="Search words", placeholder="llm")

    # NOTE(review): the original indentation was lost, so which components
    # belong inside this Row is reconstructed - confirm the layout.
    with gr.Row():
        languages = [
            "Python",
            "Jupyter Notebook",
            "Java",
            "C++",
            "JavaScript",
            "TypeScript",
            "C#",
            "Rust",
            "Go",
            "C",
            "Kotlin",
            "Ruby",
            "Perl",
        ]
        language_selector = gr.Dropdown(
            label="Programming Language",
            choices=languages,
        )

        source_checkbox = gr.CheckboxGroup(
            ["GitHub", "Hugging Face", "Dataset", "Model"],
            value=["GitHub", "Hugging Face"],
            label="Source",
        )

        show_checkbox = gr.CheckboxGroup(
            [
                "project_name",
                "downloads",
                "stars",
                "description",
                "first_commit",
                "latest_commit",
                "source",
                "type",
                "languages",
            ],
            value=[
                "project_name",
                "downloads",
                "stars",
                "description",
            ],
            label="Display columns in a table",
        )

    df = gr.DataFrame(
        value=data,
        type="pandas",
        datatype="markdown",
        height=600,
    )

    # Every control re-runs the same search with the same inputs and writes
    # the result to the table, so register the listeners in one pass.
    search_inputs = [
        language_selector,
        query,
        source_checkbox,
        show_checkbox,
    ]
    for control in (query, language_selector, source_checkbox, show_checkbox):
        control.change(
            fn=show_search_results,
            inputs=search_inputs,
            outputs=df,
        )

demo.launch()
|
|