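"""LLM development support tools (Gradio app).

Two tabs:
- ELYZA-tasks-100(-TV) self-evaluation: upload a JSONL file, optionally
  self-evaluate with a Gemini API key, and download HTML/CSV reports.
- Token visualization: compare how different tokenizers split input text.
"""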
import csv
import html
import json
import os
import tempfile

import gradio as gr
from transformers import AutoTokenizer

from utils import evaluate, report

def process_jsonl_file(jsonl_file_path: str, api_key: str):
    """Build an HTML report (and a CSV export) from an uploaded JSONL file.

    If a Gemini API key is provided, the tasks are self-evaluated first.
    Returns (html_path, csv_path, message); on failure both paths are None
    and the message carries the error text.
    """
    try:
        with open(jsonl_file_path, "r", encoding="utf-8") as f:
            json_data = [json.loads(line) for line in f if line.strip()]
        if api_key:
            json_data = evaluate(json_data, api_key)
        html_content = report(tasks=json_data)
        file_name_with_ext = os.path.basename(jsonl_file_path)
        file_name, _ = os.path.splitext(file_name_with_ext)
        # Write the HTML report to a named temporary file so Gradio can serve it.
        with tempfile.NamedTemporaryFile(
            delete=False,
            prefix=f"{file_name}-report-",
            suffix=".html",
            mode="w",
            encoding="utf-8",
        ) as temp_file:
            temp_file.write(html_content)
            output_file = temp_file.name
        # Export the same records as CSV, using the first record's keys as the header.
        keys = json_data[0].keys()
        with tempfile.NamedTemporaryFile(
            delete=False,
            prefix=f"{file_name}-report-",
            suffix=".csv",
            mode="w",
            encoding="utf-8",
            newline="",
        ) as temp_file:
            dict_writer = csv.DictWriter(temp_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(json_data)
            output_csv = temp_file.name
        return output_file, output_csv, ""
    except Exception as e:
        return None, None, str(e)
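
# A minimal sketch of driving the pipeline without the UI. The keys in the
# JSONL line are hypothetical; the real schema is whatever utils.evaluate /
# utils.report expect (one JSON object per line).
#
#   with open("tasks.jsonl", "w", encoding="utf-8") as f:
#       f.write('{"input": "...", "output": "..."}\n')
#   html_path, csv_path, message = process_jsonl_file("tasks.jsonl", api_key="")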
# Gradio demo: self-evaluation report tab
with gr.Blocks() as reporting:
    jsonl_input = gr.File(label="Upload a JSONL file")
    api_key_input = gr.Textbox(
        label="Gemini API key (for self-evaluating scores)", type="password"
    )
    gr.Markdown("Get an API key [here](https://aistudio.google.com/app/apikey)")
    process_button = gr.Button("Generate report")
    output_file = gr.File(label="Self-evaluation report (HTML)")
    output_csv = gr.File(label="Self-evaluation report (CSV)")
    output_text = gr.Textbox(label="System message")
    process_button.click(
        process_jsonl_file,
        inputs=[jsonl_input, api_key_input],
        outputs=[output_file, output_csv, output_text],
    )
# Tokenizers for the visualization tab. The Gemma-2 tokenizer is loaded
# best-effort: access to google/gemma-2-2b is gated on Hugging Face, so the
# download may fail without credentials.
llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
gemma_2 = "google/gemma-2-2b"
llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
tokenizers = {
    "LLM-JP-3": llm_jp_3_tokenizer,
}
try:
    gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)
    tokenizers["Gemma-2"] = gemma_2_tokenizer
except OSError as e:
    # Keep going with LLM-JP-3 only if Gemma-2 is unavailable.
    print(e)
tokenizer_names = list(tokenizers.keys())
def tokenize_text(text: str, tokenizer_name: str):
    """Tokenize `text` and return HTML with each token highlighted in a rotating color."""
    tokenizer = tokenizers[tokenizer_name]
    tokens = tokenizer.tokenize(text)
    colors = ["#FFCCCC", "#CCFFCC", "#CCCCFF", "#FFFFCC", "#CCFFFF", "#FFCCFF"]
    # Escape each token so markup-like tokens (e.g. "<s>") render as text.
    tokenized_text = "".join(
        f'<span style="background-color:{colors[i % len(colors)]}">{html.escape(token)}</span> '
        for i, token in enumerate(tokens)
    )
    token_count = len(tokens)
    return f"<p>{tokenized_text}</p><p>Token Count: {token_count}</p>"
with gr.Blocks() as tokenization:
    with gr.Row():
        tokenizer_dropdown = gr.Dropdown(
            label="Select a tokenizer", choices=tokenizer_names, value=tokenizer_names[0]
        )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input Text")
        with gr.Column():
            tokenized_output = gr.HTML(
                tokenize_text("", tokenizer_names[0]), label="Tokenized Output"
            )
    # Re-tokenize whenever the tokenizer choice or the input text changes.
    tokenizer_dropdown.change(
        tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
    )
    text_input.change(
        tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
    )
tabbed = gr.TabbedInterface(
    [reporting, tokenization],
    tab_names=["ELYZA-tasks-100(-TV) Self-Evaluation", "Token Visualization"],
    title="LLM Development Support Tools",
)

if __name__ == "__main__":
    tabbed.launch()