"""Gradio app with two tabs: a self-evaluation report builder and a token visualizer."""

import csv
import html
import json
import os
import tempfile

import gradio as gr
from transformers import AutoTokenizer

from utils import evaluate, report

# https://x.com/abidlabs/status/1721548226250371264/photo/1
# https://github.com/gradio-app/gradio/issues/5954
# NOTE(review): this head snippet is whitespace-only as it stands, while
# ga_load below only pushes into window.dataLayer. Presumably the gtag.js
# loader tag belongs in this string -- confirm against the deployed app.
ga_script = """ """
ga_load = """
function() {
    window.dataLayer = window.dataLayer || [];
    function gtag(){dataLayer.push(arguments);}
    gtag('js', new Date());
    gtag('config', 'G-0SHLFV3PV0');
}
"""

# Background colors cycled over consecutive tokens in the visualizer.
_TOKEN_COLORS = ["#FFCCCC", "#CCFFCC", "#CCCCFF", "#FFFFCC", "#CCFFFF", "#FFCCFF"]


def _read_jsonl(path: str) -> list:
    """Parse every non-blank line of the UTF-8 file at *path* as JSON."""
    with open(path, "r", encoding="utf-8") as source:
        # Blank lines (typically a trailing newline) are not records.
        return [json.loads(line) for line in source if line.strip()]


def _open_temp_report(stem: str, suffix: str, newline=None):
    """Open a persistent UTF-8 temp file named ``{stem}-report-*{suffix}`` for writing."""
    return tempfile.NamedTemporaryFile(
        delete=False,  # the path is handed to Gradio after this function returns
        prefix=f"{stem}-report-",
        suffix=suffix,
        mode="w",
        encoding="utf-8",
        newline=newline,
    )


def process_jsonl_file(jsonl_file_path: str, api_key: str):
    """Build the HTML and CSV reports for an uploaded JSONL file.

    Each non-blank line of the file is parsed as one JSON record. When
    *api_key* is non-empty the records are first passed through
    ``evaluate``; the (possibly updated) records are then rendered with
    ``report`` and also dumped to CSV.

    Args:
        jsonl_file_path: Path of the uploaded JSONL file.
        api_key: API key forwarded to ``evaluate``; empty or ``None`` skips
            that step.

    Returns:
        A ``(html_path, csv_path, message)`` tuple. On success the message
        is ``""``; on failure both paths are ``None`` and the message
        describes the error. Exceptions are never propagated because the
        result is shown directly in the UI.
    """
    if not jsonl_file_path:
        return None, None, "No JSONL file was uploaded."

    try:
        records = _read_jsonl(jsonl_file_path)
        if not records:
            raise ValueError("The JSONL file contains no records.")

        if api_key:
            records = evaluate(records, api_key)
        html_content = report(tasks=records)

        stem, _ = os.path.splitext(os.path.basename(jsonl_file_path))

        # Use the union of keys in first-seen order so that a record with an
        # extra field does not make DictWriter raise; missing fields are
        # written as empty cells.
        fieldnames = list(dict.fromkeys(key for row in records for key in row))

        with _open_temp_report(stem, ".html") as html_file:
            html_file.write(html_content)
            html_path = html_file.name

        # newline="" lets the csv module control line endings itself.
        with _open_temp_report(stem, ".csv", newline="") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(records)
            csv_path = csv_file.name

        return html_path, csv_path, ""
    except Exception as e:  # UI boundary: report the failure instead of crashing
        return None, None, str(e) or repr(e)


with gr.Blocks(head=ga_script) as reporting:
    jsonl_input = gr.File(label="JSONLファイルをアップロード")
    api_key_input = gr.Textbox(
        label="GeminiのAPIキー(スコアのセルフ評価を行う場合)", type="password"
    )
    gr.Markdown("APIキーの発行は[こちら](https://aistudio.google.com/app/apikey)")
    process_button = gr.Button("レポートを作成")

    output_file = gr.File(label="セルフ評価レポート(HTML)")
    output_csv = gr.File(label="セルフ評価レポート(CSV)")
    output_text = gr.Textbox(label="システムメッセージ")

    process_button.click(
        process_jsonl_file,
        inputs=[jsonl_input, api_key_input],
        outputs=[output_file, output_csv, output_text],
    )
    reporting.load(None, js=ga_load)


llm_jp_3 = "llm-jp/llm-jp-3-1.8b"
gemma_2 = "google/gemma-2-2b"

# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run at load time -- confirm it is required for these models.
llm_jp_3_tokenizer = AutoTokenizer.from_pretrained(llm_jp_3, trust_remote_code=True)
tokenizers = {
    "LLM-JP-3": llm_jp_3_tokenizer,
}
# The second tokenizer is best-effort: if loading raises OSError the error is
# printed and the tokenizer is simply left out of the dropdown.
try:
    gemma_2_tokenizer = AutoTokenizer.from_pretrained(gemma_2, trust_remote_code=True)
    tokenizers["Gemma-2"] = gemma_2_tokenizer
except OSError as e:
    print(e)

tokenizer_names = list(tokenizers.keys())


def tokenize_text(text: str, tokenizer_name: str):
    """Render *text* as color-highlighted tokens followed by the token count.

    Args:
        text: Text to tokenize; ``None`` is treated as an empty string.
        tokenizer_name: Key into the module-level ``tokenizers`` mapping.

    Returns:
        An HTML fragment with one highlighted ``<span>`` per token and a
        ``Token Count: N`` line.

    Raises:
        KeyError: If *tokenizer_name* is not a loaded tokenizer.
    """
    tokenizer = tokenizers[tokenizer_name]
    tokens = tokenizer.tokenize(text or "")
    tokenized_text = "".join(
        # Tokens originate from user input, so escape them before embedding
        # them in markup; cycle the palette so neighbours differ in color.
        f'<span style="background-color: {_TOKEN_COLORS[i % len(_TOKEN_COLORS)]};">'
        f"{html.escape(token)}</span> "
        for i, token in enumerate(tokens)
    )
    token_count = len(tokens)
    return f"<p>{tokenized_text}</p><p>Token Count: {token_count}</p>"


with gr.Blocks() as tokenization:
    with gr.Row():
        tokenizer_dropdown = gr.Dropdown(
            label="Tokenizerを選択", choices=tokenizer_names, value=tokenizer_names[0]
        )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input Text")
        with gr.Column():
            tokenized_output = gr.HTML(
                tokenize_text("", tokenizer_names[0]), label="Tokenized Output"
            )

    # Re-render whenever either the tokenizer choice or the text changes.
    tokenizer_dropdown.change(
        tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
    )
    text_input.change(
        tokenize_text, inputs=[text_input, tokenizer_dropdown], outputs=tokenized_output
    )


tabbed = gr.TabbedInterface(
    [reporting, tokenization],
    tab_names=["ELYZA-tasks-100(-TV) セルフ評価", "トークンの可視化"],
    title="LLM開発支援ツール",
)

if __name__ == "__main__":
    tabbed.launch()