Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,161 Bytes
fbe940a 6b97460 4f83ec0 451395b 6447366 451395b 6b97460 451395b 6447366 4f83ec0 fbe940a 6b97460 fbe940a 456f50b fbe940a 6b97460 4f83ec0 6b97460 fbe940a 4f83ec0 456f50b fbe940a 4f83ec0 bb0eeb1 82dc3c2 bb0eeb1 4f83ec0 fbe940a 6b97460 4f83ec0 fbe940a 4f83ec0 6b97460 fbe940a 4f83ec0 451395b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
from pathlib import Path
from urllib.parse import urlparse, parse_qs
import gradio as gr
import io
import pandas as pd
import spaces
from generate import model_id, stream_jsonl_file
# Largest `size` a query may request; stream_output rejects anything above it.
MAX_SIZE = 20
# Seed used when the query string carries no `seed` parameter.
DEFAULT_SEED = 42
# `size` used when the query string carries no `size` parameter.
DEFAULT_SIZE = 3
def _int_query_param(params: dict, name: str, default: int) -> int:
    """Return query-string parameter *name* as an int, or *default* if absent.

    Raises:
        gr.Error: If the parameter is present but is not a valid integer.
    """
    if name not in params:
        return default
    try:
        return int(params[name][0])
    except ValueError:
        # Surface a readable message in the UI instead of a raw traceback.
        raise gr.Error(f"Parameter '{name}' must be an integer.") from None


@spaces.GPU(duration=120)
def stream_output(query: str, continue_content: str = ""):
    """Stream a generated JSON Lines dataset described by *query*.

    *query* is reduced to its final path component and then parsed like a URL:
    the path part is the file name handed to ``stream_jsonl_file`` and the
    optional query string may carry ``prompt``, ``columns`` (comma-separated),
    ``size`` and ``seed``.

    Args:
        query: File name, optionally followed by ``?key=value&...`` options.
        continue_content: Existing JSON Lines text to extend. When it already
            has columns, they replace any ``columns`` given in the query.

    Yields:
        5-tuples ``(dataframe, markdown, generate_button, more_button,
        download_button)``. One tuple is yielded before generation starts, one
        per streamed chunk, and a final one after the accumulated content has
        been written to a file named after the sanitized *query*.

    Raises:
        gr.Error: If the file name is empty, ``size``/``seed`` is not an
            integer, or ``size`` is outside ``1..MAX_SIZE``.
    """
    # Keep only the last path component so the file written below stays in the
    # working directory.
    # NOTE(review): this also cuts off anything before a "/" that appears
    # inside a query-string value (e.g. in `prompt`).
    query = Path(query).name
    if not query:
        # Without a name the final open() would fail after all the work is done.
        raise gr.Error("Please enter a file name to generate.")
    parsed_filename = urlparse(query)
    filename = parsed_filename.path
    params = parse_qs(parsed_filename.query)
    prompt = params["prompt"][0] if "prompt" in params else ""
    columns = (
        [column.strip() for column in params["columns"][0].split(",") if column.strip()]
        if "columns" in params
        else []
    )
    size = _int_query_param(params, "size", DEFAULT_SIZE)
    seed = _int_query_param(params, "seed", DEFAULT_SEED)
    if size < 1:
        # size == 0 would otherwise divide by zero in the seed offset below.
        raise gr.Error("Minimum size is 1.")
    if size > MAX_SIZE:
        raise gr.Error(f"Maximum size is {MAX_SIZE}. Duplicate this Space to remove this limit.")

    content = continue_content
    df = pd.read_json(io.StringIO(content), lines=True, convert_dates=False)
    continue_content_size = len(df)
    state_msg = f"⚙️ Generating... [{continue_content_size + 1}/{continue_content_size + size}]"
    if list(df.columns):
        # Continuing an existing dataset: reuse its columns.
        columns = list(df.columns)
    else:
        # Nothing parsed yet: show an empty placeholder table.
        df = pd.DataFrame({"1": [], "2": [], "3": []})
    yield df, "```json\n" + content + "\n```", gr.Button(state_msg), gr.Button("Generate one more batch", interactive=False), gr.DownloadButton("⬇️ Download", interactive=False)

    for i, chunk in enumerate(stream_jsonl_file(
        filename=filename,
        prompt=prompt,
        columns=columns,
        # Offset the seed by the number of full batches already present,
        # presumably so a continuation does not reuse the previous seed.
        seed=seed + (continue_content_size // size),
        size=size,
    )):
        content += chunk
        df = pd.read_json(io.StringIO(content), lines=True, convert_dates=False)
        state_msg = f"⚙️ Generating... [{continue_content_size + i + 1}/{continue_content_size + size}]"
        yield df, "```json\n" + content + "\n```", gr.Button(state_msg), gr.Button("Generate one more batch", interactive=False), gr.DownloadButton("⬇️ Download", interactive=False)

    # Persist the result under the sanitized query so it can be downloaded and
    # later read back by stream_more_output.
    with open(query, "w", encoding="utf-8") as f:
        f.write(content)
    yield df, "```json\n" + content + "\n```", gr.Button("Generate dataset"), gr.Button("Generate one more batch", visible=True, interactive=True), gr.DownloadButton("⬇️ Download", value=query, visible=True, interactive=True)
def stream_more_output(query: str):
    """Extend the dataset previously saved for *query* with one more batch.

    The file named after the final path component of *query* is read back and
    its text is passed to ``stream_output`` as ``continue_content``; every
    value that generator yields is re-yielded unchanged.

    Args:
        query: The same file-name/query string that was used to generate the
            saved file.
    """
    saved_name = Path(query).name
    with open(saved_name, encoding="utf-8") as saved_file:
        previous_rows = saved_file.read()
    yield from stream_output(query=saved_name, continue_content=previous_rows)
# Heading and introductory Markdown rendered at the top of the demo page.
title = "LLM DataGen"
description = (
    # The model link previously pointed at "huggingface.co." (stray trailing dot in the host).
    f"Generate and stream synthetic dataset files in `{{JSON Lines}}` format (currently using [{model_id}](https://huggingface.co/{model_id}))\n\n"
    "Disclaimer: LLM data generation is an area of active research with known problems such as biased generation and incorrect information."
)
# Sample inputs for the file-name textbox. An entry may append URL-style
# options (e.g. `?columns=...&size=...`) after the file name; the first entry
# is also used as the textbox's initial value and placeholder.
examples = [
    "movies_data.jsonl",
    "dungeon_and_dragon_characters.jsonl",
    "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
    "common_first_names.jsonl?columns=first_name,popularity&size=10",
]
# Build the UI. Components are laid out in creation order, so the statement
# order inside this block is significant.
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    filename_comp = gr.Textbox(examples[0], placeholder=examples[0], label="File name to generate")
    generate_button = gr.Button("Generate dataset")
    # Two views of the same generated data: a table and the raw file content.
    with gr.Tab("Dataset"):
        dataframe_comp = gr.DataFrame()
    with gr.Tab("File content"):
        file_content_comp = gr.Markdown()
    # Follow-up actions stay hidden until stream_output's final yield reveals them.
    with gr.Row():
        generate_more_button = gr.Button("Generate one more batch", visible=False, interactive=False, scale=3)
        download_button = gr.DownloadButton("⬇️ Download", visible=False, interactive=False, scale=1)
    # Order must match the 5-tuples yielded by stream_output.
    outputs = [dataframe_comp, file_content_comp, generate_button, generate_more_button, download_button]
    # NOTE: this rebinds `examples` from the list of strings to the gr.Examples component.
    examples = gr.Examples(examples, filename_comp, outputs, fn=stream_output, run_on_click=True)
    generate_button.click(stream_output, filename_comp, outputs)
    generate_more_button.click(stream_more_output, filename_comp, outputs)

demo.launch()