Spaces:
Running
on
Zero
Running
on
Zero
cleaning
Browse files- api.py +2 -2
- gradio_app.py +21 -32
api.py
CHANGED
@@ -6,7 +6,7 @@ from fastapi.responses import StreamingResponse
|
|
6 |
from pydantic import BaseModel, StringConstraints
|
7 |
from outlines import generate
|
8 |
|
9 |
-
from generate import model, sampler,
|
10 |
|
11 |
logger = logging.getLogger(__name__)
|
12 |
|
@@ -22,7 +22,7 @@ logger.warning("Model status: " + status)
|
|
22 |
|
23 |
|
24 |
async def stream_response(filename: str, prompt: str, columns: list[str], seed: int, size: int):
|
25 |
-
for chunk in
|
26 |
filename=filename,
|
27 |
prompt=prompt,
|
28 |
columns=columns,
|
|
|
6 |
from pydantic import BaseModel, StringConstraints
|
7 |
from outlines import generate
|
8 |
|
9 |
+
from generate import model, sampler, stream_jsonl_file
|
10 |
|
11 |
logger = logging.getLogger(__name__)
|
12 |
|
|
|
22 |
|
23 |
|
24 |
async def stream_response(filename: str, prompt: str, columns: list[str], seed: int, size: int):
|
25 |
+
for chunk in stream_jsonl_file(
|
26 |
filename=filename,
|
27 |
prompt=prompt,
|
28 |
columns=columns,
|
gradio_app.py
CHANGED
@@ -1,25 +1,35 @@
|
|
1 |
import time
|
|
|
2 |
|
3 |
import gradio as gr
|
4 |
import io
|
5 |
import pandas as pd
|
6 |
import spaces
|
7 |
|
8 |
-
from generate import
|
9 |
|
|
|
|
|
|
|
10 |
|
11 |
@spaces.GPU(duration=120)
|
12 |
def stream_output(filename: str):
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
content = ""
|
16 |
-
size=3
|
17 |
start_time = time.time()
|
18 |
-
for i, chunk in enumerate(
|
19 |
filename=filename,
|
20 |
-
prompt=
|
21 |
-
columns=
|
22 |
-
seed=
|
23 |
size=size,
|
24 |
)):
|
25 |
content += chunk
|
@@ -31,32 +41,13 @@ def stream_output(filename: str):
|
|
31 |
)
|
32 |
yield df, "```json\n" + content + "\n```", state_msg
|
33 |
|
34 |
-
def test(filename: str):
|
35 |
-
if not filename.endswith(".jsonl"):
|
36 |
-
yield "❌ 404: File name must end with .jsonl", None, ""
|
37 |
-
return
|
38 |
-
|
39 |
-
content = ""
|
40 |
-
size = 10
|
41 |
-
start_time = time.time()
|
42 |
-
for i in range(size):
|
43 |
-
content += f'{{"i": {i}, "filename": "{filename}"}}\n'
|
44 |
-
df = pd.read_json(io.StringIO(content), lines=True)
|
45 |
-
state_msg = (
|
46 |
-
f"✅ Done generating {size} samples in {time.time() - start_time:.2f}s"
|
47 |
-
if i + 1 == size else
|
48 |
-
f"⚙️ Generating... [{i + 1}/{size}]"
|
49 |
-
)
|
50 |
-
yield df, "```json\n" + content + "\n```", state_msg
|
51 |
-
time.sleep(0.1)
|
52 |
-
|
53 |
title = "LLM DataGen"
|
54 |
description = "Generate and stream synthetic dataset files in JSON Lines format"
|
55 |
examples = [
|
56 |
"movies_data.jsonl",
|
57 |
-
"common_first_names.jsonl",
|
58 |
-
"bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
|
59 |
"dungeon_and_dragon_characters.jsonl"
|
|
|
|
|
60 |
]
|
61 |
|
62 |
with gr.Blocks() as demo:
|
@@ -69,9 +60,7 @@ with gr.Blocks() as demo:
|
|
69 |
with gr.Tab("Dataset"):
|
70 |
dataframe_comp = gr.DataFrame()
|
71 |
with gr.Tab("File content"):
|
72 |
-
|
73 |
-
with gr.Row():
|
74 |
-
file_content_comp = gr.Markdown()
|
75 |
|
76 |
generate_button.click(stream_output, filename_comp, [dataframe_comp, file_content_comp, state_msg_comp])
|
77 |
|
|
|
1 |
import time
|
2 |
+
from urllib.parse import urlparse, parse_qs
|
3 |
|
4 |
import gradio as gr
|
5 |
import io
|
6 |
import pandas as pd
|
7 |
import spaces
|
8 |
|
9 |
+
from generate import stream_jsonl_file
|
10 |
|
11 |
+
MAX_SIZE = 20
|
12 |
+
DEFAULT_SEED = 42
|
13 |
+
DEFAULT_SIZE = 3
|
14 |
|
15 |
@spaces.GPU(duration=120)
|
16 |
def stream_output(filename: str):
|
17 |
+
parsed_filename = urlparse(filename)
|
18 |
+
filename = parsed_filename.path
|
19 |
+
params = parse_qs(parsed_filename.query)
|
20 |
+
prompt = params["prompt"][0] if "prompt" in params else ""
|
21 |
+
columns = [column.strip() for column in params["columns"][0].split(",") if column.strip()] if "columns" in params else []
|
22 |
+
size = int(params["size"][0]) if "size" in params else DEFAULT_SIZE
|
23 |
+
seed = int(params["seed"][0]) if "seed" in params else DEFAULT_SEED
|
24 |
+
if size > MAX_SIZE:
|
25 |
+
yield None, None, "Error: Maximum size is 20"
return
|
26 |
content = ""
|
|
|
27 |
start_time = time.time()
|
28 |
+
for i, chunk in enumerate(stream_jsonl_file(
|
29 |
filename=filename,
|
30 |
+
prompt=prompt,
|
31 |
+
columns=columns,
|
32 |
+
seed=seed,
|
33 |
size=size,
|
34 |
)):
|
35 |
content += chunk
|
|
|
41 |
)
|
42 |
yield df, "```json\n" + content + "\n```", state_msg
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
title = "LLM DataGen"
|
45 |
description = "Generate and stream synthetic dataset files in JSON Lines format"
|
46 |
examples = [
|
47 |
"movies_data.jsonl",
|
|
|
|
|
48 |
"dungeon_and_dragon_characters.jsonl",
|
49 |
+
"bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
|
50 |
+
"common_first_names.jsonl?columns=first_name,popularity&size=10",
|
51 |
]
|
52 |
|
53 |
with gr.Blocks() as demo:
|
|
|
60 |
with gr.Tab("Dataset"):
|
61 |
dataframe_comp = gr.DataFrame()
|
62 |
with gr.Tab("File content"):
|
63 |
+
file_content_comp = gr.Markdown()
|
|
|
|
|
64 |
|
65 |
generate_button.click(stream_output, filename_comp, [dataframe_comp, file_content_comp, state_msg_comp])
|
66 |
|