lhoestq HF staff committed on
Commit
6b97460
1 Parent(s): 72a89db
Files changed (2) hide show
  1. api.py +2 -2
  2. gradio_app.py +21 -32
api.py CHANGED
@@ -6,7 +6,7 @@ from fastapi.responses import StreamingResponse
6
  from pydantic import BaseModel, StringConstraints
7
  from outlines import generate
8
 
9
- from generate import model, sampler, stream_file
10
 
11
  logger = logging.getLogger(__name__)
12
 
@@ -22,7 +22,7 @@ logger.warning("Model status: " + status)
22
 
23
 
24
  async def stream_response(filename: str, prompt: str, columns: list[str], seed: int, size: int):
25
- for chunk in stream_file(
26
  filename=filename,
27
  prompt=prompt,
28
  columns=columns,
 
6
  from pydantic import BaseModel, StringConstraints
7
  from outlines import generate
8
 
9
+ from generate import model, sampler, stream_jsonl_file
10
 
11
  logger = logging.getLogger(__name__)
12
 
 
22
 
23
 
24
  async def stream_response(filename: str, prompt: str, columns: list[str], seed: int, size: int):
25
+ for chunk in stream_jsonl_file(
26
  filename=filename,
27
  prompt=prompt,
28
  columns=columns,
gradio_app.py CHANGED
@@ -1,25 +1,35 @@
1
  import time
 
2
 
3
  import gradio as gr
4
  import io
5
  import pandas as pd
6
  import spaces
7
 
8
- from generate import stream_file
9
 
 
 
 
10
 
11
  @spaces.GPU(duration=120)
12
  def stream_output(filename: str):
13
- if filename.endswith(".jsonl"):
14
- filename = filename[:-len(".jsonl")]
 
 
 
 
 
 
 
15
  content = ""
16
- size=3
17
  start_time = time.time()
18
- for i, chunk in enumerate(stream_file(
19
  filename=filename,
20
- prompt="",
21
- columns=[],
22
- seed=42,
23
  size=size,
24
  )):
25
  content += chunk
@@ -31,32 +41,13 @@ def stream_output(filename: str):
31
  )
32
  yield df, "```json\n" + content + "\n```", state_msg
33
 
34
- def test(filename: str):
35
- if not filename.endswith(".jsonl"):
36
- yield "❌ 404: File name must end with .jsonl", None, ""
37
- return
38
-
39
- content = ""
40
- size = 10
41
- start_time = time.time()
42
- for i in range(size):
43
- content += f'{{"i": {i}, "filename": "{filename}"}}\n'
44
- df = pd.read_json(io.StringIO(content), lines=True)
45
- state_msg = (
46
- f"✅ Done generating {size} samples in {time.time() - start_time:.2f}s"
47
- if i + 1 == size else
48
- f"⚙️ Generating... [{i + 1}/{size}]"
49
- )
50
- yield df, "```json\n" + content + "\n```", state_msg
51
- time.sleep(0.1)
52
-
53
  title = "LLM DataGen"
54
  description = "Generate and stream synthetic dataset files in JSON Lines format"
55
  examples = [
56
  "movies_data.jsonl",
57
- "common_first_names.jsonl",
58
- "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
59
  "dungeon_and_dragon_characters.jsonl"
 
 
60
  ]
61
 
62
  with gr.Blocks() as demo:
@@ -69,9 +60,7 @@ with gr.Blocks() as demo:
69
  with gr.Tab("Dataset"):
70
  dataframe_comp = gr.DataFrame()
71
  with gr.Tab("File content"):
72
- with gr.Blocks(fill_height=True):
73
- with gr.Row():
74
- file_content_comp = gr.Markdown()
75
 
76
  generate_button.click(stream_output, filename_comp, [dataframe_comp, file_content_comp, state_msg_comp])
77
 
 
1
  import time
2
+ from urllib.parse import urlparse, parse_qs
3
 
4
  import gradio as gr
5
  import io
6
  import pandas as pd
7
  import spaces
8
 
9
+ from generate import stream_jsonl_file
10
 
11
+ MAX_SIZE = 20
12
+ DEFAULT_SEED = 42
13
+ DEFAULT_SIZE = 3
14
 
15
  @spaces.GPU(duration=120)
16
  def stream_output(filename: str):
17
+ parsed_filename = urlparse(filename)
18
+ filename = parsed_filename.path
19
+ params = parse_qs(parsed_filename.query)
20
+ prompt = params["prompt"][0] if "prompt" in params else ""
21
+ columns = [column.strip() for column in params["columns"][0].split(",") if column.strip()] if "columns" in params else []
22
+ size = int(params["size"][0]) if "size" in params else DEFAULT_SIZE
23
+ seed = int(params["seed"][0]) if "seed" in params else DEFAULT_SEED
24
+ if size > MAX_SIZE:
25
+ yield None, None, "Error: Maximum size is 20"
26
  content = ""
 
27
  start_time = time.time()
28
+ for i, chunk in enumerate(stream_jsonl_file(
29
  filename=filename,
30
+ prompt=prompt,
31
+ columns=columns,
32
+ seed=seed,
33
  size=size,
34
  )):
35
  content += chunk
 
41
  )
42
  yield df, "```json\n" + content + "\n```", state_msg
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  title = "LLM DataGen"
45
  description = "Generate and stream synthetic dataset files in JSON Lines format"
46
  examples = [
47
  "movies_data.jsonl",
 
 
48
  "dungeon_and_dragon_characters.jsonl"
49
+ "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
50
+ "common_first_names.jsonl?columns=first_name,popularity&size=10",
51
  ]
52
 
53
  with gr.Blocks() as demo:
 
60
  with gr.Tab("Dataset"):
61
  dataframe_comp = gr.DataFrame()
62
  with gr.Tab("File content"):
63
+ file_content_comp = gr.Markdown()
 
 
64
 
65
  generate_button.click(stream_output, filename_comp, [dataframe_comp, file_content_comp, state_msg_comp])
66