lhoestq (HF staff) committed
Commit 4482b40 · 1 Parent(s): e4a82b3

add full generation

Files changed (1):
  1. app.py +227 -28
app.py CHANGED
@@ -1,9 +1,14 @@
  import time
  from itertools import islice
  from functools import partial
- from typing import Iterable, Iterator, TypeVar

  import gradio as gr
  import requests.exceptions
  from huggingface_hub import InferenceClient
 
@@ -13,6 +18,8 @@ client = InferenceClient(model_id)
  MAX_TOTAL_NB_ITEMS = 100 # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
  MAX_NB_ITEMS_PER_GENERATION_CALL = 10
  URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"

  GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
@@ -29,6 +36,23 @@ GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS = (
  "Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts). "
  "Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**."
  )

  landing_page_query = "various datasets on many different subjects and topics, from classification to language modeling, from science to sport to finance to news"
 
@@ -174,27 +198,24 @@ with gr.Blocks(css=css) as demo:
          with gr.Column(scale=4, min_width=0):
              pass
      with gr.Column(visible=False) as dataset_page:
-         with gr.Row():
-             with gr.Column(scale=4, min_width=0):
-                 pass
-             with gr.Column(scale=10):
-                 dataset_title = gr.Markdown()
-                 dataset_content = gr.Markdown()
-                 with gr.Row():
-                     with gr.Column(scale=4, min_width=0):
-                         pass
-                     with gr.Column():
-                         generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")  # TODO: implement
-                         dataset_share_button = gr.Button("Share Dataset URL")
-                         dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
-                         back_button = gr.Button("< Back", size="sm")
-                     with gr.Column(scale=4, min_width=0):
-                         pass
-             with gr.Column(scale=4, min_width=0):
-                 pass

      app_state = gr.State({})

      T = TypeVar("T")

      def batched(it: Iterable[T], n: int) -> Iterator[list[T]]:
@@ -264,6 +285,139 @@ with gr.Blocks(css=css) as demo:
          print("-----\n\n" + generated_text)


      def _search_datasets(search_query):
          yield {generated_texts_state: [], app_state: {"search_query": search_query}}
          yield {
@@ -303,11 +457,13 @@ with gr.Blocks(css=css) as demo:
      @search_button.click(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state, app_state])
      def search_dataset_from_search_button(search_query):
          yield from _search_datasets(search_query)
-
      @search_bar.submit(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state, app_state])
      def search_dataset_from_search_bar(search_query):
          yield from _search_datasets(search_query)

      @load_more_datasets.click(inputs=[search_bar, generated_texts_state], outputs=button_groups + buttons + [generated_texts_state])
      def search_more_datasets(search_query, generated_texts):
          current_item_idx = initial_item_idx = len(generated_texts) * MAX_NB_ITEMS_PER_GENERATION_CALL
@@ -339,8 +495,11 @@ with gr.Blocks(css=css) as demo:
          yield {
              search_page: gr.Column(visible=False),
              dataset_page: gr.Column(visible=True),
-             dataset_title: f"# {dataset_name}\n\n tags: {tags}\n\n _Note: This is an AI-generated dataset so its content may be inaccurate or false_",
              dataset_share_textbox: gr.Textbox(visible=False),
              app_state: {
                  "search_query": search_query,
                  "dataset_name": dataset_name,
@@ -352,7 +511,7 @@ with gr.Blocks(css=css) as demo:
      show_dataset_inputs = [search_bar, *buttons]
-     show_dataset_outputs = [app_state, search_page, dataset_page, dataset_title, dataset_content, dataset_share_textbox]
      scroll_to_top_js = """
      function (...args) {
          console.log(args);
@@ -363,7 +522,7 @@ with gr.Blocks(css=css) as demo:
          }
          return args;
      }
-     """.replace("len(show_dataset_inputs)", str(len(show_dataset_inputs)))

      def show_dataset_from_button(search_query, *buttons_values, i):
          dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
@@ -374,18 +533,58 @@ with gr.Blocks(css=css) as demo:
          tags_button.click(partial(show_dataset_from_button, i=i), inputs=show_dataset_inputs, outputs=show_dataset_outputs, js=scroll_to_top_js)


-     @back_button.click(outputs=[search_page, dataset_page])
      def show_search_page():
          return gr.Column(visible=True), gr.Column(visible=False)

-     @generate_full_dataset_button.click()
-     def generate_full_dataset():
-         raise gr.Error("Not implemented yet sorry ! Request your dataset in the Discussion tab (provide the dataset URL)")

      @dataset_share_button.click(inputs=[app_state], outputs=[dataset_share_textbox])
      def show_dataset_url(state):
          return gr.Textbox(
-             f"{URL}?q={state['search_query'].replace(' ', '+')}&dataset={state['dataset_name']}&tags={state['tags']}",
              visible=True,
          )
 
 
+ import io
+ import re
  import time
  from itertools import islice
  from functools import partial
+ from multiprocessing.pool import ThreadPool
+ from queue import Queue, Empty
+ from typing import Callable, Iterable, Iterator, Optional, TypeVar

  import gradio as gr
+ import pandas as pd
  import requests.exceptions
  from huggingface_hub import InferenceClient
 
 
  MAX_TOTAL_NB_ITEMS = 100 # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
  MAX_NB_ITEMS_PER_GENERATION_CALL = 10
+ NUM_ROWS = 100
+ NUM_VARIANTS = 10
  URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"

  GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
 
  "Focus on quality text content and use a 'label' or 'labels' column if it makes sense (invent labels, avoid reusing the keywords, be accurate while labelling texts). "
  "Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**."
  )
+ GENERATE_MORE_ROWS = "Can you give me 10 additional samples in CSV format as well? Use the same CSV header '{csv_header}'."
+ GENERATE_VARIANTS_WITH_RARITY_AND_LABEL = "Focus on generating samples for the label '{label}' and ideally generate {rarity} samples."
+ GENERATE_VARIANTS_WITH_RARITY = "Focus on generating {rarity} samples."
+
+ RARITIES = ["pretty obvious", "common/regular", "unexpected but useful", "uncommon but still plausible", "rare/niche but still plausible"]
+ LONG_RARITIES = [
+     "obvious",
+     "expected",
+     "common",
+     "regular",
+     "unexpected but useful",
+     "original but useful",
+     "specific but not far-fetched",
+     "uncommon but still plausible",
+     "rare but still plausible",
+     "very niche but still plausible",
+ ]

  landing_page_query = "various datasets on many different subjects and topics, from classification to language modeling, from science to sport to finance to news"
 
          with gr.Column(scale=4, min_width=0):
              pass
      with gr.Column(visible=False) as dataset_page:
+         dataset_title = gr.Markdown()
+         gr.Markdown("_Note: This is an AI-generated dataset so its content may be inaccurate or false_")
+         dataset_content = gr.Markdown()
+         generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
+         dataset_dataframe = gr.DataFrame(visible=False, interactive=False, wrap=True)
+         save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)
+         dataset_share_button = gr.Button("Share Dataset URL")
+         dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
+         back_button = gr.Button("< Back", size="sm")

      app_state = gr.State({})

+     ###################################
+     #
+     # Utils
+     #
+     ###################################
+
      T = TypeVar("T")

      def batched(it: Iterable[T], n: int) -> Iterator[list[T]]:
 
          print("-----\n\n" + generated_text)


+     def _write_generator_to_queue(queue: Queue, func: Callable[..., Iterable], kwargs: dict) -> None:
+         # runs in a worker thread: drain one generator into the shared queue
+         for result in func(**kwargs):
+             queue.put(result)
+         return None
+
+
+     def iflatmap_unordered(
+         func: Callable[..., Iterable[T]],
+         *,
+         kwargs_iterable: Iterable[dict],
+     ) -> Iterable[T]:
+         queue = Queue()
+         with ThreadPool() as pool:
+             async_results = [
+                 pool.apply_async(_write_generator_to_queue, (queue, func, kwargs)) for kwargs in kwargs_iterable
+             ]
+             try:
+                 while True:
+                     try:
+                         yield queue.get(timeout=0.05)
+                     except Empty:
+                         if all(async_result.ready() for async_result in async_results) and queue.empty():
+                             break
+             finally:
+                 # we get the result in case there's an error to raise
+                 [async_result.get(timeout=0.05) for async_result in async_results]
+
+
+     def generate_partial_dataset(title: str, content: str, search_query: str, variant: str, csv_header: str, output: list[dict[str, str]], indices_to_generate: list[int], max_tokens=1500) -> Iterator[int]:
+         dataset_name, tags = title.strip("# ").split("\n\n tags:", 1)
+         dataset_name, tags = dataset_name.strip(), tags.strip()
+         messages = [
+             {
+                 "role": "user",
+                 "content": GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
+                     dataset_name=dataset_name,
+                     tags=tags,
+                     search_query=search_query,
+                 )
+             },
+             {"role": "assistant", "content": title + "\n\n" + content},
+             {"role": "user", "content": GENERATE_MORE_ROWS.format(csv_header=csv_header) + " " + variant},
+         ]
+         for _ in range(3):  # retry a few times on connection errors
+             generated_text = ""
+             generated_csv = ""
+             current_line = ""
+             nb_samples = 0
+             _in_csv = False
+             try:
+                 for message in client.chat_completion(
+                     messages=messages,
+                     max_tokens=max_tokens,
+                     stream=True,
+                     top_p=0.8,
+                     seed=42,
+                 ):
+                     if nb_samples >= len(indices_to_generate):
+                         break
+                     current_line += message.choices[0].delta.content
+                     generated_text += message.choices[0].delta.content
+                     if current_line.endswith("\n"):
+                         # toggle the flag every time a ``` fence line goes by
+                         _in_csv = _in_csv ^ current_line.lstrip().startswith("```")
+                         if current_line.strip() and _in_csv and not current_line.lstrip().startswith("```"):
+                             generated_csv += current_line
+                             try:
+                                 generated_df = parse_csv_df(generated_csv.strip(), csv_header=csv_header)
+                                 if len(generated_df) > nb_samples:
+                                     output[indices_to_generate[nb_samples]] = generated_df.iloc[-1].to_dict()
+                                     nb_samples += 1
+                                     yield 1
+                             except Exception:
+                                 pass
+                         current_line = ""
+             except requests.exceptions.ConnectionError as e:
+                 print(f"{e}\n\nRetrying in 1sec")
+                 time.sleep(1)
+                 continue
+             break
+         # for debugging
+         # with open(f"output{indices_to_generate[0]}.txt", "w") as f:
+         #     f.write(generated_text)
+
+
+     def generate_variants(preview_df: pd.DataFrame):
+         label_candidate_columns = [column for column in preview_df.columns if "label" in column.lower()]
+         if label_candidate_columns:
+             labels = preview_df[label_candidate_columns[0]].unique()
+             if len(labels) > 1:
+                 return [
+                     GENERATE_VARIANTS_WITH_RARITY_AND_LABEL.format(rarity=rarity, label=label)
+                     for rarity in RARITIES
+                     for label in labels
+                 ]
+         return [
+             GENERATE_VARIANTS_WITH_RARITY.format(rarity=rarity)
+             for rarity in LONG_RARITIES
+         ]
+
+
+     def parse_preview_df(content: str) -> tuple[str, pd.DataFrame]:
+         # keep only the lines inside the ``` code fence (the CSV preview)
+         _in_csv = False
+         csv = "\n".join(
+             line for line in content.split("\n") if line.strip()
+             and (_in_csv := (_in_csv ^ line.lstrip().startswith("```")))
+             and not line.lstrip().startswith("```")
+         )
+         if not csv:
+             raise gr.Error("Failed to parse CSV Preview")
+         return csv.split("\n")[0], parse_csv_df(csv)
+
+
+     def parse_csv_df(csv: str, csv_header: Optional[str] = None) -> pd.DataFrame:
+         # Fix generation mistake when providing a list that is not in quotes
+         if ",[" in csv:
+             for match in re.finditer(r'\[("[\w ]+"[, ]?)+\]', csv):
+                 span = match.group(0)
+                 csv = csv.replace(span, '"' + span.replace('"', "'") + '"')
+         # Add header if missing
+         if csv_header and csv.strip().split("\n")[0] != csv_header:
+             csv = csv_header + "\n" + csv
+         # Read CSV
+         df = pd.read_csv(io.StringIO(csv))
+         return df
+
+
+     ###################################
+     #
+     # Buttons
+     #
+     ###################################
+
+
      def _search_datasets(search_query):
          yield {generated_texts_state: [], app_state: {"search_query": search_query}}
          yield {
 
      @search_button.click(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state, app_state])
      def search_dataset_from_search_button(search_query):
          yield from _search_datasets(search_query)
+
+
      @search_bar.submit(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state, app_state])
      def search_dataset_from_search_bar(search_query):
          yield from _search_datasets(search_query)

+
      @load_more_datasets.click(inputs=[search_bar, generated_texts_state], outputs=button_groups + buttons + [generated_texts_state])
      def search_more_datasets(search_query, generated_texts):
          current_item_idx = initial_item_idx = len(generated_texts) * MAX_NB_ITEMS_PER_GENERATION_CALL
 
          yield {
              search_page: gr.Column(visible=False),
              dataset_page: gr.Column(visible=True),
+             dataset_title: f"# {dataset_name}\n\n tags: {tags}",
              dataset_share_textbox: gr.Textbox(visible=False),
+             dataset_dataframe: gr.DataFrame(visible=False),
+             generate_full_dataset_button: gr.Button(visible=True),
+             save_dataset_button: gr.Button(visible=False),
              app_state: {
                  "search_query": search_query,
                  "dataset_name": dataset_name,
 
      show_dataset_inputs = [search_bar, *buttons]
+     show_dataset_outputs = [app_state, search_page, dataset_page, dataset_title, dataset_content, generate_full_dataset_button, dataset_dataframe, save_dataset_button, dataset_share_textbox]
      scroll_to_top_js = """
      function (...args) {
          console.log(args);
 
          }
          return args;
      }
+     """

      def show_dataset_from_button(search_query, *buttons_values, i):
          dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
 
          tags_button.click(partial(show_dataset_from_button, i=i), inputs=show_dataset_inputs, outputs=show_dataset_outputs, js=scroll_to_top_js)


+     @back_button.click(outputs=[search_page, dataset_page], js=scroll_to_top_js)
      def show_search_page():
          return gr.Column(visible=True), gr.Column(visible=False)

+
+     @generate_full_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar], outputs=[dataset_dataframe, generate_full_dataset_button, save_dataset_button])
+     def generate_full_dataset(title, content, search_query):
+         csv_header, preview_df = parse_preview_df(content)
+         # Remove dummy "id" columns
+         for column_name, values in preview_df.to_dict(orient="series").items():
+             try:
+                 if [int(v) for v in values] == list(range(len(preview_df))):
+                     preview_df = preview_df.drop(columns=column_name)
+                 if [int(v) for v in values] == list(range(1, len(preview_df) + 1)):
+                     preview_df = preview_df.drop(columns=column_name)
+             except Exception:
+                 pass
+         columns = list(preview_df)
+         output: list[Optional[dict]] = [None] * NUM_ROWS
+         output[:len(preview_df)] = [{"idx": i, **x} for i, x in enumerate(preview_df.to_dict(orient="records"))]
+         yield {
+             dataset_dataframe: gr.DataFrame(pd.DataFrame([{"idx": i, **x} for i, x in enumerate(output) if x]), visible=True),
+             generate_full_dataset_button: gr.Button(visible=False),
+             save_dataset_button: gr.Button(visible=True, interactive=False)
+         }
+         kwargs_iterable = [
+             {
+                 "title": title,
+                 "content": content,
+                 "search_query": search_query,
+                 "variant": variant,
+                 "csv_header": csv_header,
+                 "output": output,
+                 "indices_to_generate": list(range(len(preview_df) + i, NUM_ROWS, NUM_VARIANTS)),
+             }
+             for i, variant in enumerate(islice(generate_variants(preview_df), NUM_VARIANTS))
+         ]
+         # run all variants in parallel and refresh the table every time a row lands
+         for _ in iflatmap_unordered(generate_partial_dataset, kwargs_iterable=kwargs_iterable):
+             yield {dataset_dataframe: pd.DataFrame([{"idx": i, **{column_name: x.get(column_name) for column_name in columns}} for i, x in enumerate(output) if x])}
+         yield {save_dataset_button: gr.Button(visible=True, interactive=True)}
+         print(f"Successfully generated {title}!")
+
+
+     @save_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, dataset_dataframe])
+     def save_dataset(title, content, search_query, df):
+         raise gr.Error("Not implemented yet, sorry! Request your dataset to be saved in the Discussion tab (provide the dataset URL)")
+

      @dataset_share_button.click(inputs=[app_state], outputs=[dataset_share_textbox])
      def show_dataset_url(state):
          return gr.Textbox(
+             f"{URL}?q={state['search_query'].replace(' ', '+')}&dataset={state['dataset_name'].replace(' ', '+')}&tags={state['tags'].replace(' ', '+')}",
              visible=True,
          )
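
For context on how the new pieces fit together: `generate_full_dataset` fans one `generate_partial_dataset` call per prompt variant out to a thread pool via `iflatmap_unordered`, and re-renders the DataFrame every time any thread yields a parsed row. Below is a minimal, self-contained sketch of that streaming pattern; the two helpers mirror the ones added in this commit, while `slow_count` and its arguments are made up for illustration (a stand-in for `generate_partial_dataset`):

```python
import time
from multiprocessing.pool import ThreadPool
from queue import Empty, Queue
from typing import Callable, Iterable, Iterator, TypeVar

T = TypeVar("T")


def _write_generator_to_queue(queue: Queue, func: Callable[..., Iterable], kwargs: dict) -> None:
    # Runs in a worker thread: drain one generator into the shared queue.
    for result in func(**kwargs):
        queue.put(result)


def iflatmap_unordered(func: Callable[..., Iterable[T]], *, kwargs_iterable: Iterable[dict]) -> Iterator[T]:
    queue: Queue = Queue()
    with ThreadPool() as pool:
        async_results = [pool.apply_async(_write_generator_to_queue, (queue, func, kwargs)) for kwargs in kwargs_iterable]
        try:
            while True:
                try:
                    yield queue.get(timeout=0.05)
                except Empty:
                    # stop once every worker is done and the queue is drained
                    if all(r.ready() for r in async_results) and queue.empty():
                        break
        finally:
            [r.get(timeout=0.05) for r in async_results]  # surface worker exceptions


def slow_count(name: str, n: int, delay: float) -> Iterator[str]:
    # Made-up stand-in for generate_partial_dataset: yields one item at a time.
    for i in range(n):
        time.sleep(delay)
        yield f"{name}-{i}"


if __name__ == "__main__":
    # Items from the three generators arrive interleaved, in completion order,
    # which is what lets the UI refresh after every generated sample.
    kwargs_iterable = [{"name": name, "n": 3, "delay": delay} for name, delay in [("a", 0.10), ("b", 0.07), ("c", 0.13)]]
    for item in iflatmap_unordered(slow_count, kwargs_iterable=kwargs_iterable):
        print(item)
```

On normal completion the trailing `.get()` calls return immediately and re-raise any exception that occurred inside a worker thread, so failures in one variant are not silently dropped.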