Update app.py
app.py
CHANGED
@@ -1,41 +1,19 @@
 import gradio as gr
 from datasets import load_dataset, Features, Value, Audio, Dataset
 from huggingface_hub import HfApi, create_repo
-import
-
-# --- Configuration --- (Moved inside functions where needed, for Gradio)
-animal_keywords = [
-    "dog", "cat", "bird", "fish", "horse", "cow", "sheep", "pig", "chicken",
-    "duck", "goat", "lion", "tiger", "bear", "elephant", "monkey", "zebra",
-    "giraffe", "rhino", "hippo", "crocodile", "snake", "frog", "turtle",
-    "lizard", "spider", "ant", "bee", "butterfly", "wolf", "fox", "deer",
-    "rabbit", "squirrel", "mouse", "rat", "hamster", "guinea pig", "parrot",
-    "owl", "eagle", "hawk", "penguin", "dolphin", "whale", "shark", "seal",
-    "octopus", "crab", "lobster", "shrimp", "snail", "worm", "kangaroo", "koala",
-    "panda", "sloth", "hedgehog", "raccoon", "skunk", "beaver", "otter",
-    "platypus", "jaguar", "leopard", "cheetah", "puma", "ostrich", "emu",
-    "flamingo", "peacock", "swan", "goose", "turkey", "pigeon", "seagull", "antelope",
-    "bison", "buffalo", "camel", "llama", "alpaca", "donkey", "mule", "ferret",
-    "mongoose", "meerkat", "wombat", "dingo", "armadillo", "badger", "chipmunk", "porcupine"
-]
-
-
-def filter_and_push(dataset_name, split_name, keywords_text, new_dataset_repo_id, hf_token):
-    """Filters a dataset based on keywords and pushes it to the Hub."""
-
-    if not hf_token:
-        return "Error: Hugging Face token is required. Please provide it.", None
-
+import pandas as pd # Import pandas for displaying the dataset
+
+
+def filter_dataset(dataset_name, split_name, keywords_text):
+    """Filters a dataset based on keywords and returns a Pandas DataFrame."""
     try:
         # --- 1. Load the dataset in streaming mode ---
         dataset = load_dataset(dataset_name, split=split_name, streaming=True)

         # --- 2. Filter the dataset (streaming) ---
-        # Process keywords: split the comma-separated string, strip whitespace
         keywords = [keyword.strip().lower() for keyword in keywords_text.split(',') if keyword.strip()]
         if not keywords:
-            # return "Error: No keywords provided. Please enter at least one keyword.", None
+            return pd.DataFrame(), "Error: No keywords provided."

         filtered_dataset = dataset.filter(
             lambda example: any(keyword in example["prompt"].lower() for keyword in keywords)
@@ -43,48 +21,79 @@ def filter_and_push(dataset_name, split_name, keywords_text, new_dataset_repo_id

         # --- 3. Select Indices (Efficiently) ---
         matching_indices = []
+        data_for_df = [] # Store data for DataFrame
         for i, example in enumerate(filtered_dataset):
             matching_indices.append(i)
+            # Extract data and append. Crucially, *decode* audio here.
+            example_data = {
+                'prompt': example['prompt'],
+                'strategy': example['strategy'],
+                'seed': example['seed'],
+                'audio': example['audio']['array'] # Get the NumPy array
+            }
+            data_for_df.append(example_data)

         if not matching_indices:
-            return "No matching examples found.", None
+            return pd.DataFrame(), "No matching examples found."
+
+        # --- 4. Create Pandas DataFrame ---
+        df = pd.DataFrame(data_for_df)
+        return df, f"Found {len(matching_indices)} matching examples."
+
+    except Exception as e:
+        return pd.DataFrame(), f"An error occurred: {e}"
+
+
+def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
+    """Pushes a Pandas DataFrame (from JSON) to the Hugging Face Hub."""
+    if not hf_token:
+        return "Error: Hugging Face token is required.", None
+
+    try:
+        # Convert JSON back to DataFrame
+        df = pd.read_json(df_json)
+
+        if df.empty:
+            return "Error: Cannot push an empty dataset",None
+
+        # Convert DataFrame to Hugging Face Dataset
+        dataset = Dataset.from_pandas(df)

-        # ---
+        # --- Load original (for feature definition)
         full_dataset = load_dataset(dataset_name, split=split_name, streaming=False)
+
+        if len(full_dataset) == 0:
+            return "Error: Source Dataset Appears Empty",None

         # --- 5. Define features (for consistent schema) ---
         features = Features({
             'prompt': Value(dtype='string', id=None),
-            'audio': Audio(sampling_rate=16000),
+            'audio': Audio(sampling_rate=16000),
             'strategy': Value(dtype='string', id=None),
             'seed': Value(dtype='int64', id=None)
         })

         try:
+            dataset = dataset.cast(features)
         except Exception as e:
+            return f"An error occurred: {e}",None

-        # --- 6. Upload the Subset Dataset ---
-        api = HfApi(token=hf_token)
-
-        #
+        # --- 6. Upload to the Hugging Face Hub ---
+        api = HfApi(token=hf_token)
         try:
             create_repo(new_dataset_repo_id, token=hf_token, repo_type="dataset")
             print(f"Repository '{new_dataset_repo_id}' created.")
         except Exception as e:
+            if "Repo already exists" not in str(e):
                 return f"Error creating repository: {e}", None

-        subset_dataset.push_to_hub(new_dataset_repo_id)
+        dataset.push_to_hub(new_dataset_repo_id)
         dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
-        return f"Subset dataset uploaded successfully!", dataset_url
+        return f"Subset dataset uploaded successfully!", dataset_url

     except Exception as e:
-        return f"An error occurred: {e}", None
+        return f"An error occurred during push: {e}", None


 # --- Gradio Interface ---
@@ -92,26 +101,44 @@ with gr.Blocks() as demo:
     gr.Markdown("# Dataset Filter and Push")

     with gr.Row():
-        dataset_name_input = gr.Textbox(label="Source Dataset Name")
-        split_name_input = gr.Textbox(label="Split Name")
+        dataset_name_input = gr.Textbox(label="Source Dataset Name", value="declare-lab/audio-alpaca")
+        split_name_input = gr.Textbox(label="Split Name", value="train")

-    keywords_input = gr.Textbox(label="Keywords (comma-separated)")
+    keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")

-    new_dataset_repo_id_input = gr.Textbox(label="New Dataset Repo ID (e.g., your_username/your_dataset)")
-    hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")
+    filter_button = gr.Button("Filter Dataset")
+
+    # Display the filtered data. 'label' is important for presentation.
+    filtered_data_output = gr.Dataframe(label="Filtered Data")
+    filter_status_output = gr.Textbox(label="Filter Status")

     with gr.Row():
+        new_dataset_repo_id_input = gr.Textbox(label="New Dataset Repo ID")
+        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")
+
+    push_button = gr.Button("Push to Hub")
+    push_status_output = gr.Textbox(label="Push Status")
+    dataset_url_output = gr.Textbox(label="Dataset URL") # Display the dataset URL
+
+    # Hidden component to store the filtered dataset (as JSON)
+    filtered_data_json = gr.JSON(visible=False)
+
+    # Connect the filter button
+    filter_button.click(
+        filter_dataset,
+        inputs=[dataset_name_input, split_name_input, keywords_input],
+        outputs=[filtered_data_output, filter_status_output]
+    ).then( # Use .then() to chain actions
+        lambda df: df.to_json(), # Convert DataFrame to JSON
+        inputs=[filtered_data_output],
+        outputs=[filtered_data_json] # Store in the hidden JSON component
+    )
+
+    # Connect the push button
+    push_button.click(
+        push_to_hub,
+        inputs=[filtered_data_json, dataset_name_input, split_name_input, new_dataset_repo_id_input, hf_token_input],
+        outputs=[push_status_output, dataset_url_output]
     )

 if __name__ == "__main__":
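
For a quick sanity check of the new filtering path outside the Space, filter_dataset can be driven directly. A minimal, hypothetical sketch, assuming the file is saved locally as app.py and the default inputs above (declare-lab/audio-alpaca, split train) are reachable; note the function iterates the whole streamed split, so it can be slow on large datasets:

    # Hypothetical local smoke test; not part of the commit.
    from app import filter_dataset

    df, status = filter_dataset("declare-lab/audio-alpaca", "train", "dog, cat")
    print(status)               # e.g. "Found N matching examples."
    print(df.columns.tolist())  # ['prompt', 'strategy', 'seed', 'audio'] when matches exist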
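One subtlety in the new two-step flow: the filtered table is parked in a hidden gr.JSON component between the two button handlers, so the NumPy audio arrays only survive the df.to_json() / pd.read_json() round trip as plain Python lists. A minimal sketch of that round trip, with made-up samples:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"prompt": ["a dog barks"],
                       "audio": [np.zeros(4, dtype=np.float32)]})
    restored = pd.read_json(df.to_json())  # newer pandas prefers io.StringIO around the string
    print(type(restored["audio"].iloc[0])) # <class 'list'>: the ndarray does not survive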
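That round trip is also why push_to_hub wraps dataset.cast(features) in a try/except: the datasets Audio feature casts cleanly from file paths, bytes, or audio dicts, but after JSON the column holds bare float lists. A standalone sketch of that step with placeholder rows, mirroring the app's guard rather than asserting the cast succeeds:

    import pandas as pd
    from datasets import Audio, Dataset, Features, Value

    # Placeholder rows shaped like what push_to_hub receives after the JSON round trip.
    df = pd.DataFrame({"prompt": ["a dog barks"], "strategy": ["s1"],
                       "seed": [0], "audio": [[0.0, 0.1, -0.1]]})
    dataset = Dataset.from_pandas(df)

    features = Features({"prompt": Value("string"),
                         "audio": Audio(sampling_rate=16000),
                         "strategy": Value("string"),
                         "seed": Value("int64")})
    try:
        dataset = dataset.cast(features)
        print(dataset.features["audio"])
    except Exception as e:
        print(f"cast failed: {e}")  # the app surfaces this as "An error occurred: ..."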