Bradarr committed
Commit 52420e4 · verified · 1 Parent(s): 9e2a193

Update app.py

Files changed (1)
  1. app.py +81 -54
app.py CHANGED
@@ -1,41 +1,19 @@
 import gradio as gr
 from datasets import load_dataset, Features, Value, Audio, Dataset
 from huggingface_hub import HfApi, create_repo
-import os
-
-# --- Configuration --- (Moved inside functions where needed, for Gradio)
-animal_keywords = [
-    "dog", "cat", "bird", "fish", "horse", "cow", "sheep", "pig", "chicken",
-    "duck", "goat", "lion", "tiger", "bear", "elephant", "monkey", "zebra",
-    "giraffe", "rhino", "hippo", "crocodile", "snake", "frog", "turtle",
-    "lizard", "spider", "ant", "bee", "butterfly", "wolf", "fox", "deer",
-    "rabbit", "squirrel", "mouse", "rat", "hamster", "guinea pig", "parrot",
-    "owl", "eagle", "hawk", "penguin", "dolphin", "whale", "shark", "seal",
-    "octopus", "crab", "lobster", "shrimp", "snail", "worm", "kangaroo", "koala",
-    "panda", "sloth", "hedgehog", "raccoon", "skunk", "beaver", "otter",
-    "platypus", "jaguar", "leopard", "cheetah", "puma", "ostrich", "emu",
-    "flamingo", "peacock", "swan", "goose", "turkey", "pigeon", "seagull", "antelope",
-    "bison", "buffalo", "camel", "llama", "alpaca", "donkey", "mule", "ferret",
-    "mongoose", "meerkat", "wombat", "dingo", "armadillo", "badger", "chipmunk", "porcupine"
-]
-
-
-def filter_and_push(dataset_name, split_name, keywords_text, new_dataset_repo_id, hf_token):
-    """Filters a dataset based on keywords and pushes it to the Hub."""
-
-    if not hf_token:
-        return "Error: Hugging Face token is required. Please provide it.", None
-
+import pandas as pd # Import pandas for displaying the dataset
+
+
+def filter_dataset(dataset_name, split_name, keywords_text):
+    """Filters a dataset based on keywords and returns a Pandas DataFrame."""
     try:
         # --- 1. Load the dataset in streaming mode ---
         dataset = load_dataset(dataset_name, split=split_name, streaming=True)
 
         # --- 2. Filter the dataset (streaming) ---
-        # Process keywords: split the comma-separated string, strip whitespace
         keywords = [keyword.strip().lower() for keyword in keywords_text.split(',') if keyword.strip()]
         if not keywords:
-            keywords = animal_keywords
-            # return "Error: No keywords provided. Please enter at least one keyword.", None
+            return pd.DataFrame(), "Error: No keywords provided."
 
         filtered_dataset = dataset.filter(
             lambda example: any(keyword in example["prompt"].lower() for keyword in keywords)
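The new filter_dataset keeps the old streaming approach: load_dataset(..., streaming=True) avoids downloading the full split, and the filter itself is a plain substring test over example["prompt"]. A minimal, self-contained sketch of that predicate, run against a toy in-memory dataset rather than declare-lab/audio-alpaca (the toy rows are illustrative, not from the commit):

    from datasets import Dataset

    # Toy stand-in for the streamed split; the columns mirror audio-alpaca's schema.
    toy = Dataset.from_dict({
        "prompt": ["A dog barking in the rain", "City traffic at night", "A cat meowing"],
        "strategy": ["s1", "s2", "s3"],
        "seed": [1, 2, 3],
    })

    keywords = [k.strip().lower() for k in "dog, cat".split(",") if k.strip()]
    matches = toy.filter(lambda ex: any(k in ex["prompt"].lower() for k in keywords))
    print(matches["prompt"])  # ['A dog barking in the rain', 'A cat meowing']

Note the substring semantics: "cat" would also match a prompt containing "catalog"; word-boundary matching would need a regex instead.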
@@ -43,48 +21,79 @@ def filter_and_push(dataset_name, split_name, keywords_text, new_dataset_repo_id
 
         # --- 3. Select Indices (Efficiently) ---
         matching_indices = []
+        data_for_df = [] # Store data for DataFrame
         for i, example in enumerate(filtered_dataset):
             matching_indices.append(i)
-
+            # Extract data and append. Crucially, *decode* audio here.
+            example_data = {
+                'prompt': example['prompt'],
+                'strategy': example['strategy'],
+                'seed': example['seed'],
+                'audio': example['audio']['array'] # Get the NumPy array
+            }
+            data_for_df.append(example_data)
 
         if not matching_indices:
-            return "No matching examples found with the provided keywords.", None
+            return pd.DataFrame(), "No matching examples found."
+
+        # --- 4. Create Pandas DataFrame ---
+        df = pd.DataFrame(data_for_df)
+        return df, f"Found {len(matching_indices)} matching examples."
+
+    except Exception as e:
+        return pd.DataFrame(), f"An error occurred: {e}"
+
+
+def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
+    """Pushes a Pandas DataFrame (from JSON) to the Hugging Face Hub."""
+    if not hf_token:
+        return "Error: Hugging Face token is required.", None
+
+    try:
+        # Convert JSON back to DataFrame
+        df = pd.read_json(df_json)
+
+        if df.empty:
+            return "Error: Cannot push an empty dataset", None
+
+        # Convert DataFrame to Hugging Face Dataset
+        dataset = Dataset.from_pandas(df)
 
-        # --- 4. Create the Subset Using .select() ---
+        # --- Load original (for feature definition)
         full_dataset = load_dataset(dataset_name, split=split_name, streaming=False)
-        subset_dataset = full_dataset.select(matching_indices)
+
+        if len(full_dataset) == 0:
+            return "Error: Source Dataset Appears Empty", None
 
         # --- 5. Define features (for consistent schema) ---
         features = Features({
             'prompt': Value(dtype='string', id=None),
-            'audio': Audio(sampling_rate=16000), # Keep original sampling rate, adjust if needed
+            'audio': Audio(sampling_rate=16000),
             'strategy': Value(dtype='string', id=None),
             'seed': Value(dtype='int64', id=None)
         })
 
         try:
-            subset_dataset = subset_dataset.cast(features) # Cast to ensure features match
+            dataset = dataset.cast(features)
         except Exception as e:
-            return f"An error occurred during casting please ensure that the dataset selected has the correct collumns: {e}", None
+            return f"An error occurred: {e}", None
 
-        # --- 6. Upload the Subset Dataset ---
-        api = HfApi(token=hf_token)
-
-        # Create a repository (if it doesn't exist)
+        # --- 6. Upload to the Hugging Face Hub ---
+        api = HfApi(token=hf_token)
         try:
             create_repo(new_dataset_repo_id, token=hf_token, repo_type="dataset")
             print(f"Repository '{new_dataset_repo_id}' created.")
         except Exception as e:
             if "Repo already exists" not in str(e):
                 return f"Error creating repository: {e}", None
 
-        # Upload to the Hugging Face Hub
-        subset_dataset.push_to_hub(new_dataset_repo_id)
+        dataset.push_to_hub(new_dataset_repo_id)
         dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
-        return f"Subset dataset uploaded successfully! {len(matching_indices)} Examples Found", dataset_url
+        return "Subset dataset uploaded successfully!", dataset_url
 
     except Exception as e:
-        return f"An error occurred: {e}", None
+        return f"An error occurred during push: {e}", None
 
 
 # --- Gradio Interface ---
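Between the two callbacks the data now makes a full round trip: filter_dataset returns a pandas DataFrame, a hidden component stores it as JSON, and push_to_hub rebuilds the DataFrame with pd.read_json before converting it to a Dataset. A standalone sketch of that handoff with made-up rows (wrapping the payload in StringIO is our addition, since recent pandas versions deprecate passing literal JSON strings, which the committed code still does):

    import pandas as pd
    from io import StringIO

    df = pd.DataFrame({"prompt": ["a dog barks"], "strategy": ["s1"], "seed": [42]})

    payload = df.to_json()                      # what the hidden JSON component stores
    restored = pd.read_json(StringIO(payload))  # what push_to_hub reconstructs

    print(restored.equals(df))  # True: values and dtypes survive the round trip

One caveat this surfaces: a column of NumPy audio arrays comes back from JSON as plain Python lists, which is presumably why push_to_hub re-casts the 'audio' column with the Audio feature afterward.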
@@ -92,26 +101,44 @@ with gr.Blocks() as demo:
     gr.Markdown("# Dataset Filter and Push")
 
     with gr.Row():
-        dataset_name_input = gr.Textbox(label="Source Dataset Name (e.g., declare-lab/audio-alpaca)", value="declare-lab/audio-alpaca")
-        split_name_input = gr.Textbox(label="Split Name (e.g., train)", value="train")
+        dataset_name_input = gr.Textbox(label="Source Dataset Name", value="declare-lab/audio-alpaca")
+        split_name_input = gr.Textbox(label="Split Name", value="train")
 
-    keywords_input = gr.Textbox(label="Keywords (comma-separated, e.g., dog, cat, bird)", value="dog, cat, bird")
+    keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")
 
-    with gr.Row():
-        new_dataset_repo_id_input = gr.Textbox(label="New Dataset Repo ID (e.g., your_username/your_dataset)")
-        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")
-
-    submit_button = gr.Button("Filter and Push")
+    filter_button = gr.Button("Filter Dataset")
+
+    # Display the filtered data. 'label' is important for presentation.
+    filtered_data_output = gr.Dataframe(label="Filtered Data")
+    filter_status_output = gr.Textbox(label="Filter Status")
 
     with gr.Row():
-        output_text = gr.Textbox(label="Status")
-        dataset_output_link = gr.Textbox(label="Dataset URL")
-
-    submit_button.click(
-        filter_and_push,
-        inputs=[dataset_name_input, split_name_input, keywords_input, new_dataset_repo_id_input, hf_token_input],
-        outputs=[output_text, dataset_output_link],
+        new_dataset_repo_id_input = gr.Textbox(label="New Dataset Repo ID")
+        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")
+
+    push_button = gr.Button("Push to Hub")
+    push_status_output = gr.Textbox(label="Push Status")
+    dataset_url_output = gr.Textbox(label="Dataset URL") # Display the dataset URL
+
+    # Hidden component to store the filtered dataset (as JSON)
+    filtered_data_json = gr.JSON(visible=False)
+
+    # Connect the filter button
+    filter_button.click(
+        filter_dataset,
+        inputs=[dataset_name_input, split_name_input, keywords_input],
+        outputs=[filtered_data_output, filter_status_output]
+    ).then( # Use .then() to chain actions
+        lambda df: df.to_json(), # Convert DataFrame to JSON
+        inputs=[filtered_data_output],
+        outputs=[filtered_data_json] # Store in the hidden JSON component
+    )
+
+    # Connect the push button
+    push_button.click(
+        push_to_hub,
+        inputs=[filtered_data_json, dataset_name_input, split_name_input, new_dataset_repo_id_input, hf_token_input],
+        outputs=[push_status_output, dataset_url_output]
     )
 
 if __name__ == "__main__":
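The UI wiring is the core of the change: filter_button.click() writes into the visible Dataframe, and .then() chains a second step that copies the result into the invisible gr.JSON component, which later serves as an input to push_to_hub. A stripped-down, runnable illustration of that click-then-stash pattern (component names and functions here are placeholders, not the app's):

    import gradio as gr

    with gr.Blocks() as demo:
        text_in = gr.Textbox(label="Input")
        text_out = gr.Textbox(label="Echo")
        hidden_state = gr.JSON(visible=False)  # invisible hand-off slot
        run_button = gr.Button("Run")

        run_button.click(
            lambda s: s.upper(),        # step 1: produce the visible result
            inputs=[text_in],
            outputs=[text_out],
        ).then(
            lambda s: {"value": s},     # step 2: stash it for a later callback
            inputs=[text_out],
            outputs=[hidden_state],
        )

    if __name__ == "__main__":
        demo.launch()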
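A final note on the repo-creation step: both the old and new code detect an existing repository by substring-matching the exception text ("Repo already exists"). huggingface_hub's create_repo also accepts exist_ok=True, which makes the call idempotent without parsing error strings; a hedged alternative sketch (the helper name is ours, not the app's):

    from huggingface_hub import create_repo

    def ensure_dataset_repo(repo_id: str, token: str) -> str:
        # No-op if the dataset repo already exists; still raises on real
        # failures such as bad credentials or an invalid repo id.
        return str(create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True))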