Bradarr committed on
Commit
a6596f7
·
verified ·
1 Parent(s): 68a532a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -27
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  from datasets import load_dataset, Features, Value, Audio, Dataset
3
  from huggingface_hub import HfApi, create_repo
4
- import pandas as pd # Import pandas for displaying the dataset
5
 
6
 
7
  def filter_dataset(dataset_name, split_name, keywords_text):
@@ -24,12 +24,11 @@ def filter_dataset(dataset_name, split_name, keywords_text):
24
  data_for_df = [] # Store data for DataFrame
25
  for i, example in enumerate(filtered_dataset):
26
  matching_indices.append(i)
27
- # Extract data and append. Crucially, *decode* audio here.
28
  example_data = {
29
- 'prompt': example['prompt'],
30
- 'strategy': example['strategy'],
31
- 'seed': example['seed'],
32
- 'audio': example['audio']['array'] # Get the NumPy array
33
  }
34
  data_for_df.append(example_data)
35
 
@@ -47,37 +46,32 @@ def filter_dataset(dataset_name, split_name, keywords_text):
47
  def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
48
  """Pushes a Pandas DataFrame (from JSON) to the Hugging Face Hub."""
49
  if not hf_token:
50
- return "Error: Hugging Face token is required.", None
51
 
52
  try:
53
  # Convert JSON back to DataFrame
54
  df = pd.read_json(df_json)
55
-
56
  if df.empty:
57
- return "Error: Cannot push an empty dataset",None
58
-
59
  # Convert DataFrame to Hugging Face Dataset
60
  dataset = Dataset.from_pandas(df)
61
 
62
- # --- Load original (for feature definition)
63
- full_dataset = load_dataset(dataset_name, split=split_name, streaming=False)
64
-
65
- if len(full_dataset) == 0:
66
- return "Error: Source Dataset Appears Empty",None
67
 
68
  # --- 5. Define features (for consistent schema) ---
69
- features = Features({
70
  'prompt': Value(dtype='string', id=None),
71
- 'audio': Audio(sampling_rate=16000),
72
- 'strategy': Value(dtype='string', id=None),
73
- 'seed': Value(dtype='int64', id=None)
74
- })
75
-
 
76
  try:
77
- dataset = dataset.cast(features)
78
  except Exception as e:
79
- return f"An error occurred: {e}",None
80
-
81
 
82
  # --- 6. Upload to the Hugging Face Hub ---
83
  api = HfApi(token=hf_token)
@@ -94,14 +88,12 @@ def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token
94
 
95
  except Exception as e:
96
  return f"An error occurred during push: {e}", None
97
-
98
-
99
  # --- Gradio Interface ---
100
  with gr.Blocks() as demo:
101
  gr.Markdown("# Dataset Filter and Push")
102
 
103
  with gr.Row():
104
- dataset_name_input = gr.Textbox(label="Source Dataset Name", value="declare-lab/audio-alpaca")
105
  split_name_input = gr.Textbox(label="Split Name", value="train")
106
 
107
  keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")
 
1
  import gradio as gr
2
  from datasets import load_dataset, Features, Value, Audio, Dataset
3
  from huggingface_hub import HfApi, create_repo
4
+ import pandas as pd
5
 
6
 
7
  def filter_dataset(dataset_name, split_name, keywords_text):
 
24
  data_for_df = [] # Store data for DataFrame
25
  for i, example in enumerate(filtered_dataset):
26
  matching_indices.append(i)
27
+ # Extract data and append. Handle potential KeyErrors.
28
  example_data = {
29
+ 'prompt': example.get('prompt', None), # Use .get() for safety
30
+ 'chosen': example.get('chosen', {}).get('array', None) if isinstance(example.get('chosen'), dict) else None, # Handle nested structure, check if it's a dict
31
+ 'rejected': example.get('rejected', {}).get('array', None) if isinstance(example.get('rejected'), dict) else None, # Handle nested structure
 
32
  }
33
  data_for_df.append(example_data)
34
 
 
46
  def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
47
  """Pushes a Pandas DataFrame (from JSON) to the Hugging Face Hub."""
48
  if not hf_token:
49
+ return "Error: Hugging Face Token is required.", None
50
 
51
  try:
52
  # Convert JSON back to DataFrame
53
  df = pd.read_json(df_json)
54
+
55
  if df.empty:
56
+ return "Error: Cannot push an empty dataset", None
57
+
58
  # Convert DataFrame to Hugging Face Dataset
59
  dataset = Dataset.from_pandas(df)
60
 
 
 
 
 
 
61
 
62
  # --- 5. Define features (for consistent schema) ---
63
+ features_dict = {
64
  'prompt': Value(dtype='string', id=None),
65
+ 'chosen': Audio(sampling_rate=16000), # Assuming 16kHz; adjust if needed
66
+ 'rejected': Audio(sampling_rate=16000), # Assuming 16kHz
67
+ }
68
+
69
+ features = Features(features_dict)
70
+
71
  try:
72
+ dataset = dataset.cast(features)
73
  except Exception as e:
74
+ return f"An error occurred during casting: {e}", None
 
75
 
76
  # --- 6. Upload to the Hugging Face Hub ---
77
  api = HfApi(token=hf_token)
 
88
 
89
  except Exception as e:
90
  return f"An error occurred during push: {e}", None
 
 
91
  # --- Gradio Interface ---
92
  with gr.Blocks() as demo:
93
  gr.Markdown("# Dataset Filter and Push")
94
 
95
  with gr.Row():
96
+ dataset_name_input = gr.Textbox(label="Source Dataset Name", value="ashraq/esc50") # Example with chosen/rejected
97
  split_name_input = gr.Textbox(label="Split Name", value="train")
98
 
99
  keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")