Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
from datasets import load_dataset, Features, Value, Audio, Dataset
|
3 |
from huggingface_hub import HfApi, create_repo
|
4 |
-
import pandas as pd
|
5 |
|
6 |
|
7 |
def filter_dataset(dataset_name, split_name, keywords_text):
|
@@ -24,12 +24,11 @@ def filter_dataset(dataset_name, split_name, keywords_text):
|
|
24 |
data_for_df = [] # Store data for DataFrame
|
25 |
for i, example in enumerate(filtered_dataset):
|
26 |
matching_indices.append(i)
|
27 |
-
# Extract data and append.
|
28 |
example_data = {
|
29 |
-
'prompt': example
|
30 |
-
'
|
31 |
-
'
|
32 |
-
'audio': example['audio']['array'] # Get the NumPy array
|
33 |
}
|
34 |
data_for_df.append(example_data)
|
35 |
|
@@ -47,37 +46,32 @@ def filter_dataset(dataset_name, split_name, keywords_text):
|
|
47 |
def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
|
48 |
"""Pushes a Pandas DataFrame (from JSON) to the Hugging Face Hub."""
|
49 |
if not hf_token:
|
50 |
-
return "Error: Hugging Face Token is required.",None
|
51 |
|
52 |
try:
|
53 |
# Convert JSON back to DataFrame
|
54 |
df = pd.read_json(df_json)
|
55 |
-
|
56 |
if df.empty:
|
57 |
-
return "Error: Cannot push an empty dataset",None
|
58 |
-
|
59 |
# Convert DataFrame to Hugging Face Dataset
|
60 |
dataset = Dataset.from_pandas(df)
|
61 |
|
62 |
-
# --- Load original (for feature definition)
|
63 |
-
full_dataset = load_dataset(dataset_name, split=split_name, streaming=False)
|
64 |
-
|
65 |
-
if len(full_dataset) == 0:
|
66 |
-
return "Error: Source Dataset Appears Empty",None
|
67 |
|
68 |
# --- 5. Define features (for consistent schema) ---
|
69 |
-
|
70 |
'prompt': Value(dtype='string', id=None),
|
71 |
-
'
|
72 |
-
'
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
76 |
try:
|
77 |
-
|
78 |
except Exception as e:
|
79 |
-
return f"An error occurred: {e}",None
|
80 |
-
|
81 |
|
82 |
# --- 6. Upload to the Hugging Face Hub ---
|
83 |
api = HfApi(token=hf_token)
|
@@ -94,14 +88,12 @@ def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token
|
|
94 |
|
95 |
except Exception as e:
|
96 |
return f"An error occurred during push: {e}", None
|
97 |
-
|
98 |
-
|
99 |
# --- Gradio Interface ---
|
100 |
with gr.Blocks() as demo:
|
101 |
gr.Markdown("# Dataset Filter and Push")
|
102 |
|
103 |
with gr.Row():
|
104 |
-
dataset_name_input = gr.Textbox(label="Source Dataset Name", value="
|
105 |
split_name_input = gr.Textbox(label="Split Name", value="train")
|
106 |
|
107 |
keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")
|
|
|
1 |
import gradio as gr
|
2 |
from datasets import load_dataset, Features, Value, Audio, Dataset
|
3 |
from huggingface_hub import HfApi, create_repo
|
4 |
+
import pandas as pd
|
5 |
|
6 |
|
7 |
def filter_dataset(dataset_name, split_name, keywords_text):
|
|
|
24 |
data_for_df = [] # Store data for DataFrame
|
25 |
for i, example in enumerate(filtered_dataset):
|
26 |
matching_indices.append(i)
|
27 |
+
# Extract data and append. Handle potential KeyErrors.
|
28 |
example_data = {
|
29 |
+
'prompt': example.get('prompt', None), # Use .get() for safety
|
30 |
+
'chosen': example.get('chosen', {}).get('array', None) if isinstance(example.get('chosen'), dict) else None, # Handle nested structure, check if it's a dict
|
31 |
+
'rejected': example.get('rejected', {}).get('array', None) if isinstance(example.get('rejected'), dict) else None, # Handle nested structure
|
|
|
32 |
}
|
33 |
data_for_df.append(example_data)
|
34 |
|
|
|
46 |
def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
|
47 |
"""Pushes a Pandas DataFrame (from JSON) to the Hugging Face Hub."""
|
48 |
if not hf_token:
|
49 |
+
return "Error: Hugging Face Token is required.", None
|
50 |
|
51 |
try:
|
52 |
# Convert JSON back to DataFrame
|
53 |
df = pd.read_json(df_json)
|
54 |
+
|
55 |
if df.empty:
|
56 |
+
return "Error: Cannot push an empty dataset", None
|
57 |
+
|
58 |
# Convert DataFrame to Hugging Face Dataset
|
59 |
dataset = Dataset.from_pandas(df)
|
60 |
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
# --- 5. Define features (for consistent schema) ---
|
63 |
+
features_dict = {
|
64 |
'prompt': Value(dtype='string', id=None),
|
65 |
+
'chosen': Audio(sampling_rate=16000), # Assuming 16kHz; adjust if needed
|
66 |
+
'rejected': Audio(sampling_rate=16000), # Assuming 16kHz
|
67 |
+
}
|
68 |
+
|
69 |
+
features = Features(features_dict)
|
70 |
+
|
71 |
try:
|
72 |
+
dataset = dataset.cast(features)
|
73 |
except Exception as e:
|
74 |
+
return f"An error occurred during casting: {e}", None
|
|
|
75 |
|
76 |
# --- 6. Upload to the Hugging Face Hub ---
|
77 |
api = HfApi(token=hf_token)
|
|
|
88 |
|
89 |
except Exception as e:
|
90 |
return f"An error occurred during push: {e}", None
|
|
|
|
|
91 |
# --- Gradio Interface ---
|
92 |
with gr.Blocks() as demo:
|
93 |
gr.Markdown("# Dataset Filter and Push")
|
94 |
|
95 |
with gr.Row():
|
96 |
+
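dataset_name_input = gr.Textbox(label="Source Dataset Name", value="ashraq/esc50") # Default source dataset. NOTE(review): filter_dataset reads 'prompt', 'chosen', and 'rejected' - confirm this dataset provides those columns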
|
97 |
split_name_input = gr.Textbox(label="Split Name", value="train")
|
98 |
|
99 |
keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")
|