lhoestq HF staff committed on
Commit
bf29377
·
1 Parent(s): 9a96811
Files changed (1) hide show
  1. app.py +75 -40
app.py CHANGED
@@ -51,6 +51,10 @@ function setDataFrameReadonly() {
51
  """
52
  text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
53
 
 
 
 
 
54
  def prepare_function(func: str, placeholder: str, column_name: str) -> str:
55
  if "(" in func:
56
  prepared_func = func.split("(")
@@ -75,63 +79,94 @@ with gr.Blocks(css=css, js=js) as demo:
75
  transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
76
  dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
77
 
78
- @demo.load(outputs=dataset_dropdown)
79
- def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
80
- api = HfApi(token=oauth_token.token if oauth_token else None)
81
- datasets = list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"]))
82
- if oauth_token and (user := api.whoami().get("user")):
83
- datasets += list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
84
- dataset = request.query_params.get("dataset") or datasets[0].id
85
- return {dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset)}
86
-
87
- @dataset_dropdown.change(inputs=dataset_dropdown, outputs=loading_codes_json)
88
- def _fetch_read_parquet_loading(dataset: str):
89
  if dataset and "/" not in dataset.strip().strip("/"):
90
  return []
91
  resp = requests.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json()
92
- return ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or []
93
-
94
- @loading_codes_json.change(inputs=loading_codes_json, outputs=[subset_dropdown, split_dropdown])
95
- def _show_subset_dropdown(loading_codes: list[dict]):
96
  subsets = [loading_code["config_name"] for loading_code in loading_codes]
97
  subset = (subsets or [""])[0]
98
- splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
99
- split = (splits or [""])[0]
100
- return gr.Dropdown(subsets, value=subset, visible=len(subsets) > 1), gr.Dropdown(splits, value=split, visible=len(splits) > 1)
101
 
102
- @subset_dropdown.change(inputs=[loading_codes_json, subset_dropdown], outputs=split_dropdown)
103
- def _show_split_dropdown(loading_codes: list[dict], subset: str):
104
  splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
105
  split = (splits or [""])[0]
106
- return gr.Dropdown(splits, value=split, visible=len(splits) > 1)
107
-
108
- @split_dropdown.change(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=input_dataframe)
109
- @lru_cache(maxsize=3)
110
- def _set_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
111
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
112
  if dataset and subset and split and pattern:
113
- df = duckdb.sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT 10").df()
114
- return gr.DataFrame(df, column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))
115
  else:
116
- return gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns))
117
-
118
- @input_dataframe.change(inputs=input_dataframe, outputs=transform_dropdowns)
119
- def _set_transforms(input_df: pd.DataFrame):
120
- new_transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in input_df.columns]
121
- new_transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
122
- return new_transform_dropdowns
123
 
124
- def _set_dataframe(input_df: pd.DataFrame, *transforms: tuple[str], column_index: int):
125
  try:
126
- print(f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_df;")
127
- # return input_df
128
- return duckdb.sql(f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_df;")
129
  except Exception as e:
130
- raise gr.Error(f"{type(e).__name__}: {e}")
 
131
 
132
  for column_index, transform_dropdown in enumerate(transform_dropdowns):
133
- transform_dropdown.change(partial(_set_dataframe, column_index=column_index), inputs=[input_dataframe] + transform_dropdowns, outputs=dataframe)
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
 
137
  if __name__ == "__main__":
 
51
  """
52
  text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
53
 
54
@lru_cache(maxsize=3)
def duckdb_sql(query: str) -> duckdb.DuckDBPyRelation:
    """Run `query` with DuckDB and return the resulting relation.

    Results are memoized per query string; the cache keeps at most the
    3 most recently used queries.
    """
    relation = duckdb.sql(query)
    return relation
57
+
58
  def prepare_function(func: str, placeholder: str, column_name: str) -> str:
59
  if "(" in func:
60
  prepared_func = func.split("(")
 
79
  transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
80
  dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
81
 
82
def show_subset_dropdown(dataset: str):
    """Fetch the Parquet loading codes of `dataset` and describe its subset dropdown.

    Args:
        dataset: Dataset id, expected to look like "namespace/name".

    Returns:
        A 2-tuple ``(dropdown_kwargs, loading_codes)``:
        * ``dropdown_kwargs`` is a dict with ``choices``, ``value``, ``visible``
          and ``key`` entries, meant to be splatted into ``gr.Dropdown``;
        * ``loading_codes`` is the list of loading-code dicts reported for the
          first library whose function is in ``READ_PARQUET_FUNCTIONS``
          (empty when there is none).

        The same 2-tuple shape is returned for a missing or malformed dataset
        id (with empty choices), because every caller unpacks two values.
    """
    loading_codes = []
    # Only hit the API when the id has a "namespace/name" shape; an empty or
    # malformed id cannot resolve to a dataset, so skip the network round-trip.
    if dataset and "/" in dataset.strip().strip("/"):
        resp = requests.get(
            "https://datasets-server.huggingface.co/compatible-libraries",
            # Let requests URL-encode the id instead of splicing user input
            # straight into the query string.
            params={"dataset": dataset},
            timeout=3,
        ).json()
        loading_codes = ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or []
    subsets = [loading_code["config_name"] for loading_code in loading_codes]
    subset = (subsets or [""])[0]
    # The key changes whenever the loading codes change.
    return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes
 
 
90
 
91
def show_split_dropdown(subset: str, loading_codes: list[dict]):
    """Describe the split dropdown for `subset`.

    Args:
        subset: Config name to look up in `loading_codes`.
        loading_codes: Loading-code dicts, each with a "config_name" and an
            "arguments" -> "splits" mapping.

    Returns:
        A dict with ``choices`` (split names of the first loading code whose
        config name equals `subset`, or an empty list), ``value`` (the first
        split name, or ""), ``visible`` (True only when there is more than one
        split) and ``key`` entries.
    """
    splits_per_match = [
        list(loading_code["arguments"]["splits"])
        for loading_code in loading_codes
        if loading_code["config_name"] == subset
    ]
    splits = splits_per_match[0] if splits_per_match else []
    split = splits[0] if splits else ""
    return {
        "choices": splits,
        "value": split,
        "visible": len(splits) > 1,
        "key": hash(str(loading_codes) + subset),
    }
95
+
96
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> list[dict]:
    """Load a preview of a dataset split and describe the widgets that show it.

    Args:
        dataset: Dataset id used to build the ``hf://datasets/...`` path.
        subset: Config name to look up in `loading_codes`.
        split: Split name to look up in the matching loading code.
        loading_codes: Loading-code dicts, each with a "config_name" and an
            "arguments" -> "splits" mapping from split name to file pattern.

    Returns:
        A list whose first item is a dict of ``gr.DataFrame`` keyword arguments
        (``value`` and ``column_widths``) and whose remaining items are dicts of
        ``gr.Dropdown`` keyword arguments: one visible entry per column,
        followed by hidden entries padding the list up to ``MAX_NUM_COLUMNS``.
        ``EMPTY_DF`` is used when no file pattern can be resolved.
    """
    # .get() so an unknown split falls through to EMPTY_DF instead of raising
    # KeyError (e.g. split == "" when the subset lists no splits).
    pattern = ([loading_code["arguments"]["splits"].get(split) for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
    if dataset and subset and split and pattern:
        # The dataset id can be typed in by the user: double single quotes so
        # the path cannot terminate the SQL string literal.
        path = f"hf://datasets/{dataset}/{pattern}".replace("'", "''")
        input_df = duckdb_sql(f"SELECT * FROM '{path}' LIMIT 10").df()
    else:
        input_df = EMPTY_DF
    new_transform_dropdowns = [dict(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in input_df.columns]
    # range() of a negative count is empty, so no padding is added when the
    # frame has more than MAX_NUM_COLUMNS columns.
    new_transform_dropdowns += [dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
    num_columns = len(input_df.columns)
    # Guard the division so a column-less frame yields no widths rather than
    # a ZeroDivisionError.
    column_widths = [f"{1/num_columns:.0%}"] * num_columns if num_columns else []
    # Use input_df here: the previous `df` name was unbound on the EMPTY_DF path.
    return [dict(value=input_df, column_widths=column_widths)] + new_transform_dropdowns
 
 
 
106
 
107
def set_dataframe(input_df: pd.DataFrame, *transforms: str, column_index: int):
    """Apply the selected column transforms to `input_df` with DuckDB.

    Args:
        input_df: Frame the SQL query selects from.
        *transforms: SQL select expressions, one per dropdown; falsy entries
            are skipped.
        column_index: Index of the dropdown that triggered the call (bound by
            the caller; not used in the query).

    Returns:
        The transformed frame, or `input_df` unchanged when the query fails.
    """
    try:
        # DuckDB resolves the `input_df` table name from the Python variable
        # of the same name, so the parameter name must not be changed.
        return duckdb.sql(f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_df;").df()
    except Exception as e:
        # gr.Error is an exception class: instantiating it without `raise`
        # shows nothing. gr.Warning displays the message when called and lets
        # us still return the untransformed frame.
        gr.Warning(f"{type(e).__name__}: {e}")
        return input_df
113
 
114
  for column_index, transform_dropdown in enumerate(transform_dropdowns):
115
+ transform_dropdown.select(partial(set_dataframe, column_index=column_index), inputs=[input_dataframe] + transform_dropdowns, outputs=dataframe)
116
 
117
@demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
    """Populate every widget when the page loads.

    Lists up to 3 trending Parquet datasets (plus up to 3 authored by the
    logged-in user, when a token is available), picks the dataset named in
    the ``dataset`` query parameter or else the first listed one, and returns
    the updates for the dataset/subset/split dropdowns, both dataframes and
    the transform dropdowns.
    """
    token = oauth_token.token if oauth_token else None
    api = HfApi(token=token)
    listing_kwargs = dict(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"])
    datasets = list(api.list_datasets(**listing_kwargs))
    if oauth_token and (user := api.whoami().get("name")):
        datasets += list(api.list_datasets(**listing_kwargs, author=user))
    dataset = request.query_params.get("dataset") or datasets[0].id

    subsets, loading_codes = show_subset_dropdown(dataset)
    splits = show_split_dropdown(subsets["value"], loading_codes)
    dataframe_kwargs, *dropdowns_kwargs = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)

    updates = {
        dataset_dropdown: gr.Dropdown(choices=[info.id for info in datasets], value=dataset),
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
        input_dataframe: gr.DataFrame(**dataframe_kwargs),
        dataframe: gr.DataFrame(**dataframe_kwargs),
    }
    for component, dropdown_kwargs in zip(transform_dropdowns, dropdowns_kwargs):
        updates[component] = gr.Dropdown(**dropdown_kwargs)
    return updates
136
+
137
@dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
def _show_subset_dropdown(dataset: str):
    """Refresh everything below the dataset dropdown after a dataset is selected.

    Returns a mapping from each output component to its update: the loading
    codes, the subset and split dropdowns, both dataframes and the transform
    dropdowns.
    """
    subsets, loading_codes = show_subset_dropdown(dataset)
    splits = show_split_dropdown(subsets["value"], loading_codes)
    dataframe_kwargs, *dropdowns_kwargs = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)

    updates = {
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
        input_dataframe: gr.DataFrame(**dataframe_kwargs),
        dataframe: gr.DataFrame(**dataframe_kwargs),
    }
    for component, dropdown_kwargs in zip(transform_dropdowns, dropdowns_kwargs):
        updates[component] = gr.Dropdown(**dropdown_kwargs)
    return updates
150
+
151
@subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
    """Refresh the split dropdown and the previews after a subset is selected.

    Returns a mapping from each output component to its update: the split
    dropdown, both dataframes and the transform dropdowns.
    """
    splits = show_split_dropdown(subset, loading_codes)
    dataframe_kwargs, *dropdowns_kwargs = show_input_dataframe(dataset, subset, splits["value"], loading_codes)

    updates = {
        split_dropdown: gr.Dropdown(**splits),
        input_dataframe: gr.DataFrame(**dataframe_kwargs),
        dataframe: gr.DataFrame(**dataframe_kwargs),
    }
    for component, dropdown_kwargs in zip(transform_dropdowns, dropdowns_kwargs):
        updates[component] = gr.Dropdown(**dropdown_kwargs)
    return updates
161
+
162
@split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[input_dataframe, dataframe] + transform_dropdowns)
def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
    """Refresh the previews after a split is selected.

    Returns a mapping from each output component to its update: both
    dataframes and the transform dropdowns.
    NOTE(review): the return annotation says pd.DataFrame, but the value
    returned is this mapping.
    """
    dataframe_kwargs, *dropdowns_kwargs = show_input_dataframe(dataset, subset, split, loading_codes)

    updates = {
        input_dataframe: gr.DataFrame(**dataframe_kwargs),
        dataframe: gr.DataFrame(**dataframe_kwargs),
    }
    for component, dropdown_kwargs in zip(transform_dropdowns, dropdowns_kwargs):
        updates[component] = gr.Dropdown(**dropdown_kwargs)
    return updates
170
 
171
 
172
  if __name__ == "__main__":