lhoestq HF staff committed on
Commit
9a96811
·
1 Parent(s): 8a9db0e
Files changed (3) hide show
  1. app.py +92 -20
  2. requirements.txt +1 -0
  3. text_functions.tsv +82 -0
app.py CHANGED
@@ -1,9 +1,14 @@
 
 
 
1
  import gradio as gr
 
2
  import requests
3
  from huggingface_hub import HfApi
4
 
5
- session = requests.Session()
6
-
 
7
  css = """
8
  @media (prefers-color-scheme: dark) {
9
  .transparent-dropdown, .transparent-dropdown .container .wrap {
@@ -15,23 +20,63 @@ css = """
15
  background: var(--bg);
16
  }
17
  }
 
 
 
 
 
 
 
 
 
 
 
 
18
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- with gr.Blocks(css=css) as demo:
21
- with gr.Row():
22
- with gr.Column(scale=4):
23
- with gr.Group():
24
- dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, interactive=True)
25
- with gr.Row():
26
- subset_dropdown = gr.Dropdown(info="Subset", show_label=False, visible=False, elem_classes="transparent-dropdown")
27
- split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False, elem_classes="transparent-dropdown")
28
- gr.LoginButton()
 
29
  loading_codes_json = gr.JSON(visible=False)
30
  dataset_subset_split_textbox = gr.Textbox(visible=False)
31
- dataframe = gr.DataFrame()
 
 
 
 
 
 
 
 
 
 
32
 
33
  @demo.load(outputs=dataset_dropdown)
34
- def fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
35
  api = HfApi(token=oauth_token.token if oauth_token else None)
36
  datasets = list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"]))
37
  if oauth_token and (user := api.whoami().get("user")):
@@ -40,14 +85,14 @@ with gr.Blocks(css=css) as demo:
40
  return {dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset)}
41
 
42
  @dataset_dropdown.change(inputs=dataset_dropdown, outputs=loading_codes_json)
43
- def fetch_read_parquet_loading(dataset: str):
44
- if "/" not in dataset.strip().strip("/"):
45
  return []
46
- resp = session.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json()
47
- return ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] == "dd.read_parquet"] or [[]])[0] or []
48
 
49
  @loading_codes_json.change(inputs=loading_codes_json, outputs=[subset_dropdown, split_dropdown])
50
- def show_subset_dropdown(loading_codes: list[dict]):
51
  subsets = [loading_code["config_name"] for loading_code in loading_codes]
52
  subset = (subsets or [""])[0]
53
  splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
@@ -55,11 +100,38 @@ with gr.Blocks(css=css) as demo:
55
  return gr.Dropdown(subsets, value=subset, visible=len(subsets) > 1), gr.Dropdown(splits, value=split, visible=len(splits) > 1)
56
 
57
  @subset_dropdown.change(inputs=[loading_codes_json, subset_dropdown], outputs=split_dropdown)
58
- def show_split_dropdown(loading_codes: list[dict], subset: str):
59
  splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
60
  split = (splits or [""])[0]
61
  return gr.Dropdown(splits, value=split, visible=len(splits) > 1)
62
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  if __name__ == "__main__":
 
1
+ from functools import partial, lru_cache
2
+
3
+ import duckdb
4
  import gradio as gr
5
+ import pandas as pd
6
  import requests
7
  from huggingface_hub import HfApi
8
 
9
+ READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
10
+ EMPTY_DF = pd.DataFrame([{str(i): "" for i in range(4)}] * 10)
11
+ MAX_NUM_COLUMNS = 20
12
  css = """
13
  @media (prefers-color-scheme: dark) {
14
  .transparent-dropdown, .transparent-dropdown .container .wrap {
 
20
  background: var(--bg);
21
  }
22
  }
23
+ input {
24
+ -webkit-user-select: none;
25
+ -moz-user-select: none;
26
+ -ms-user-select: none;
27
+ user-select: none;
28
+ }
29
+ .cell-menu-button {
30
+ z-index: -1;
31
+ }
32
+ thead {
33
+ display: none;
34
+ }
35
  """
36
+ js = """
37
+ function setDataFrameReadonly() {
38
+ MutationObserver = window.MutationObserver || window.WebKitMutationObserver;
39
+ var observer = new MutationObserver(function(mutations, observer) {
40
+ // fired when a mutation occurs
41
+ document.querySelectorAll('.readonly-dataframe div .table-wrap button svelte-virtual-table-viewport table tbody tr td .cell-wrap input').forEach(i => i.setAttribute("readonly", "true"));
42
+ });
43
+ // define what element should be observed by the observer
44
+ // and what types of mutations trigger the callback
45
+ observer.observe(document, {
46
+ subtree: true,
47
+ childList: true
48
+ });
49
+
50
+ }
51
+ """
52
+ text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
53
 
54
def prepare_function(func: str, placeholder: str, column_name: str) -> str:
    """Fill the first ``placeholder`` argument of a function template.

    ``func`` is a template such as ``"upper(string)"`` or
    ``"string || string"``. The first stand-alone occurrence of
    ``placeholder`` is replaced by ``column_name``; later occurrences are
    left as they are.

    Args:
        func: Function or operator template.
        placeholder: Identifier to substitute (e.g. ``"string"``).
        column_name: Text inserted in place of the placeholder, verbatim.

    Returns:
        The template with at most one placeholder substituted. Templates
        without a stand-alone placeholder are returned unchanged.
    """
    import re  # function-scope import keeps the block self-contained

    # Match the placeholder only as a whole identifier, so that e.g.
    # "search_string" is not mistaken for the "string" argument.
    whole_word = re.compile(rf"(?<!\w){re.escape(placeholder)}(?!\w)")

    def substitute(text: str) -> str:
        # A callable replacement inserts column_name literally, with no
        # backslash or group-reference interpretation.
        return whole_word.sub(lambda _match: column_name, text, count=1)

    head, paren, arguments = func.partition("(")
    if paren:
        # Only the argument list is rewritten; the function name before the
        # first "(" is never touched.
        return head + paren + substitute(arguments)
    return substitute(func)
62
+
63
+ with gr.Blocks(css=css, js=js) as demo:
64
  loading_codes_json = gr.JSON(visible=False)
65
  dataset_subset_split_textbox = gr.Textbox(visible=False)
66
+ input_dataframe = gr.DataFrame(visible=False)
67
+ with gr.Group():
68
+ with gr.Row():
69
+ dataset_dropdown = gr.Dropdown(label="Open Dataset", allow_custom_value=True, scale=10)
70
+ subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
71
+ split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
72
+ gr.LoginButton()
73
+ with gr.Row():
74
+ transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in EMPTY_DF.columns]
75
+ transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
76
+ dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
77
 
78
  @demo.load(outputs=dataset_dropdown)
79
+ def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
80
  api = HfApi(token=oauth_token.token if oauth_token else None)
81
  datasets = list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"]))
82
  if oauth_token and (user := api.whoami().get("user")):
 
85
  return {dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset)}
86
 
87
  @dataset_dropdown.change(inputs=dataset_dropdown, outputs=loading_codes_json)
88
def _fetch_read_parquet_loading(dataset: str):
    """Fetch the parquet loading codes for ``dataset``.

    Queries the datasets-server ``compatible-libraries`` endpoint and returns
    the ``loading_codes`` of the first library whose ``function`` is listed in
    ``READ_PARQUET_FUNCTIONS``.

    Args:
        dataset: Dataset id typed or selected in the dropdown.

    Returns:
        A list of loading-code dicts, or ``[]`` when the dataset id is empty,
        has no ``/`` separator, or no matching library is reported.
    """
    # An empty/None selection (e.g. a cleared dropdown) or an id without a
    # "/" separator must not reach the API.
    if not dataset or "/" not in dataset.strip().strip("/"):
        return []
    # Passing the id through `params` lets requests URL-encode it.
    resp = requests.get(
        "https://datasets-server.huggingface.co/compatible-libraries",
        params={"dataset": dataset},
        timeout=3,
    ).json()
    for lib in resp.get("libraries", []):
        if lib["function"] in READ_PARQUET_FUNCTIONS:
            return lib["loading_codes"] or []
    return []
93
 
94
  @loading_codes_json.change(inputs=loading_codes_json, outputs=[subset_dropdown, split_dropdown])
95
+ def _show_subset_dropdown(loading_codes: list[dict]):
96
  subsets = [loading_code["config_name"] for loading_code in loading_codes]
97
  subset = (subsets or [""])[0]
98
  splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
 
100
  return gr.Dropdown(subsets, value=subset, visible=len(subsets) > 1), gr.Dropdown(splits, value=split, visible=len(splits) > 1)
101
 
102
  @subset_dropdown.change(inputs=[loading_codes_json, subset_dropdown], outputs=split_dropdown)
103
def _show_split_dropdown(loading_codes: list[dict], subset: str):
    """Rebuild the split dropdown for the selected ``subset``.

    The split names are taken from the first loading code whose
    ``config_name`` equals ``subset``. The dropdown is only shown when there
    is more than one split to choose from.
    """
    per_config_splits = [
        list(code["arguments"]["splits"])
        for code in loading_codes
        if code["config_name"] == subset
    ]
    splits = per_config_splits[0] if per_config_splits else []
    split = splits[0] if splits else ""
    return gr.Dropdown(splits, value=split, visible=len(splits) > 1)
107
+
108
@lru_cache(maxsize=3)
def _load_preview(dataset: str, pattern: str) -> pd.DataFrame:
    """Read the first 10 rows of ``hf://datasets/<dataset>/<pattern>``.

    Cached on its two string arguments, which are hashable.
    """
    # The dataset id can be free text typed by the user: double any single
    # quote so it cannot terminate the SQL string literal.
    path = f"hf://datasets/{dataset}/{pattern}".replace("'", "''")
    return duckdb.sql(f"SELECT * FROM '{path}' LIMIT 10").df()


@split_dropdown.change(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=input_dataframe)
def _set_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> gr.DataFrame:
    """Load a 10-row preview of the selected dataset/subset/split.

    Args:
        dataset: Dataset id.
        subset: Selected ``config_name``.
        split: Selected split name.
        loading_codes: Loading codes held by ``loading_codes_json``.

    Returns:
        A ``gr.DataFrame`` update holding the preview, or ``EMPTY_DF`` when
        the selection is incomplete or has no file pattern for ``split``.
    """
    # `lru_cache` cannot wrap this handler directly: `loading_codes` is a
    # list of dicts, which is unhashable. Caching is done in `_load_preview`.
    # `.get(split)` tolerates a split name that is absent from the subset.
    pattern = next(
        (
            loading_code["arguments"]["splits"].get(split)
            for loading_code in loading_codes or []
            if loading_code["config_name"] == subset
        ),
        None,
    )
    if dataset and subset and split and pattern:
        df = _load_preview(dataset, pattern)
    else:
        df = EMPTY_DF
    return gr.DataFrame(df, column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))
117
+
118
@input_dataframe.change(inputs=input_dataframe, outputs=transform_dropdowns)
def _set_transforms(input_df: pd.DataFrame):
    """Rebuild the per-column transform dropdowns for ``input_df``.

    One visible dropdown is produced per column, offering the column itself
    plus every text function template that has a ``string`` argument. The
    remaining slots are returned hidden.

    Args:
        input_df: The preview dataframe whose columns drive the dropdowns.

    Returns:
        Exactly ``len(transform_dropdowns)`` dropdown updates.
    """
    num_slots = len(transform_dropdowns)
    # The number of returned updates must equal the number of output
    # components, so columns beyond the available slots are dropped.
    column_names = list(input_df.columns)[:num_slots]
    # Hoisted: the set of applicable templates is the same for every column.
    templates = [text_func for text_func in text_functions_df.Name if "string" in text_func]
    new_transform_dropdowns = [
        gr.Dropdown(
            choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in templates],
            value=column_name,
            container=False,
            interactive=True,
            allow_custom_value=True,
            visible=True,
        )
        for column_name in column_names
    ]
    new_transform_dropdowns += [
        gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False)
        for _ in range(num_slots - len(new_transform_dropdowns))
    ]
    return new_transform_dropdowns
123
+
124
def _set_dataframe(input_df: pd.DataFrame, *transforms: str, column_index: int):
    """Apply the selected column transforms to ``input_df``.

    Args:
        input_df: Preview dataframe. The SQL text refers to it by this exact
            name, so the parameter must not be renamed.
        *transforms: One SQL expression per dropdown; empty/None entries are
            skipped.
        column_index: Index of the dropdown that triggered the update
            (bound via ``partial``; not used by the query itself).

    Returns:
        A pandas DataFrame with one column per non-empty transform, or
        ``input_df`` unchanged when no transform is set.

    Raises:
        gr.Error: If DuckDB rejects or fails to execute the query.
    """
    expressions = [transform for transform in transforms if transform]
    if not expressions:
        # Nothing selected: an empty SELECT list would be invalid SQL.
        return input_df
    try:
        # `.df()` materialises the result here, so execution errors are
        # raised inside the try block and reported through gr.Error.
        return duckdb.sql(f"SELECT {', '.join(expressions)} FROM input_df;").df()
    except Exception as e:
        raise gr.Error(f"{type(e).__name__}: {e}") from e
131
+
132
+ for column_index, transform_dropdown in enumerate(transform_dropdowns):
133
+ transform_dropdown.change(partial(_set_dataframe, column_index=column_index), inputs=[input_dataframe] + transform_dropdowns, outputs=dataframe)
134
+
135
 
136
 
137
  if __name__ == "__main__":
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ duckdb
text_functions.tsv ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Name Description
2
+ string ^@ search_string Return true if string begins with search_string.
3
+ string || string Concatenate two strings. Any NULL input results in NULL. See also concat(string, ...).
4
+ string[index] Extract a single character using a (1-based) index.
5
+ string[begin:end] Extract a string using slice conventions, see slicing.
6
+ string LIKE target Returns true if the string matches the like specifier (see Pattern Matching).
7
+ string SIMILAR TO regex Returns true if the string matches the regex; identical to regexp_full_match (see Pattern Matching).
8
+ array_extract(list, index) Extract a single character using a (1-based) index.
9
+ array_slice(list, begin, end) Extract a string using slice conventions. Negative values are accepted.
10
+ ascii(string) Returns an integer that represents the Unicode code point of the first character of the string.
11
+ bar(x, min, max[, width]) Draw a band whose width is proportional to (x - min) and equal to width characters when x = max. width defaults to 80.
12
+ bit_length(string) Number of bits in a string.
13
+ chr(x) Returns the character corresponding to the given ASCII code value or Unicode code point.
14
+ concat_ws(separator, string, ...) Concatenate many strings, separated by separator. NULL inputs are skipped.
15
+ concat(string, ...) Concatenate many strings. NULL inputs are skipped. See also string || string.
16
+ contains(string, search_string) Return true if search_string is found within string.
17
+ ends_with(string, search_string) Return true if string ends with search_string.
18
+ format_bytes(bytes) Converts bytes to a human-readable representation using units based on powers of 2 (KiB, MiB, GiB, etc.).
19
+ format(format, parameters, ...) Formats a string using the fmt syntax.
20
+ from_base64(string) Convert a base64 encoded string to a character string.
21
+ greatest(x1, x2, ...) Selects the largest value using lexicographical ordering. Note that lowercase characters are considered “larger” than uppercase characters and collations are not supported.
22
+ hash(value) Returns a UBIGINT with the hash of the value.
23
+ ilike_escape(string, like_specifier, escape_character) Returns true if the string matches the like_specifier (see Pattern Matching) using case-insensitive matching. escape_character is used to search for wildcard characters in the string.
24
+ instr(string, search_string) Return location of first occurrence of search_string in string, counting from 1. Returns 0 if no match found.
25
+ least(x1, x2, ...) Selects the smallest value using lexicographical ordering. Note that uppercase characters are considered “smaller” than lowercase characters, and collations are not supported.
26
+ left_grapheme(string, count) Extract the left-most grapheme clusters.
27
+ left(string, count) Extract the left-most count characters.
28
+ length_grapheme(string) Number of grapheme clusters in string.
29
+ length(string) Number of characters in string.
30
+ like_escape(string, like_specifier, escape_character) Returns true if the string matches the like_specifier (see Pattern Matching) using case-sensitive matching. escape_character is used to search for wildcard characters in the string.
31
+ lower(string) Convert string to lower case.
32
+ lpad(string, count, character) Pads the string with the character from the left until it has count characters.
33
+ ltrim(string, characters) Removes any occurrences of any of the characters from the left side of the string.
34
+ ltrim(string) Removes any spaces from the left side of the string.
35
+ md5(string) Returns the MD5 hash of the string as a VARCHAR.
36
+ md5_number(string) Returns the MD5 hash of the string as a HUGEINT.
37
+ md5_number_lower(string) Returns the lower 64-bit segment of the MD5 hash of the string as a BIGINT.
38
+ md5_number_higher(string) Returns the higher 64-bit segment of the MD5 hash of the string as a BIGINT.
39
+ nfc_normalize(string) Convert string to Unicode NFC normalized string. Useful for comparisons and ordering if text data is mixed between NFC normalized and not.
40
+ not_ilike_escape(string, like_specifier, escape_character) Returns false if the string matches the like_specifier (see Pattern Matching) using case-insensitive matching. escape_character is used to search for wildcard characters in the string.
41
+ not_like_escape(string, like_specifier, escape_character) Returns false if the string matches the like_specifier (see Pattern Matching) using case-sensitive matching. escape_character is used to search for wildcard characters in the string.
42
+ ord(string) Return ASCII character code of the leftmost character in a string.
43
+ parse_dirname(path, separator) Returns the top-level directory name from the given path. separator options: system, both_slash (default), forward_slash, backslash.
44
+ parse_dirpath(path, separator) Returns the head of the path (the pathname until the last slash) similarly to Python's os.path.dirname function. separator options: system, both_slash (default), forward_slash, backslash.
45
+ parse_filename(path, trim_extension, separator) Returns the last component of the path similarly to Python's os.path.basename function. If trim_extension is true, the file extension will be removed (defaults to false). separator options: system, both_slash (default), forward_slash, backslash.
46
+ parse_path(path, separator) Returns a list of the components (directories and filename) in the path similarly to Python's pathlib.parts function. separator options: system, both_slash (default), forward_slash, backslash.
47
+ position(search_string IN string) Return location of first occurrence of search_string in string, counting from 1. Returns 0 if no match found.
48
+ printf(format, parameters...) Formats a string using printf syntax.
49
+ read_text(source) Returns the content from source (a filename, a list of filenames, or a glob pattern) as a VARCHAR. The file content is first validated to be valid UTF-8. If read_text attempts to read a file with invalid UTF-8 an error is thrown suggesting to use read_blob instead. See the read_text guide for more details.
50
+ regexp_escape(string) Escapes special patterns to turn string into a regular expression similarly to Python's re.escape function.
51
+ regexp_extract(string, pattern[, group = 0]) If string contains the regexp pattern, returns the capturing group specified by optional parameter group (see Pattern Matching).
52
+ regexp_extract(string, pattern, name_list) If string contains the regexp pattern, returns the capturing groups as a struct with corresponding names from name_list (see Pattern Matching).
53
+ regexp_extract_all(string, regex[, group = 0]) Split the string along the regex and extract all occurrences of group.
54
+ regexp_full_match(string, regex) Returns true if the entire string matches the regex (see Pattern Matching).
55
+ regexp_matches(string, pattern) Returns true if string contains the regexp pattern, false otherwise (see Pattern Matching).
56
+ regexp_replace(string, pattern, replacement) If string contains the regexp pattern, replaces the matching part with replacement (see Pattern Matching).
57
+ regexp_split_to_array(string, regex) Splits the string along the regex.
58
+ regexp_split_to_table(string, regex) Splits the string along the regex and returns a row for each part.
59
+ repeat(string, count) Repeats the string count number of times.
60
+ replace(string, source, target) Replaces any occurrences of the source with target in string.
61
+ reverse(string) Reverses the string.
62
+ right_grapheme(string, count) Extract the right-most count grapheme clusters.
63
+ right(string, count) Extract the right-most count characters.
64
+ rpad(string, count, character) Pads the string with the character from the right until it has count characters.
65
+ rtrim(string, characters) Removes any occurrences of any of the characters from the right side of the string.
66
+ rtrim(string) Removes any spaces from the right side of the string.
67
+ sha256(value) Returns a VARCHAR with the SHA-256 hash of the value.
68
+ split_part(string, separator, index) Split the string along the separator and return the data at the (1-based) index of the list. If the index is outside the bounds of the list, return an empty string (to match PostgreSQL's behavior).
69
+ starts_with(string, search_string) Return true if string begins with search_string.
70
+ str_split_regex(string, regex) Splits the string along the regex.
71
+ string_split_regex(string, regex) Splits the string along the regex.
72
+ string_split(string, separator) Splits the string along the separator.
73
+ strip_accents(string) Strips accents from string.
74
+ strlen(string) Number of bytes in string.
75
+ strpos(string, search_string) Return location of first occurrence of search_string in string, counting from 1. Returns 0 if no match found.
76
+ substring(string, start, length) Extract substring of length characters starting from character start. Note that a start value of 1 refers to the first character of the string.
77
+ substring_grapheme(string, start, length) Extract substring of length grapheme clusters starting from character start. Note that a start value of 1 refers to the first character of the string.
78
+ to_base64(blob) Convert a blob to a base64 encoded string.
79
+ trim(string, characters) Removes any occurrences of any of the characters from either side of the string.
80
+ trim(string) Removes any spaces from either side of the string.
81
+ unicode(string) Returns the Unicode code of the first character of the string.
82
+ upper(string) Convert string to upper case.