import os

import polars as pl
import marimo

__generated_with = "0.10.15"
app = marimo.App(
    app_title="Polars & Hugging Face Data Exploration",
    css_file="../custom.css",
)


# =============================================================================
# Shared imports
# =============================================================================
@app.cell
def imports():
    # Cells only see names that another cell defines, so the modules used
    # throughout the notebook are imported here and exported as cell outputs.
    import os

    import marimo as mo
    import polars as pl
    return mo, os, pl


# =============================================================================
# Intro Cell
# =============================================================================
@app.cell
def introduction(mo):
    # Title, overview and prerequisites for the notebook.
    mo.md(
        r"""
        # Exploring a Hugging Face Dataset with Polars

        In this notebook we demonstrate how to:

        - **Lazy-load** a Hugging Face dataset (all Parquet files using a recursive globbing pattern).
        - **Preview** the loaded DataFrame with metadata.
        - **Interactively expand** the DataFrame view.
        - Explore 30 additional examples of Polars I/O functions and DataFrame manipulations—especially for handling large text data.

        **Prerequisites:**

        - Install dependencies via:
          ```bash
          pip install polars marimo
          ```
        - Make sure your Hugging Face API token is available in the `HF_TOKEN` environment variable.

        ![Hugging Face logo](https://huggingface.co/front/assets/huggingface_logo.svg)
        """
    )
    return


# =============================================================================
# Load HF_TOKEN from the environment
# =============================================================================
@app.cell
def load_token(mo, os):
    # Read the token but never render it: notebook output is routinely shared,
    # exported or screenshotted, so only report whether the variable is set.
    hf_token = os.environ.get("HF_TOKEN")
    _token_status = "set" if hf_token else "NOT set"
    mo.md(f"""
    **Hugging Face Token:** {_token_status}
    *(Ensure that HF_TOKEN is set in your environment.)*
    """)
    return (hf_token,)


# =============================================================================
# 1. Lazy-load the Dataset
# =============================================================================
@app.cell
def lazy_load_dataset(pl):
    # Use a recursive globbing pattern to load all Parquet files from all subdirectories.
    dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"

    def load_dataset():
        # scan_parquet builds a LazyFrame: nothing is downloaded until a
        # downstream cell calls .collect() on (a slice of) the query.
        # --- Alternative for local JSONL files:
        #     pl.scan_ndjson("/local/path/to/*.jsonl")
        return pl.scan_parquet(dataset_url)

    df = load_dataset()
    return dataset_url, df, load_dataset


# =============================================================================
# 2. Preview the DataFrame with Metadata
# =============================================================================
@app.cell
def preview_data(df, mo):
    # `df` is the LazyFrame from the loading cell; only the head is collected,
    # so the preview stays cheap even for a large dataset.
    preview = mo.ui.table(df.head().collect(), show_column_summaries=True)
    # The last expression is the cell output, so stack the heading and the
    # table together to make the table actually visible.
    mo.vstack(
        [
            mo.md(
                r"""
                ## Data Preview

                Below is a preview of the first few rows along with basic metadata.
                """
            ),
            preview,
        ]
    )
    return (preview,)


# =============================================================================
# 3. Expand the DataFrame for Better Visualization
# =============================================================================
@app.cell
def expand_view(mo):
    # The button's value flips to True on the first click; the cell below
    # depends on `expand_button` and re-runs when that value changes.
    expand_button = mo.ui.button(
        label="Expand Dataframe",
        value=False,
        on_click=lambda _clicked: True,
    )
    mo.vstack(
        [
            mo.md(
                r"""
                ## Expand Dataframe

                Click the button below to expand the DataFrame view.
                """
            ),
            expand_button,
        ]
    )
    return (expand_button,)


@app.cell
def expanded_table(df, expand_button, mo):
    # Collecting the whole LazyFrame downloads the full dataset, so do it only
    # after the user has explicitly asked for the expanded view.
    mo.stop(
        not expand_button.value,
        mo.md("*Click **Expand Dataframe** above to load the full table.*"),
    )
    mo.ui.table(df.collect())
    return


# =============================================================================
# 4. Column Selection Tips (as Markdown)
# =============================================================================
@app.cell
def column_selection_tips(mo):
    mo.md(
        r"""
        ## Column Selection Tips

        **Example 1: Select specific columns by name:**
        ```python
        selected_columns_df = df.select(["column1", "column2"])
        ```

        **Example 2: Select all columns except column 'a':**
        ```python
        all_except_a_df = df.select(pl.exclude("a"))
        ```

        **Example 3: Select a range of columns (e.g., from the 2nd to the 4th column):**
        ```python
        range_columns_df = df.select(pl.col(df.columns[1:4]))
        ```
        """
    )
    return


# =============================================================================
# Additional Polars I/O and DataFrame Examples (Markdown Cells)
#
# Each cell below only renders a Markdown snippet; the code inside the fenced
# blocks is illustrative and is not executed by the notebook.
# =============================================================================
@app.cell
def example_1(mo):
    mo.md(
        r"""
        ### Example 1: Eagerly Read a Single Parquet File
        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        ```
        """
    )
    return


@app.cell
def example_2(mo):
    mo.md(
        r"""
        ### Example 2: Read Multiple Parquet Files Using Globbing
        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-*.parquet")
        ```
        """
    )
    return


@app.cell
def example_3(mo):
    mo.md(
        r"""
        ### Example 3: Lazily Scan Parquet Files with Recursive Globbing
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
        ```
        """
    )
    return


@app.cell
def example_4(mo):
    mo.md(
        r"""
        ### Example 4: Read a JSON File into a DataFrame
        ```python
        df_json = pl.read_json("data/sample.json")
        ```
        """
    )
    return


@app.cell
def example_5(mo):
    mo.md(
        r"""
        ### Example 5: Read JSON with a Specified Schema
        ```python
        schema = {"name": pl.Utf8, "age": pl.Int64}
        df_json = pl.read_json("data/sample.json", schema=schema)
        ```
        """
    )
    return


@app.cell
def example_6(mo):
    mo.md(
        r"""
        ### Example 6: Write a DataFrame to NDJSON Format
        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
        ndjson_str = df.write_ndjson()
        print(ndjson_str)
        ```
        """
    )
    return


@app.cell
def example_7(mo):
    mo.md(
        r"""
        ### Example 7: Get the Schema of a Parquet File Without Reading Data
        ```python
        schema = pl.read_parquet_schema("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        print(schema)
        ```
        """
    )
    return


@app.cell
def example_8(mo):
    mo.md(
        r"""
        ### Example 8: Scan Parquet Files with Hive Partitioning Enabled
        ```python
        df = pl.scan_parquet("hf://datasets/myuser/my-dataset/data/**/*.parquet", hive_partitioning=True)
        ```
        """
    )
    return


@app.cell
def example_9(mo):
    mo.md(
        r"""
        ### Example 9: Lazily Scan NDJSON Files Using Globbing
        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl")
        ```
        """
    )
    return


@app.cell
def example_10(mo):
    mo.md(
        r"""
        ### Example 10: Write a DataFrame to Partitioned Parquet Files
        ```python
        df = pl.DataFrame({"date": ["2025-01-01", "2025-01-02"], "value": [100, 200]})
        df.write_parquet("output/", partition_by=["date"])
        ```
        """
    )
    return


@app.cell
def example_11(mo):
    mo.md(
        r"""
        ### Example 11: Read JSON with Custom Inference Length
        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=500)
        ```
        """
    )
    return


@app.cell
def example_12(mo):
    mo.md(
        r"""
        ### Example 12: Read JSON with Schema Overrides
        ```python
        schema = {"id": pl.Int64, "text": pl.Utf8}
        overrides = {"id": pl.Int32}
        df = pl.read_json("data/large_text.json", schema=schema, schema_overrides=overrides)
        ```
        """
    )
    return


@app.cell
def example_13(mo):
    mo.md(
        r"""
        ### Example 13: Write a DataFrame to NDJSON and Return as String
        ```python
        df = pl.DataFrame({"foo": [1,2,3], "bar": [4,5,6]})
        ndjson_output = df.write_ndjson()
        print(ndjson_output)
        ```
        """
    )
    return


@app.cell
def example_14(mo):
    mo.md(
        r"""
        ### Example 14: Scan Parquet Files with Cloud Storage Options
        ```python
        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", storage_options=storage_options)
        ```
        """
    )
    return


@app.cell
def example_15(mo):
    mo.md(
        r"""
        ### Example 15: Scan NDJSON Files with Cloud Storage Options
        ```python
        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_ndjson("hf://datasets/myuser/my-dataset/**/*.jsonl", storage_options=storage_options)
        ```
        """
    )
    return


@app.cell
def example_16(mo):
    mo.md(
        r"""
        ### Example 16: Predicate Pushdown Example
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only load rows where 'value' > 100
        df_filtered = df_lazy.filter(pl.col("value") > 100)
        result = df_filtered.collect()
        ```
        """
    )
    return


@app.cell
def example_17(mo):
    mo.md(
        r"""
        ### Example 17: Projection Pushdown Example
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only select the 'text' and 'id' columns to reduce memory footprint
        df_proj = df_lazy.select(["id", "text"])
        result = df_proj.collect()
        ```
        """
    )
    return


@app.cell
def example_18(mo):
    mo.md(
        r"""
        ### Example 18: Collecting a Lazy DataFrame
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Perform lazy operations...
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_19(mo):
    mo.md(
        r"""
        ### Example 19: Filtering on a Large Text Column
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Filter rows where the 'text' column contains a long string pattern
        df_filtered = df.filter(pl.col("text").str.contains("important keyword"))
        print(df_filtered.head())
        ```
        """
    )
    return


@app.cell
def example_20(mo):
    mo.md(
        r"""
        ### Example 20: Using String Length on a Text Column
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Compute the length of text in the 'text' column
        df = df.with_columns(text_length=pl.col("text").str.len_chars())
        print(df.head())
        ```
        """
    )
    return


@app.cell
def example_21(mo):
    mo.md(
        r"""
        ### Example 21: Grouping by a Large Text Field
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        grouped = df.group_by("category").agg(pl.col("text").str.len_chars().mean().alias("avg_text_length"))
        print(grouped)
        ```
        """
    )
    return


@app.cell
def example_22(mo):
    mo.md(
        r"""
        ### Example 22: Joining Two DataFrames on a Common Key
        ```python
        df1 = pl.DataFrame({"id": [1,2,3], "text": ["A", "B", "C"]})
        df2 = pl.DataFrame({"id": [1,2,3], "value": [100, 200, 300]})
        joined = df1.join(df2, on="id")
        print(joined)
        ```
        """
    )
    return


@app.cell
def example_23(mo):
    mo.md(
        r"""
        ### Example 23: Using join_asof for Time-based Joins
        ```python
        from datetime import datetime

        df1 = pl.DataFrame({
            "time": pl.datetime_range(start=datetime(2025, 1, 1), end=datetime(2025, 1, 2), interval="1h", eager=True),
            "text": ["sample text"] * 25
        })
        df2 = pl.DataFrame({
            "time": pl.datetime_range(start=datetime(2025, 1, 1, 0, 30), end=datetime(2025, 1, 2, 0, 30), interval="1h", eager=True),
            "value": list(range(25))
        })
        # Perform an asof join to match the nearest timestamp
        joined = df1.sort("time").join_asof(df2.sort("time"), on="time")
        print(joined)
        ```
        """
    )
    return


@app.cell
def example_24(mo):
    mo.md(
        r"""
        ### Example 24: Reading a Parquet File with Low Memory Option
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", low_memory=True)
        print(df.head())
        ```
        """
    )
    return


@app.cell
def example_25(mo):
    mo.md(
        r"""
        ### Example 25: Scanning Parquet Files with a Parallel Strategy
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", parallel="auto")
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_26(mo):
    mo.md(
        r"""
        ### Example 26: Reading a Large JSON File into a DataFrame
        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=200)
        print(df.head())
        ```
        """
    )
    return


@app.cell
def example_27(mo):
    mo.md(
        r"""
        ### Example 27: Using DataFrame.head() on a Large Text Dataset
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.head(10))
        ```
        """
    )
    return


@app.cell
def example_28(mo):
    mo.md(
        r"""
        ### Example 28: Using DataFrame.tail() on a Large Text Dataset
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.tail(10))
        ```
        """
    )
    return


@app.cell
def example_29(mo):
    mo.md(
        r"""
        ### Example 29: Scanning NDJSON Files with Rechunking
        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl", rechunk=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_30(mo):
    mo.md(
        r"""
        ### Example 30: Scanning Parquet Files with Allowing Missing Columns
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", allow_missing_columns=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


# =============================================================================
# End of Notebook
# =============================================================================
@app.cell
def conclusion(mo):
    mo.md(
        r"""
        # Conclusion

        This notebook showcased:

        - How to lazy-load a Hugging Face dataset using Polars with recursive globbing.
        - How to preview and interactively expand the DataFrame.
        - 30 examples covering various Polars I/O functions and DataFrame operations, which are especially useful when working with large text data.

        For more information, please refer to:

        - [Polars Documentation](https://docs.pola.rs/)
        - [Hugging Face Hub Documentation](https://huggingface.co/docs)
        - [Marimo Notebook Documentation](https://marimo.io/)

        Happy Data Exploring!
        """
    )
    return


if __name__ == "__main__":
    app.run()