Merge pull request #1 from rerun-io/jleibs/hugging_face_deploy
Files changed:
- .gitignore +3 -0
- Dockerfile +29 -0
- README.md +32 -4
- app.py +90 -0
- dataset_conversion.py +58 -0
- lychee.toml +1 -0
- main.py +7 -54
- requirements.txt +2 -0
.gitignore
CHANGED
@@ -15,3 +15,6 @@ target_wasm
 # Pixi environment
 .pixi
 .ruff_cache
+
+tmp/**
+venv/**
Dockerfile
ADDED
@@ -0,0 +1,29 @@
+# This Dockerfile is used for creating the Hugging Face docker space
+# See: https://huggingface.co/docs/hub/en/spaces-sdks-docker
+
+FROM python:3.11.8
+
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+
+# Switch to the "user" user
+USER user
+
+# Set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+
+# Try and run pip command after setting the user with `USER user` to avoid permission issues with Python
+RUN pip install --no-cache-dir --upgrade pip
+
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+
+# Install requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+# Start the FastAPI app on port 7860, the default port expected by Spaces
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
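The Dockerfile follows the standard recipe for Hugging Face Docker Spaces: run as a non-root `user` (UID 1000), install dependencies into that user's home, and serve uvicorn on port 7860. A minimal sketch of a local smoke test, assuming the image has already been built and started (the `preview_dataset` tag is hypothetical):

```python
# Hypothetical smoke test; assumes the container is already running locally, e.g.:
#   docker build -t preview_dataset . && docker run -p 7860:7860 preview_dataset
import urllib.request

# The Gradio UI is mounted at "/", so the root should answer with HTTP 200:
with urllib.request.urlopen("http://localhost:7860/") as resp:
    assert resp.status == 200
```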
README.md
CHANGED
@@ -1,5 +1,16 @@
-# Rerun visualization of Hugging Face datasets
-Visualize Hugging Face datasets using [Rerun](https://www.rerun.io/).
+---
+title: Preview Dataset
+emoji: 👀
+colorFrom: yellow
+colorTo: yellow
+sdk: docker
+app_port: 7860
+pinned: false
+license: mit
+---
+
+# Rerun visualization of Hugging Face datasets
+Visualize Hugging Face datasets using [Rerun](https://www.rerun.io/).
 
 Originally built for the LeRobot datasets:
 
@@ -8,8 +19,9 @@ Originally built for the LeRobot datasets:
 
 https://github.com/rerun-io/python-example-lerobot/assets/1148717/19e9983c-531f-4c48-9b37-37c5cbe1e0bd
 
+Deployed live on Hugging Face: https://huggingface.co/spaces/rerun/preview_dataset
 
-## Getting started
+## Getting started (native)
 Requires Python 3.10 or higher.
 
 ```sh
@@ -17,7 +29,12 @@ pip install -r requirements.txt
 python main.py --dataset lerobot/aloha_sim_insertion_human
 ```
 
-Example datasets to explore:
+## Getting started (gradio)
+```sh
+pip install -r requirements.txt
+uvicorn app:app --reload
+```
+## Example datasets to explore:
 * `lerobot/aloha_sim_insertion_human`
 * `lerobot/aloha_sim_insertion_scripted`
 * `lerobot/aloha_sim_transfer_cube_human`
@@ -27,5 +44,16 @@ Example datasets to explore:
 * `nateraw/kitti`
 * `sayakpaul/nyu_depth_v2`
 
+## Deploying to Hugging Face
+
+Hugging Face space runs off of the head `main` branch pushed to: https://huggingface.co/spaces/rerun/preview_dataset/tree/main
+
+To update this from the rerun repository, add the Hugging Face repository as an additional remote,
+and then push to it.
+```sh
+git remote add huggingface git@hf.co:spaces/rerun/preview_dataset
+git push huggingface main
+```
+
 ## Note for the maintainer
 You can update this repository with the latest changes from https://github.com/rerun-io/rerun_template by running `scripts/template_update.py update --languages python`.
app.py
ADDED
@@ -0,0 +1,90 @@
+"""
+A Gradio app that uses Rerun to visualize a Hugging Face dataset.
+
+This app mounts the Gradio app inside of FastAPI in order to set the CORS headers.
+
+Run this from the terminal as you would normally start a FastAPI app: `uvicorn app:app`
+and navigate to http://localhost:8000 in your browser.
+"""
+
+from __future__ import annotations
+
+import urllib
+from pathlib import Path
+
+import gradio as gr
+import rerun as rr
+from datasets import load_dataset
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from gradio_huggingfacehub_search import HuggingfaceHubSearch
+
+from dataset_conversion import log_dataset_to_rerun
+
+CUSTOM_PATH = "/"
+
+app = FastAPI()
+
+origins = [
+    "https://app.rerun.io",
+]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+)
+
+
+def html_template(rrd: str, app_url: str = "https://app.rerun.io") -> str:
+    encoded_url = urllib.parse.quote(rrd)
+    return f"""<div style="width:100%; height:70vh;"><iframe style="width:100%; height:100%;" src="{app_url}?url={encoded_url}" frameborder="0" allowfullscreen=""></iframe></div>"""
+
+
+def show_dataset(dataset_id: str, episode_index: int) -> str:
+    rr.init("dataset")
+
+    # TODO(jleibs): manage cache better and put in proper storage
+    filename = Path(f"tmp/{dataset_id}_{episode_index}.rrd")
+    if not filename.exists():
+        filename.parent.mkdir(parents=True, exist_ok=True)
+
+        rr.save(filename.as_posix())
+
+        dataset = load_dataset(dataset_id, split="train", streaming=True)
+
+        # This is for LeRobot datasets (https://huggingface.co/lerobot):
+        ds_subset = dataset.filter(
+            lambda frame: "episode_index" not in frame or frame["episode_index"] == episode_index
+        )
+
+        log_dataset_to_rerun(ds_subset)
+
+    return filename.as_posix()
+
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        search_in = HuggingfaceHubSearch(
+            "lerobot/pusht",
+            label="Search Huggingface Hub",
+            placeholder="Search for models on Huggingface",
+            search_type="dataset",
+        )
+        episode_index = gr.Number(1, label="Episode Index")
+        button = gr.Button("Show Dataset")
+    with gr.Row():
+        rrd = gr.File()
+    with gr.Row():
+        viewer = gr.HTML()
+
+    button.click(show_dataset, inputs=[search_in, episode_index], outputs=rrd)
+    rrd.change(
+        html_template,
+        js="""(rrd) => { console.log(rrd.url); return rrd.url}""",
+        inputs=[rrd],
+        outputs=viewer,
+        preprocess=False,
+    )
+
+
+app = gr.mount_gradio_app(app, demo, path=CUSTOM_PATH)
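Mounting the Gradio `Blocks` inside FastAPI exists purely so the CORS middleware can run: the hosted viewer at app.rerun.io fetches the generated `.rrd` file cross-origin, which only works if responses carry an `Access-Control-Allow-Origin` header. A minimal sketch of how to check that behavior with FastAPI's test client (the `/health` route is hypothetical, added only to have something to request):

```python
# Minimal sketch: verify CORSMiddleware emits the header app.rerun.io needs.
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.testclient import TestClient

app = FastAPI()
app.add_middleware(CORSMiddleware, allow_origins=["https://app.rerun.io"])


@app.get("/health")  # hypothetical route, just something to request
def health() -> dict[str, str]:
    return {"status": "ok"}


client = TestClient(app)
resp = client.get("/health", headers={"Origin": "https://app.rerun.io"})
# For an allowed origin, the middleware reflects it back on simple requests:
assert resp.headers["access-control-allow-origin"] == "https://app.rerun.io"
```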
dataset_conversion.py
ADDED
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import numpy as np
+import rerun as rr
+from PIL import Image
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+
+def to_rerun(column_name: str, value: Any) -> Any:
+    """Do our best to interpret the value and convert it to a Rerun-compatible archetype."""
+    if isinstance(value, Image.Image):
+        if "depth" in column_name:
+            return rr.DepthImage(value)
+        else:
+            return rr.Image(value)
+    elif isinstance(value, np.ndarray):
+        return rr.Tensor(value)
+    elif isinstance(value, list):
+        if isinstance(value[0], float):
+            return rr.BarChart(value)
+        else:
+            return rr.TextDocument(str(value))  # Fallback to text
+    elif isinstance(value, float) or isinstance(value, int):
+        return rr.Scalar(value)
+    else:
+        return rr.TextDocument(str(value))  # Fallback to text
+
+
+def log_dataset_to_rerun(dataset: Any) -> None:
+    # Special time-like columns for LeRobot datasets (https://huggingface.co/datasets/lerobot/):
+    TIME_LIKE = {"index", "frame_id", "timestamp"}
+
+    # Ignore these columns (again, LeRobot-specific):
+    IGNORE = {"episode_data_index_from", "episode_data_index_to", "episode_id"}
+
+    for row in tqdm(dataset):
+        # Handle time-like columns first, since they set a state (time is an index in Rerun):
+        for column_name in TIME_LIKE:
+            if column_name in row:
+                cell = row[column_name]
+                if isinstance(cell, int):
+                    rr.set_time_sequence(column_name, cell)
+                elif isinstance(cell, float):
+                    rr.set_time_seconds(column_name, cell)  # assume seconds
+                else:
+                    print(f"Unknown time-like column {column_name} with value {cell}")
+
+        # Now log actual data columns:
+        for column_name, cell in row.items():
+            if column_name in TIME_LIKE or column_name in IGNORE:
+                continue
+
+            rr.log(column_name, to_rerun(column_name, cell))
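`to_rerun` is a best-effort mapping from dataset cell types to Rerun archetypes: PIL images become `rr.Image` (or `rr.DepthImage` when the column name contains "depth"), numpy arrays become `rr.Tensor`, lists of floats become `rr.BarChart`, plain numbers become `rr.Scalar`, and anything else falls back to `rr.TextDocument`. A quick illustration, assuming the module above is importable; the column names here are made up:

```python
import numpy as np
from PIL import Image

from dataset_conversion import to_rerun

# Hypothetical column names, one per branch of to_rerun:
print(type(to_rerun("observation.image", Image.new("RGB", (64, 64)))))  # rr.Image
print(type(to_rerun("state", np.zeros(7))))                             # rr.Tensor
print(type(to_rerun("action", [0.1, 0.2, 0.3])))                        # rr.BarChart
print(type(to_rerun("reward", 0.5)))                                    # rr.Scalar
print(type(to_rerun("task", "push the block")))                         # rr.TextDocument
```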
lychee.toml
CHANGED
@@ -93,6 +93,7 @@ exclude = [
 'https://stackoverflow.com/.', # Stackoverflow links are no longer accessible from CI.
 'https://www.tensorflow.org/', # tensorflow.org apparently blocks CI.
 'https://9p.io/sys/doc/lexnames.html', # Works locally but on ci we get: `Failed: Network error: error:0A000152:SSL routines:final_renegotiate:unsafe legacy renegotiation disabled:ssl/statem/extensions.c:946:`
+'https://huggingface.co/.*', # huggingface.co apparently blocks CI and returns 401.
 
 # Need GitHub login.
 'https://github.com/rerun-io/landing',
main.py
CHANGED
@@ -4,79 +4,32 @@ from __future__ import annotations
 
 import argparse
 import logging
-from typing import Any
 
-import numpy as np
 import rerun as rr
 from datasets import load_dataset
-from PIL import Image
-from tqdm import tqdm
 
-
-
-
-def to_rerun(column_name: str, value: Any) -> Any:
-    """Do our best to interpret the value and convert it to a Rerun-compatible archetype."""
-    if isinstance(value, Image.Image):
-        if "depth" in column_name:
-            return rr.DepthImage(value)
-        else:
-            return rr.Image(value)
-    elif isinstance(value, np.ndarray):
-        return rr.Tensor(value)
-    elif isinstance(value, list):
-        if isinstance(value[0], float):
-            return rr.BarChart(value)
-        else:
-            return rr.TextDocument(str(value))  # Fallback to text
-    elif isinstance(value, float) or isinstance(value, int):
-        return rr.Scalar(value)
-    else:
-        return rr.TextDocument(str(value))  # Fallback to text
-
-
-def log_dataset_to_rerun(dataset) -> None:
-    # Special time-like columns for LeRobot datasets (https://huggingface.co/datasets/lerobot/):
-    TIME_LIKE = {"index", "frame_id", "timestamp"}
+from dataset_conversion import log_dataset_to_rerun
 
-
-    IGNORE = {"episode_data_index_from", "episode_data_index_to", "episode_id"}
-
-    for row in tqdm(dataset):
-        # Handle time-like columns first, since they set a state (time is an index in Rerun):
-        for column_name in TIME_LIKE:
-            if column_name in row:
-                cell = row[column_name]
-                if isinstance(cell, int):
-                    rr.set_time_sequence(column_name, cell)
-                elif isinstance(cell, float):
-                    rr.set_time_seconds(column_name, cell)  # assume seconds
-                else:
-                    print(f"Unknown time-like column {column_name} with value {cell}")
-
-        # Now log actual data columns:
-        for column_name, cell in row.items():
-            if column_name in TIME_LIKE or column_name in IGNORE:
-                continue
-
-            rr.log(column_name, to_rerun(column_name, cell))
+logger = logging.getLogger(__name__)
 
 
-def main():
+def main() -> None:
     # Ensure the logging gets written to stderr:
     logging.getLogger().addHandler(logging.StreamHandler())
    logging.getLogger().setLevel(logging.INFO)
 
     parser = argparse.ArgumentParser(description="Log a HuggingFace dataset to Rerun.")
     parser.add_argument("--dataset", default="lerobot/pusht", help="The name of the dataset to load")
-    parser.add_argument("--episode-
+    parser.add_argument("--episode-index", default=1, help="Which episode to select")
     args = parser.parse_args()
 
     print("Loading dataset…")
     dataset = load_dataset(args.dataset, split="train", streaming=True)
 
     # This is for LeRobot datasets (https://huggingface.co/lerobot):
-    ds_subset = dataset.filter(
+    ds_subset = dataset.filter(
+        lambda frame: "episode_index" not in frame or frame["episode_index"] == args.episode_index
+    )
 
     print("Starting Rerun…")
     rr.init(f"rerun_example_huggingface {args.dataset}", spawn=True)
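After the refactor, `main.py` keeps only the CLI plumbing and delegates all logging to `dataset_conversion.log_dataset_to_rerun`. The `streaming=True` plus `.filter(...)` combination is the key pattern: `load_dataset` returns an `IterableDataset`, so the episode filter is applied lazily per row instead of after downloading the whole dataset. A standalone sketch of that pattern follows; the dataset name and episode index are just examples, and note that argparse would pass `--episode-index` through as a string unless `type=int` is specified:

```python
from datasets import load_dataset

# streaming=True yields an IterableDataset: rows are fetched lazily.
dataset = load_dataset("lerobot/pusht", split="train", streaming=True)

episode_index = 1  # example value; main.py reads this from --episode-index
ds_subset = dataset.filter(
    lambda frame: "episode_index" not in frame or frame["episode_index"] == episode_index
)

# take(3) previews a few matching rows without materializing the dataset:
for row in ds_subset.take(3):
    print(sorted(row.keys()))
```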
requirements.txt
CHANGED
@@ -1,5 +1,7 @@
 datasets
 h5py
+gradio==4.27.0
+gradio_huggingfacehub_search
 pillow
 rerun-sdk>=0.15.0,<0.16.0
 tqdm