jleibs committed on
Commit
a235944
·
1 Parent(s): 44c6d8a

Break out the dataset_conversion pieces

Browse files
Files changed (2) hide show
  1. dataset_conversion.py +57 -0
  2. main.py +3 -52
dataset_conversion.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Any
3
+
4
+ import numpy as np
5
+ import rerun as rr
6
+ from datasets import load_dataset
7
+ from PIL import Image
8
+ from tqdm import tqdm
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def to_rerun(column_name: str, value: Any) -> Any:
    """Do our best to interpret the value and convert it to a Rerun-compatible archetype.

    Args:
        column_name: Name of the column the value came from. It is only used to
            choose ``rr.DepthImage`` over ``rr.Image`` when it contains "depth".
        value: The cell value to convert.

    Returns:
        ``rr.DepthImage`` / ``rr.Image`` for PIL images, ``rr.Tensor`` for numpy
        arrays, ``rr.BarChart`` for lists whose first element is a float,
        ``rr.Scalar`` for a float or int, and ``rr.TextDocument(str(value))``
        for everything else (including empty lists).
    """
    if isinstance(value, Image.Image):
        if "depth" in column_name:
            return rr.DepthImage(value)
        return rr.Image(value)

    if isinstance(value, np.ndarray):
        return rr.Tensor(value)

    if isinstance(value, list):
        # Only the first element is inspected. The truthiness check guards the
        # empty list, which would otherwise raise IndexError on value[0].
        if value and isinstance(value[0], float):
            return rr.BarChart(value)
        return rr.TextDocument(str(value))  # Fallback to text

    if isinstance(value, (float, int)):
        return rr.Scalar(value)

    return rr.TextDocument(str(value))  # Fallback to text
31
+
32
+
33
def log_dataset_to_rerun(dataset: Any) -> None:
    """Log every row of ``dataset`` to Rerun, one entity path per column.

    For each row, the time-like columns are applied first via
    ``rr.set_time_sequence`` (int cells) or ``rr.set_time_seconds`` (float
    cells). Every remaining column that is not ignored is then converted with
    ``to_rerun`` and logged under its column name.

    Args:
        dataset: An iterable of rows. Each row must support ``in`` membership
            tests, ``row[name]`` lookup, and ``.items()``.
    """
    # Special time-like columns for LeRobot datasets (https://huggingface.co/datasets/lerobot/):
    TIME_LIKE = {"index", "frame_id", "timestamp"}

    # Ignore these columns (again, LeRobot-specific):
    IGNORE = {"episode_data_index_from", "episode_data_index_to", "episode_id"}

    # Columns that are never logged as data; built once rather than per cell.
    not_logged = TIME_LIKE | IGNORE

    for row in tqdm(dataset):
        # Handle time-like columns first, since they set a state (time is an index in Rerun):
        for column_name in TIME_LIKE:
            if column_name not in row:
                continue

            cell = row[column_name]
            if isinstance(cell, int):
                rr.set_time_sequence(column_name, cell)
            elif isinstance(cell, float):
                rr.set_time_seconds(column_name, cell)  # assume seconds
            else:
                # Use the module logger instead of print so the message follows
                # the application's logging configuration.
                logger.warning("Unknown time-like column %s with value %s", column_name, cell)

        # Now log actual data columns:
        for column_name, cell in row.items():
            if column_name in not_logged:
                continue

            rr.log(column_name, to_rerun(column_name, cell))
main.py CHANGED
@@ -4,65 +4,16 @@ from __future__ import annotations
4
 
5
  import argparse
6
  import logging
7
- from typing import Any
8
 
9
- import numpy as np
10
  import rerun as rr
11
  from datasets import load_dataset
12
- from PIL import Image
13
- from tqdm import tqdm
14
 
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
- def to_rerun(column_name: str, value: Any) -> Any:
19
- """Do our best to interpret the value and convert it to a Rerun-compatible archetype."""
20
- if isinstance(value, Image.Image):
21
- if "depth" in column_name:
22
- return rr.DepthImage(value)
23
- else:
24
- return rr.Image(value)
25
- elif isinstance(value, np.ndarray):
26
- return rr.Tensor(value)
27
- elif isinstance(value, list):
28
- if isinstance(value[0], float):
29
- return rr.BarChart(value)
30
- else:
31
- return rr.TextDocument(str(value)) # Fallback to text
32
- elif isinstance(value, float) or isinstance(value, int):
33
- return rr.Scalar(value)
34
- else:
35
- return rr.TextDocument(str(value)) # Fallback to text
36
-
37
-
38
- def log_dataset_to_rerun(dataset) -> None:
39
- # Special time-like columns for LeRobot datasets (https://huggingface.co/datasets/lerobot/):
40
- TIME_LIKE = {"index", "frame_id", "timestamp"}
41
 
42
- # Ignore these columns (again, LeRobot-specific):
43
- IGNORE = {"episode_data_index_from", "episode_data_index_to", "episode_id"}
44
-
45
- for row in tqdm(dataset):
46
- # Handle time-like columns first, since they set a state (time is an index in Rerun):
47
- for column_name in TIME_LIKE:
48
- if column_name in row:
49
- cell = row[column_name]
50
- if isinstance(cell, int):
51
- rr.set_time_sequence(column_name, cell)
52
- elif isinstance(cell, float):
53
- rr.set_time_seconds(column_name, cell) # assume seconds
54
- else:
55
- print(f"Unknown time-like column {column_name} with value {cell}")
56
-
57
- # Now log actual data columns:
58
- for column_name, cell in row.items():
59
- if column_name in TIME_LIKE or column_name in IGNORE:
60
- continue
61
-
62
- rr.log(column_name, to_rerun(column_name, cell))
63
 
64
 
65
- def main():
66
  # Ensure the logging gets written to stderr:
67
  logging.getLogger().addHandler(logging.StreamHandler())
68
  logging.getLogger().setLevel(logging.INFO)
 
4
 
5
  import argparse
6
  import logging
 
7
 
 
8
  import rerun as rr
9
  from datasets import load_dataset
 
 
10
 
11
+ from dataset_conversion import log_dataset_to_rerun
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
 
16
+ def main() -> None:
17
  # Ensure the logging gets written to stderr:
18
  logging.getLogger().addHandler(logging.StreamHandler())
19
  logging.getLogger().setLevel(logging.INFO)