LeroyWaa committed
Commit 246c106 · 1 Parent(s): 1d541d9
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. README.md +40 -13
  2. app.py +69 -0
  3. build.sh +12 -0
  4. common/__init__.py +0 -0
  5. common/calculate_fvd.py +80 -0
  6. common/data_sampler.py +334 -0
  7. common/eval_utils.py +105 -0
  8. common/fid_score.py +382 -0
  9. common/fvd/styleganv/fvd.py +90 -0
  10. common/fvd/styleganv/i3d_torchscript.pt +3 -0
  11. common/fvd/videogpt/fvd.py +137 -0
  12. common/fvd/videogpt/i3d_pretrained_400.pt +3 -0
  13. common/fvd/videogpt/pytorch_i3d.py +322 -0
  14. common/inception.py +344 -0
  15. common/plot/__init__.py +0 -0
  16. common/plot/aggregated_output.csv +18 -0
  17. common/plot/plot_arch_ablation.py +60 -0
  18. common/plot/plot_arch_ablation_deltapsnr.py +49 -0
  19. common/plot/plot_dataset_scale.py +69 -0
  20. common/plot/plot_dataset_traj_scale.py +48 -0
  21. common/plot/plot_dynamics_ablation.py +56 -0
  22. common/plot/plot_dynamics_ablation_deltapsnr.py +51 -0
  23. common/plot/plot_from_wandb.py +185 -0
  24. common/plot/plot_from_wandb_singledataset.py +144 -0
  25. common/plot/plot_model_scale.py +64 -0
  26. common/plot/plot_pretrain_ablation.py +44 -0
  27. common/plot/plot_pretrain_ablation_mar.py +45 -0
  28. cont_data.py +245 -0
  29. data.py +240 -0
  30. datasets/.DS_Store +0 -0
  31. datasets/__init__.py +0 -0
  32. datasets/encode_extern_dataset.py +291 -0
  33. datasets/encode_openx_dataset.py +459 -0
  34. datasets/extern/__init__.py +0 -0
  35. datasets/extern/ego4d.py +193 -0
  36. datasets/extern/egoexo4d.py +186 -0
  37. datasets/extern/epic_kitchen.py +115 -0
  38. datasets/extern/frodobot.py +128 -0
  39. datasets/extern/robomimic.py +108 -0
  40. datasets/merge_shards.py +113 -0
  41. datasets/utils.py +244 -0
  42. experiments/.DS_Store +0 -0
  43. experiments/datasplit/.DS_Store +0 -0
  44. experiments/datasplit/dataset1.yaml +2 -0
  45. experiments/datasplit/dataset10.yaml +11 -0
  46. experiments/datasplit/dataset15.yaml +16 -0
  47. experiments/datasplit/dataset15_vae.yaml +16 -0
  48. experiments/datasplit/dataset20.yaml +21 -0
  49. experiments/datasplit/dataset20_vae.yaml +21 -0
  50. experiments/datasplit/dataset25.yaml +26 -0
README.md CHANGED
@@ -1,13 +1,40 @@
1
- ---
2
- title: Hma
3
- emoji: 📉
4
- colorFrom: gray
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.6.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Heterogeneous World Modeling with Actions
2
+
3
+ Progress in video generation may soon make it possible to evaluate robot policies in a completely learned world model.
4
+ Adapted from [1xgpt](https://github.com/1x-technologies/1xgpt).
5
+
6
+ ## Getting Started
7
+ We require `Python 3.10` or later. This code was tested with `Python 3.10.12`.
8
+
9
+ ```
10
+ # Install dependencies and download data
11
+ ./build.sh
12
+
13
+ # Activate the Python virtual environment
14
+ source venv/bin/activate
15
+ ```
16
+
17
+ ## File Structure
18
+ ```
19
+ ├── ...
20
+ ├── HPT-Video
21
+ | |── data # cached token datasets and model checkpoints
22
+ | |── genie # main modeling code
23
+ | | |── diffusion # diffusion loss related
24
+ | | |── evaluate.py # evaluate a trained model
25
+ | | |── st_mar.py # spatial-time MAR
26
+ | | |── generate.py # generate tokens from trained model
27
+ | | |── st_maskgit.py # spatial-time maskgit
28
+ | |── magvit # magvit code
29
+ | |── sim # simulation related codebase
30
+ | |── experiments
31
+ | | |── cmd # handy commands
32
+ | | |── datasplit # dataset split
33
+ | | |── scripts # ablation and training scripts.
34
+ | |── common # common utility and plot scripts
35
+ | |── train.py # train using magvit
36
+ | |── train_diffusion.py # train using mar
37
+ | |── train_multi.py # train on multiple datasets jointly
38
+ | |── visualize.py # visualize generated tokens
39
+ └── ...
40
+ ```
app.py ADDED
@@ -0,0 +1,69 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+ from PIL import Image
4
+ import cv2
5
+ from sim.simulator import GenieSimulator
6
+
7
+ RES = 512
8
+ image = Image.open("sim/assets/langtable_prompt/frame_06.png")
9
+ genie = GenieSimulator(
10
+ image_encoder_type='temporalvae',
11
+ image_encoder_ckpt='stabilityai/stable-video-diffusion-img2vid',
12
+ quantize=False,
13
+ backbone_type='stmar',
14
+ backbone_ckpt='data/mar_ckpt/langtable',
15
+ prompt_horizon=11,
16
+ action_stride=1,
17
+ domain='language_table',
18
+ )
19
+ prompt_image = np.tile(
20
+ np.array(image), (genie.prompt_horizon, 1, 1, 1)
21
+ ).astype(np.uint8)
22
+ prompt_action = np.zeros(
23
+ (genie.prompt_horizon, genie.action_stride, 2)
24
+ ).astype(np.float32)
25
+ genie.set_initial_state((prompt_image, prompt_action))
26
+ image = genie.reset()
27
+ image = cv2.resize(image, (RES, RES))
28
+ image = Image.fromarray(image)
29
+
30
+ # Example model: takes a direction and returns a random image
31
+ def model(direction: str, genie=genie):
32
+ if direction == 'right':
33
+ action = np.array([0, 0.05])
34
+ elif direction == 'left':
35
+ action = np.array([0, -0.05])
36
+ elif direction == 'down':
37
+ action = np.array([0.05, 0])
38
+ elif direction == 'up':
39
+ action = np.array([-0.05, 0])
40
+ else:
41
+ raise ValueError(f"Invalid direction: {direction}")
42
+ next_image = genie.step(action)['pred_next_frame']
43
+ next_image = cv2.resize(next_image, (RES, RES))
44
+ return Image.fromarray(next_image)
45
+
46
+ # Gradio function to handle user input
47
+ def handle_input(direction):
48
+ print(f"User clicked: {direction}")
49
+ new_image = model(direction) # Get a new image from the model
50
+ return new_image
51
+
52
+ if __name__ == '__main__':
53
+ with gr.Blocks() as demo:
54
+ with gr.Row():
55
+ image_display = gr.Image(value=image, type="pil", label="Generated Image")
56
+ with gr.Row():
57
+ up = gr.Button("↑ Up")
58
+ with gr.Row():
59
+ left = gr.Button("← Left")
60
+ down = gr.Button("↓ Down")
61
+ right = gr.Button("→ Right")
62
+
63
+ # Define button interactions
64
+ up.click(fn=lambda: handle_input("up"), outputs=image_display)
65
+ down.click(fn=lambda: handle_input("down"), outputs=image_display)
66
+ left.click(fn=lambda: handle_input("left"), outputs=image_display)
67
+ right.click(fn=lambda: handle_input("right"), outputs=image_display)
68
+
69
+ demo.launch()
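For a quick smoke test without the Gradio UI, the `model` helper above can be stepped directly. A minimal sketch (not part of this commit), assuming the checkpoints referenced in `app.py` are available locally so that importing `app` succeeds:

```python
# Headless rollout sketch: importing app builds the GenieSimulator at module load,
# but does not launch the demo (demo.launch() is guarded by __main__).
from app import model

directions = ["right", "right", "up", "left"]
for i, direction in enumerate(directions):
    frame = model(direction)            # PIL.Image of size RES x RES
    frame.save(f"rollout_{i:02d}.png")  # save frames for visual inspection
```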
build.sh ADDED
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/bash
2
+
3
+ python3 -m venv venv
4
+ source venv/bin/activate
5
+ python -m pip install -r requirements.txt
6
+ FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE python -m pip install flash-attn==2.5.8 --no-build-isolation
7
+
8
+ # Download datasets to data/train_v1.1, data/val_v1.1
9
+ huggingface-cli download 1x-technologies/worldmodel --repo-type dataset --local-dir data
10
+
11
+ mv data/val_v1.1 data/1x_humanoid_magvit_traj1000_val
12
+ mv data/train_v1.1 data/1x_humanoid_magvit_traj1000_train
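After `./build.sh` finishes, a short sanity check (a sketch, not part of this commit) can confirm that the environment and the renamed data directories are in place:

```python
# Environment sanity check after running ./build.sh.
import os
import torch

print("CUDA available:", torch.cuda.is_available())
for split in ("train", "val"):
    path = f"data/1x_humanoid_magvit_traj1000_{split}"
    print(path, "exists:", os.path.isdir(path))
```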
common/__init__.py ADDED
File without changes
common/calculate_fvd.py ADDED
@@ -0,0 +1,80 @@
1
+ # Code adapted from https://github.com/JunyaoHu/common_metrics_on_video_quality
2
+ import numpy as np
3
+ import torch
4
+ from tqdm import tqdm
5
+
6
+ def trans(x):
7
+ # if grayscale, repeat the single channel to get 3 channels
8
+ if x.shape[-3] == 1:
9
+ x = x.repeat(1, 1, 3, 1, 1)
10
+
11
+ # permute BTCHW -> BCTHW
12
+ x = x.permute(0, 2, 1, 3, 4)
13
+
14
+ return x
15
+
16
+ def calculate_fvd(videos1, videos2, device="cuda", method='styleganv'):
17
+
18
+ if method == 'styleganv':
19
+ from .fvd.styleganv.fvd import get_fvd_feats, frechet_distance, load_i3d_pretrained
20
+ elif method == 'videogpt':
21
+ from .fvd.videogpt.fvd import load_i3d_pretrained
22
+ from .fvd.videogpt.fvd import get_fvd_logits as get_fvd_feats
23
+ from .fvd.videogpt.fvd import frechet_distance
24
+
25
+
26
+ # videos [batch_size, timestamps, channel, h, w]
27
+
28
+ assert videos1.shape == videos2.shape
29
+
30
+ i3d = load_i3d_pretrained(device=device)
31
+ fvd_results = []
32
+
33
+ # support grayscale input, if grayscale -> channel*3
34
+ # BTCHW -> BCTHW
35
+ # videos -> [batch_size, channel, timestamps, h, w]
36
+
37
+ videos1 = trans(videos1)
38
+ videos2 = trans(videos2)
39
+
40
+ # fvd_results = {}
41
+
42
+ # to compute FVD, each clip must be at least 10 frames long
43
+ for clip_timestamp in tqdm(range(10, videos1.shape[-3]+1)):
44
+ # print("clip_timestamp", clip_timestamp)
45
+ # get a video clip
46
+ # videos_clip [batch_size, channel, timestamps[:clip], h, w]
47
+ videos_clip1 = videos1[:, :, : clip_timestamp]
48
+ videos_clip2 = videos2[:, :, : clip_timestamp]
49
+
50
+ # get FVD features
51
+ feats1 = get_fvd_feats(videos_clip1, i3d=i3d, device=device)
52
+ feats2 = get_fvd_feats(videos_clip2, i3d=i3d, device=device)
53
+
54
+ # calculate FVD when timestamps[:clip]
55
+ fvd_results.append(frechet_distance(feats1, feats2))
56
+
57
+
58
+ return fvd_results[-1] # only the last step
59
+
60
+ # test code / using example
61
+
62
+ def main():
63
+ NUMBER_OF_VIDEOS = 8
64
+ VIDEO_LENGTH = 50
65
+ CHANNEL = 3
66
+ SIZE = 64
67
+ videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
68
+ videos2 = torch.ones(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
69
+ device = torch.device("cuda")
70
+ # device = torch.device("cpu")
71
+
72
+ import json
73
+ result = calculate_fvd(videos1, videos2, device, method='videogpt')
74
+ print(json.dumps(result, indent=4))
75
+
76
+ result = calculate_fvd(videos1, videos2, device, method='styleganv')
77
+ print(json.dumps(result, indent=4))
78
+
79
+ if __name__ == "__main__":
80
+ main()
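Real frames usually arrive as `uint8` arrays, whereas `calculate_fvd` expects float tensors in `[0, 1]` with layout `(B, T, C, H, W)` and at least 10 frames per clip. A minimal conversion-and-call sketch (dummy data; the helper name is illustrative):

```python
import numpy as np
import torch

from common.calculate_fvd import calculate_fvd


def to_btchw_float(frames_uint8: np.ndarray) -> torch.Tensor:
    """(B, T, H, W, C) uint8 in [0, 255] -> (B, T, C, H, W) float in [0, 1]."""
    videos = torch.from_numpy(frames_uint8).float() / 255.0
    return videos.permute(0, 1, 4, 2, 3).contiguous()


# Dummy batch: 4 clips of 16 RGB frames at 64x64 (>= 10 frames required).
fake = to_btchw_float(np.random.randint(0, 256, (4, 16, 64, 64, 3), dtype=np.uint8))
real = to_btchw_float(np.random.randint(0, 256, (4, 16, 64, 64, 3), dtype=np.uint8))

# Uses the bundled I3D checkpoint under common/fvd/styleganv/.
print("FVD:", calculate_fvd(fake, real, device="cuda", method="styleganv"))
```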
common/data_sampler.py ADDED
@@ -0,0 +1,334 @@
1
+ import copy
2
+ import os
3
+ import random
4
+ from operator import itemgetter
5
+ from typing import Optional, List
6
+
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import torch
10
+ import torch.distributed as dist
11
+ from PIL import Image
12
+ from torch.utils.data import Dataset, Sampler
13
+ from torch.utils.data import Sampler, DistributedSampler
14
+
15
+
16
+ def chunk_indices(indices: list[int], size: int) -> tuple[torch.Tensor, ...]:
17
+ return torch.split(torch.tensor(indices), size)
18
+
19
+
20
+ class CombinedDataLoader:
21
+ def __init__(self, dataloaders, reinit=True):
22
+ """
23
+ :param dataloaders: list of pytorch dataloaders
24
+ """
25
+ self.dataloaders = dataloaders
26
+ self.reinit = reinit
27
+ self.dataloader_idx = 0
28
+ self.loader_iters = [iter(dataloader) for dataloader in self.dataloaders]
29
+
30
+ def __iter__(self):
31
+ return self
32
+
33
+ def __next__(self):
34
+ # Take the next batch from the current dataloader
35
+ chosen_loader_iter = self.loader_iters[self.dataloader_idx]
36
+
37
+ try:
38
+ data = next(chosen_loader_iter)
39
+ return data
40
+ except StopIteration:
41
+ # Current dataloader is exhausted; advance to the next one (stop after the last).
42
+ self.dataloader_idx = self.dataloader_idx + 1
43
+ if self.dataloader_idx == len(self.loader_iters):
44
+ self.dataloader_idx = 0 # reset
45
+ raise StopIteration
46
+ return self.__next__()
47
+
48
+ def __len__(self):
49
+ return sum([len(dataloader) for dataloader in self.dataloaders])
50
+
51
+
52
+ class CombinedBatchSampler(torch.utils.data.Sampler):
53
+ # For validation dataloaders.
54
+ def __init__(self, datasets, batch_size, num_processes=1, shuffle=False):
55
+ super().__init__() # no-op
56
+ prev_idx = 0
57
+ all_batches = []
58
+
59
+ for dataset in datasets:
60
+ indices = list(range(prev_idx, prev_idx + len(dataset)))
61
+ if shuffle:
62
+ random.shuffle(indices)
63
+
64
+ # exclude remainder, if necessary
65
+ remainder = len(indices) % (batch_size * num_processes)
66
+ if remainder > 0:
67
+ indices = indices[:-remainder] # exclude last
68
+
69
+ chunk_i = chunk_indices(indices, batch_size) # equally sized
70
+ all_batches += chunk_i
71
+
72
+ # add the new indices without the last batch
73
+ prev_idx += len(chunk_i) * batch_size # len(dataset)
74
+
75
+ if shuffle:
76
+ random.shuffle(all_batches)
77
+
78
+ self.all_batches = all_batches
79
+
80
+ def __iter__(self):
81
+ return iter(self.all_batches)
82
+
83
+ def __len__(self):
84
+ return len(self.all_batches)
85
+
86
+
87
+ # https://github.com/catalyst-team/catalyst/blob/master/catalyst/data/sampler.py
88
+ class DatasetFromSampler(Dataset):
89
+ """Dataset to create indexes from `Sampler`.
90
+
91
+ Args:
92
+ sampler: PyTorch sampler
93
+ """
94
+
95
+ def __init__(self, sampler: Sampler):
96
+ """Initialisation for DatasetFromSampler."""
97
+ self.sampler = sampler
98
+ self.sampler_list = None
99
+
100
+ def __getitem__(self, index: int):
101
+ """Gets element of the dataset.
102
+
103
+ Args:
104
+ index: index of the element in the dataset
105
+
106
+ Returns:
107
+ Single element by index
108
+ """
109
+ if self.sampler_list is None:
110
+ self.sampler_list = list(self.sampler)
111
+ return self.sampler_list[index]
112
+
113
+ def __len__(self) -> int:
114
+ """
115
+ Returns:
116
+ int: length of the dataset
117
+ """
118
+ return len(self.sampler)
119
+
120
+
121
+ class DistributedSamplerWrapper(DistributedSampler):
122
+ """
123
+ Wrapper over `Sampler` for distributed training.
124
+ Allows you to use any sampler in distributed mode.
125
+
126
+ It is especially useful in conjunction with
127
+ `torch.nn.parallel.DistributedDataParallel`. In such case, each
128
+ process can pass a DistributedSamplerWrapper instance as a DataLoader
129
+ sampler, and load a subset of subsampled data of the original dataset
130
+ that is exclusive to it.
131
+
132
+ .. note::
133
+ Sampler is assumed to be of constant size.
134
+ """
135
+
136
+ def __init__(
137
+ self,
138
+ sampler,
139
+ num_replicas: Optional[int] = None,
140
+ rank: Optional[int] = None,
141
+ shuffle: bool = True,
142
+ ):
143
+ """
144
+
145
+ Args:
146
+ sampler: Sampler used for subsampling
147
+ num_replicas (int, optional): Number of processes participating in
148
+ distributed training
149
+ rank (int, optional): Rank of the current process
150
+ within ``num_replicas``
151
+ shuffle (bool, optional): If true (default),
152
+ sampler will shuffle the indices
153
+ """
154
+ super(DistributedSamplerWrapper, self).__init__(
155
+ DatasetFromSampler(sampler),
156
+ num_replicas=num_replicas,
157
+ rank=rank,
158
+ shuffle=shuffle,
159
+ )
160
+ self.sampler = sampler
161
+
162
+ def __iter__(self):
163
+ """Iterate over sampler.
164
+
165
+ Returns:
166
+ python iterator
167
+ """
168
+ self.dataset = DatasetFromSampler(self.sampler)
169
+ indexes_of_indexes = super().__iter__()
170
+ subsampler_indexes = self.dataset
171
+ return iter(itemgetter(*indexes_of_indexes)(subsampler_indexes))
172
+
173
+
174
+ # https://github.com/rabeehk/hyperformer/blob/main/hyperformer/data/multitask_sampler.py
175
+ class MultiTaskBatchSampler(Sampler):
176
+ """Defines a sampler to sample multiple datasets with temperature sampling
177
+ in a distributed fashion."""
178
+
179
+ def __init__(
180
+ self,
181
+ dataset_sizes: List[int],
182
+ batch_size: int,
183
+ temperature: float,
184
+ dataset_groups=[],
185
+ num_replicas: Optional[int] = 1,
186
+ rank: Optional[int] = 0,
187
+ seed: int = 0,
188
+ shuffle: bool = True,
189
+ shuffle_task: bool = True,
190
+ ) -> None:
191
+ """Constructor for MultiTaskBatchSampler.
192
+ Args:
193
+ dataset_sizes: a list of integers, specifies the number of samples in
194
+ each dataset.
195
+ batch_size: integer, specifies the batch size.
196
+ temperature: float, temperature used for temperature sampling. Larger
197
+ values sample the datasets more uniformly; a value of 1 samples each
198
+ dataset in proportion to its number of samples.
199
+ num_replicas: integer, specifies the number of processes.
200
+ rank: integer, specifies the rank of the current process.
201
+ seed: integer, random seed.
202
+ shuffle: bool, if set to true, the datasets will be shuffled in each epoch.
203
+ """
204
+
205
+ if num_replicas is None:
206
+ if not dist.is_available():
207
+ raise RuntimeError("Requires distributed package to be available")
208
+ num_replicas = dist.get_world_size()
209
+ if rank is None:
210
+ if not dist.is_available():
211
+ raise RuntimeError("Requires distributed package to be available")
212
+ rank = dist.get_rank()
213
+ print("data sampler rank:", rank)
214
+
215
+ if rank >= num_replicas or rank < 0:
216
+ raise ValueError(
217
+ "Invalid rank {}, rank should be in the interval" " [0, {}]".format(rank, num_replicas - 1)
218
+ )
219
+
220
+ self.dataset_groups = dataset_groups
221
+ print("dataset groups:", self.dataset_groups)
222
+
223
+ self.num_replicas = num_replicas
224
+ self.shuffle_task = shuffle_task
225
+ self.rank = rank
226
+ self.batch_size = batch_size
227
+ self.dataset_sizes = dataset_sizes
228
+
229
+ # By default we drop the last elements if dataset is not divisible by the number of ranks.
230
+ self.rank_dataset_sizes = [dataset_size // self.num_replicas for dataset_size in self.dataset_sizes]
231
+ self.dataset_offsets = torch.cumsum(torch.LongTensor([0] + dataset_sizes), 0)
232
+ self.total_sizes = [
233
+ (dataset_size // self.num_replicas) * self.num_replicas for dataset_size in self.dataset_sizes
234
+ ]
235
+ self.temperature = temperature
236
+ self.seed = seed
237
+ self.epoch = 0
238
+ self.num_batches_per_epoch = (
239
+ (np.sum(dataset_sizes) + self.batch_size - 1) // self.batch_size // self.num_replicas
240
+ )
241
+ self.shuffle = shuffle
242
+ print(f"{num_replicas=} {rank=} {self.num_batches_per_epoch=} {self.total_sizes=} self.weights={self.generate_tasks_distribution()}")
243
+
244
+ def generate_tasks_distribution(self):
245
+ """Given the dataset sizes computes the weights to sample each dataset
246
+ according to the temperature sampling."""
247
+ if len(self.dataset_groups) > 0:
248
+ # normalize across groups first
249
+ weights = []
250
+ num_groups = len(self.dataset_groups)
251
+ for group in self.dataset_groups:
252
+ lo, hi = group
253
+ dataset_sizes = [self.dataset_sizes[idx] for idx in range(lo, hi)]
254
+ total_size = sum(dataset_sizes)
255
+ group_weights = np.array([(size / total_size) ** (1.0 / self.temperature) for size in dataset_sizes])
256
+ group_weights = group_weights / np.sum(group_weights) / num_groups
257
+ weights = np.concatenate((weights, group_weights))
258
+
259
+ else:
260
+ total_size = sum(self.dataset_sizes)
261
+ weights = np.array([(size / total_size) ** (1.0 / self.temperature) for size in self.dataset_sizes])
262
+ weights = weights / np.sum(weights)
263
+ return torch.as_tensor(weights, dtype=torch.double)
264
+
265
+ def __iter__(self):
266
+ # Defines torch generator, to make random choices consistent across cores in
267
+ # different epochs, the seed needs to be set based on seed and epoch.
268
+ generator = torch.Generator()
269
+ generator.manual_seed(self.seed + self.epoch)
270
+
271
+ # Shuffles the datasets if shuffle is set to true.
272
+ indices = []
273
+ for dataset_size in self.dataset_sizes:
274
+ if self.shuffle:
275
+ indices.append(torch.randperm(dataset_size, generator=generator).tolist())
276
+ else:
277
+ indices.append(list(range(dataset_size)))
278
+
279
+ # Shards the datasets across the all processes.
280
+ self.rank_indices = []
281
+ for i in range(len(self.dataset_sizes)):
282
+ self.rank_indices.append(indices[i][self.rank : self.total_sizes[i] : self.num_replicas])
283
+
284
+ # To make the model consistent across different processes, since the
285
+ # model is based on tasks, we need to make sure the same task is selected
286
+ # across different processes.
287
+ tasks_distribution: torch.Tensor = self.generate_tasks_distribution()
288
+
289
+ # Chooses the tasks which will be used in each batch in one epoch.
290
+ # With passing generator, we make sure this choice is consistent across
291
+ # different processes.
292
+
293
+ # want them to be different.
294
+ if self.shuffle_task:
295
+ generator.manual_seed(self.seed + self.epoch + self.rank)
296
+ batch_task_assignments = torch.multinomial(
297
+ tasks_distribution, self.num_batches_per_epoch, replacement=True, generator=generator
298
+ )
299
+
300
+ for batch_task in batch_task_assignments:
301
+ # Gets the number of samples of the selected datasets available for the current rank.
302
+ num_task_samples = self.rank_dataset_sizes[batch_task]
303
+ # Computes the random samples from the chosen dataset.
304
+ indices = torch.randint(low=0, high=num_task_samples, size=(self.batch_size,), generator=generator).tolist()
305
+ # Converts the selected indices to the global indices on the given dataset.
306
+ results = (self.dataset_offsets[batch_task] + torch.tensor(self.rank_indices[batch_task])[indices]).tolist()
307
+ yield results
308
+
309
+ def __len__(self):
310
+ return self.num_batches_per_epoch
311
+
312
+ def set_epoch(self, epoch):
313
+ self.epoch = epoch
314
+
315
+ def make_dataset_pie_plot(domains, traj_nums):
316
+ """draw the dataset mixture as a pie plot"""
317
+ new_domains = []
318
+ for idx, domain in enumerate(domains):
319
+ new_domains.append(domain)
320
+ plt.cla()
321
+ fig1, ax1 = plt.subplots(figsize=(40, 40))
322
+ traj_prob = np.array(traj_nums) / np.sum(traj_nums)
323
+ tab20 = plt.get_cmap("tab20").colors
324
+ tab20b = plt.get_cmap("tab20b").colors
325
+ tab20c = plt.get_cmap("tab20c").colors
326
+
327
+ # Combine them to get 60 distinct colors
328
+ colors = tab20 + tab20b + tab20c
329
+ patches, _ = ax1.pie(traj_prob, startangle=90, colors=colors[: len(traj_prob)])
330
+ ax1.axis("equal")
331
+ ax1.legend(patches, new_domains, loc="center left", bbox_to_anchor=(0.8, 0.5), prop={"size": 32})
332
+ fig1.canvas.draw()
333
+
334
+ return Image.frombytes("RGB", fig1.canvas.get_width_height(), fig1.canvas.tostring_rgb())
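A minimal single-process sketch (dummy datasets, illustrative values) of plugging `MultiTaskBatchSampler` into a `DataLoader` over concatenated datasets:

```python
import torch
from torch.utils.data import ConcatDataset, DataLoader, TensorDataset

from common.data_sampler import MultiTaskBatchSampler

# Two dummy datasets of different sizes.
datasets = [
    TensorDataset(torch.arange(100).float().unsqueeze(1)),
    TensorDataset(torch.arange(400).float().unsqueeze(1)),
]

sampler = MultiTaskBatchSampler(
    dataset_sizes=[len(d) for d in datasets],
    batch_size=8,
    temperature=2.0,  # larger values flatten the mixture toward uniform
    num_replicas=1,
    rank=0,
)
sampler.set_epoch(0)

loader = DataLoader(ConcatDataset(datasets), batch_sampler=sampler)
batch, = next(iter(loader))
print(batch.shape)  # torch.Size([8, 1]); each batch is drawn from a single dataset
```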
common/eval_utils.py ADDED
@@ -0,0 +1,105 @@
1
+ from typing import Callable
2
+
3
+ import torch
4
+ import torchvision.transforms.functional as transforms_f
5
+ from einops import rearrange
6
+
7
+ from genie.factorization_utils import factorize_labels
8
+
9
+
10
+ class AvgMetric:
11
+ """ Records a running sum and count to compute the mean. """
12
+ def __init__(self):
13
+ self.total = 0
14
+ self.count = 0
15
+
16
+ def update(self, val, batch_size=1):
17
+ self.total += val * batch_size
18
+ self.count += batch_size
19
+
20
+ def update_list(self, flat_vals):
21
+ self.total += sum(flat_vals)
22
+ self.count += len(flat_vals)
23
+
24
+ def mean(self):
25
+ if self.count == 0:
26
+ return 0
27
+ return self.total / self.count
28
+
29
+
30
+ def decode_tokens(reshaped_token_ids: torch.LongTensor, decode_latents: Callable) -> torch.ByteTensor:
31
+ """
32
+ Converts quantized latent space tokens to images.
33
+
34
+ Args:
35
+ reshaped_token_ids: shape (B, T, H, W).
36
+ decode_latents: instance of `decode_latents_wrapper()`
37
+
38
+ Returns:
39
+ (B, T, 3, 256, 256)
40
+ """
41
+ decoded_imgs = decode_latents(rearrange(reshaped_token_ids, "b t h w -> (b t) h w").cpu().numpy())
42
+ decoded_tensor = torch.stack([transforms_f.pil_to_tensor(pred_img) for pred_img in decoded_imgs])
43
+ return rearrange(decoded_tensor, "(b t) c H W -> b t c H W", b=reshaped_token_ids.size(0))
44
+
45
+ def decode_features(reshaped_token_ids: torch.LongTensor, decode_latents: Callable) -> torch.ByteTensor:
46
+ """
47
+ Converts latent features to images.
48
+
49
+ Args:
50
+ reshaped_token_ids: shape (B, T, H, W, C).
51
+ decode_latents: instance of `decode_latents_wrapper()`
52
+
53
+ Returns:
54
+ (B, T, 3, 256, 256)
55
+ """
56
+ decoded_imgs = decode_latents(rearrange(reshaped_token_ids, "b t h w c -> (b t) c h w").cpu().numpy())
57
+ decoded_tensor = torch.stack([transforms_f.pil_to_tensor(pred_img) for pred_img in decoded_imgs])
58
+ return rearrange(decoded_tensor, "(b t) c H W -> b t c H W", b=reshaped_token_ids.size(0))
59
+
60
+
61
+ def compute_loss(
62
+ labels_flat: torch.LongTensor,
63
+ factored_logits: torch.FloatTensor,
64
+ num_factored_vocabs: int = 2,
65
+ factored_vocab_size: int = 512,
66
+ ) -> float:
67
+ """
68
+ If applicable (model returns logits), compute the cross entropy loss.
69
+ In the case of a factorized vocabulary, sums the cross entropy losses for each vocabulary.
70
+
71
+ Assuming all submissions use the parametrization of num_factored_vocabs = 2, factored_vocab_size = 512
72
+
73
+ Args:
74
+ labels_flat: size (B, T*H*W) corresponding to flattened, tokenized images.
75
+ factored_logits: size (B, factored_vocab_size, num_factored_vocabs, T-1, H, W).
76
+ E.g. output of `genie.evaluate.GenieEvaluator.predict_zframe_logits()`
77
+ num_factored_vocabs: Should be 2 for v1.0 of the challenge.
78
+ factored_vocab_size: Should be 512 for v1.0 of the challenge.
79
+ Returns:
80
+ Cross entropy loss
81
+ """
82
+ assert factored_logits.dim() == 6 \
83
+ and factored_logits.size()[:3] == (labels_flat.size(0), factored_vocab_size, num_factored_vocabs), \
84
+ f"Shape of `logits` should be (B, {factored_vocab_size}, {num_factored_vocabs}, T-1, H, W)"
85
+ t = factored_logits.size(3) + 1
86
+ h, w = factored_logits.size()[-2:]
87
+ assert t * h * w == labels_flat.size(1), "Shape of `factored_logits` does not match flattened latent image size."
88
+
89
+ labels_THW = rearrange(labels_flat, "b (t h w) -> b t h w", t=t, h=h, w=w)
90
+ labels_THW = labels_THW[:, 1:].to(factored_logits.device)
91
+
92
+ factored_labels = factorize_labels(labels_THW, num_factored_vocabs, factored_vocab_size)
93
+ return torch.nn.functional.cross_entropy(factored_logits, factored_labels, reduction="none")\
94
+ .sum(dim=1).mean().item() # Final loss is the sum of the two losses across the size-512 vocabularies
95
+
96
+
97
+ def compute_lpips(frames_a: torch.ByteTensor, frames_b: torch.ByteTensor, lpips_func: Callable) -> list:
98
+ """
99
+ Given two batches of video data, of shape (B, T, 3, 256, 256), computes the LPIPS score on frame-by-frame level.
100
+ Cannot use `lpips_func` directly because it expects at most 4D input.
101
+ """
102
+ # LPIPS expects pixel values between [-1, 1]
103
+ flattened_a, flattened_b = [rearrange(frames / 127.5 - 1, "b t c H W -> (b t) c H W")
104
+ for frames in (frames_a, frames_b)]
105
+ return lpips_func(flattened_a, flattened_b).flatten().tolist()
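A minimal sketch of the LPIPS helper, assuming the `lpips` package is installed (dummy uint8 frames of shape `(B, T, 3, 256, 256)`):

```python
import torch
import lpips  # pip install lpips (assumed dependency)

from common.eval_utils import AvgMetric, compute_lpips

lpips_fn = lpips.LPIPS(net="alex")

# Dummy videos: (B, T, 3, 256, 256) uint8.
frames_a = torch.randint(0, 256, (2, 4, 3, 256, 256), dtype=torch.uint8)
frames_b = torch.randint(0, 256, (2, 4, 3, 256, 256), dtype=torch.uint8)

metric = AvgMetric()
metric.update_list(compute_lpips(frames_a, frames_b, lpips_fn))
print("mean LPIPS:", metric.mean())
```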
common/fid_score.py ADDED
@@ -0,0 +1,382 @@
1
+ """Calculates the Frechet Inception Distance (FID) to evaluate GANs
2
+
3
+ The FID metric calculates the distance between two distributions of images.
4
+ Typically, we have summary statistics (mean & covariance matrix) of one
5
+ of these distributions, while the 2nd distribution is given by a GAN.
6
+
7
+ When run as a stand-alone program, it compares the distribution of
8
+ images that are stored as PNG/JPEG at a specified location with a
9
+ distribution given by summary statistics (in pickle format).
10
+
11
+ The FID is calculated by assuming that X_1 and X_2 are the activations of
12
+ the pool_3 layer of the inception net for generated samples and real world
13
+ samples respectively.
14
+
15
+ See --help to see further details.
16
+
17
+ Code adapted from https://github.com/bioinf-jku/TTUR to use PyTorch instead
18
+ of Tensorflow
19
+
20
+ Copyright 2018 Institute of Bioinformatics, JKU Linz
21
+
22
+ Licensed under the Apache License, Version 2.0 (the "License");
23
+ you may not use this file except in compliance with the License.
24
+ You may obtain a copy of the License at
25
+
26
+ http://www.apache.org/licenses/LICENSE-2.0
27
+
28
+ Unless required by applicable law or agreed to in writing, software
29
+ distributed under the License is distributed on an "AS IS" BASIS,
30
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
+ See the License for the specific language governing permissions and
32
+ limitations under the License.
33
+ """
34
+ # code adapted from https://github.com/mseitzer/pytorch-fid/tree/master
35
+
36
+ import os
37
+ import pathlib
38
+ from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
39
+
40
+ import numpy as np
41
+ import torch
42
+ import torchvision.transforms as TF
43
+ from PIL import Image
44
+ from scipy import linalg
45
+ from torch.nn.functional import adaptive_avg_pool2d
46
+
47
+ try:
48
+ from tqdm import tqdm
49
+ except ImportError:
50
+ # If tqdm is not available, provide a mock version of it
51
+ def tqdm(x):
52
+ return x
53
+
54
+
55
+ from .inception import InceptionV3
56
+
57
+ IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"}
58
+
59
+
60
+ class ImagePathDataset(torch.utils.data.Dataset):
61
+ def __init__(self, files, transforms=None):
62
+ self.files = files
63
+ self.transforms = transforms
64
+
65
+ def __len__(self):
66
+ return len(self.files)
67
+
68
+ def __getitem__(self, i):
69
+ path = self.files[i]
70
+ img = Image.open(path).convert("RGB")
71
+ if self.transforms is not None:
72
+ img = self.transforms(img)
73
+ return img
74
+
75
+
76
+ def get_activations(
77
+ files, model, batch_size=50, dims=2048, device="cpu", num_workers=1
78
+ ):
79
+ """Calculates the activations of the pool_3 layer for all images.
80
+
81
+ Params:
82
+ -- files : List of image files paths
83
+ -- model : Instance of inception model
84
+ -- batch_size : Batch size of images for the model to process at once.
85
+ Make sure that the number of samples is a multiple of
86
+ the batch size, otherwise some samples are ignored. This
87
+ behavior is retained to match the original FID score
88
+ implementation.
89
+ -- dims : Dimensionality of features returned by Inception
90
+ -- device : Device to run calculations
91
+ -- num_workers : Number of parallel dataloader workers
92
+
93
+ Returns:
94
+ -- A numpy array of dimension (num images, dims) that contains the
95
+ activations of the given tensor when feeding inception with the
96
+ query tensor.
97
+ """
98
+ model.eval()
99
+
100
+ if batch_size > len(files):
101
+ print(
102
+ (
103
+ "Warning: batch size is bigger than the data size. "
104
+ "Setting batch size to data size"
105
+ )
106
+ )
107
+ batch_size = len(files)
108
+
109
+ dataset = ImagePathDataset(files, transforms=TF.ToTensor())
110
+ dataloader = torch.utils.data.DataLoader(
111
+ dataset,
112
+ batch_size=batch_size,
113
+ shuffle=False,
114
+ drop_last=False,
115
+ num_workers=num_workers,
116
+ )
117
+
118
+ pred_arr = np.empty((len(files), dims))
119
+
120
+ start_idx = 0
121
+
122
+ for batch in tqdm(dataloader):
123
+ batch = batch.to(device)
124
+
125
+ with torch.no_grad():
126
+ pred = model(batch)[0]
127
+
128
+ # If model output is not scalar, apply global spatial average pooling.
129
+ # This happens if you choose a dimensionality not equal 2048.
130
+ if pred.size(2) != 1 or pred.size(3) != 1:
131
+ pred = adaptive_avg_pool2d(pred, output_size=(1, 1))
132
+
133
+ pred = pred.squeeze(3).squeeze(2).cpu().numpy()
134
+
135
+ pred_arr[start_idx : start_idx + pred.shape[0]] = pred
136
+
137
+ start_idx = start_idx + pred.shape[0]
138
+
139
+ return pred_arr
140
+
141
+ def get_activations_images(
142
+ dataset, model, batch_size=50, dims=2048, device="cpu", num_workers=0
143
+ ):
144
+ """Calculates the activations of the pool_3 layer for all images.
145
+
146
+ Params:
147
+ -- dataset : Video tensor of shape (B, T, C, H, W) with values in [0, 1]
148
+ -- model : Instance of inception model
149
+ -- batch_size : Batch size of images for the model to process at once.
150
+ Make sure that the number of samples is a multiple of
151
+ the batch size, otherwise some samples are ignored. This
152
+ behavior is retained to match the original FID score
153
+ implementation.
154
+ -- dims : Dimensionality of features returned by Inception
155
+ -- device : Device to run calculations
156
+ -- num_workers : Number of parallel dataloader workers
157
+
158
+ Returns:
159
+ -- A numpy array of dimension (num images, dims) that contains the
160
+ activations of the given tensor when feeding inception with the
161
+ query tensor.
162
+ """
163
+ model.eval()
164
+ # import IPython; IPython.embed()
165
+ # combine batch and temporal
166
+ dataset = torch.cat([dataset[:, i] for i in range(dataset.shape[1])], dim=0).to("cpu")
167
+ dataloader = torch.utils.data.DataLoader(
168
+ dataset,
169
+ batch_size=batch_size,
170
+ shuffle=False,
171
+ drop_last=True,
172
+ num_workers=num_workers,
173
+ )
174
+
175
+ pred_arr = np.empty((len(dataset), dims))
176
+
177
+ start_idx = 0
178
+
179
+ for batch in tqdm(dataloader):
180
+ batch = batch.to(device)
181
+
182
+ with torch.no_grad():
183
+ pred = model(batch)[0]
184
+
185
+ # If model output is not scalar, apply global spatial average pooling.
186
+ # This happens if you choose a dimensionality not equal 2048.
187
+ if pred.size(2) != 1 or pred.size(3) != 1:
188
+ pred = adaptive_avg_pool2d(pred, output_size=(1, 1))
189
+
190
+ pred = pred.squeeze(3).squeeze(2).cpu().numpy()
191
+
192
+ pred_arr[start_idx : start_idx + pred.shape[0]] = pred
193
+
194
+ start_idx = start_idx + pred.shape[0]
195
+
196
+ return pred_arr
197
+
198
+
199
+ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
200
+ """Numpy implementation of the Frechet Distance.
201
+ The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
202
+ and X_2 ~ N(mu_2, C_2) is
203
+ d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
204
+
205
+ Stable version by Dougal J. Sutherland.
206
+
207
+ Params:
208
+ -- mu1 : Numpy array containing the activations of a layer of the
209
+ inception net (like returned by the function 'get_predictions')
210
+ for generated samples.
211
+ -- mu2 : The sample mean over activations, precalculated on an
212
+ representative data set.
213
+ -- sigma1: The covariance matrix over activations for generated samples.
214
+ -- sigma2: The covariance matrix over activations, precalculated on an
215
+ representative data set.
216
+
217
+ Returns:
218
+ -- : The Frechet Distance.
219
+ """
220
+
221
+ mu1 = np.atleast_1d(mu1)
222
+ mu2 = np.atleast_1d(mu2)
223
+
224
+ sigma1 = np.atleast_2d(sigma1)
225
+ sigma2 = np.atleast_2d(sigma2)
226
+
227
+ assert (
228
+ mu1.shape == mu2.shape
229
+ ), "Training and test mean vectors have different lengths"
230
+ assert (
231
+ sigma1.shape == sigma2.shape
232
+ ), "Training and test covariances have different dimensions"
233
+
234
+ diff = mu1 - mu2
235
+
236
+ # Product might be almost singular
237
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
238
+ if not np.isfinite(covmean).all():
239
+ msg = (
240
+ "fid calculation produces singular product; "
241
+ "adding %s to diagonal of cov estimates"
242
+ ) % eps
243
+ print(msg)
244
+ offset = np.eye(sigma1.shape[0]) * eps
245
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
246
+
247
+ # Numerical error might give slight imaginary component
248
+ if np.iscomplexobj(covmean):
249
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
250
+ m = np.max(np.abs(covmean.imag))
251
+ raise ValueError("Imaginary component {}".format(m))
252
+ covmean = covmean.real
253
+
254
+ tr_covmean = np.trace(covmean)
255
+
256
+ return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
257
+
258
+
259
+ def calculate_activation_statistics(
260
+ images, model, batch_size=50, dims=2048, device="cpu", num_workers=1
261
+ ):
262
+ """Calculation of the statistics used by the FID.
263
+ Params:
264
+ -- images : Video tensor of shape (B, T, C, H, W) with values in [0, 1]
265
+ -- model : Instance of inception model
266
+ -- batch_size : The images numpy array is split into batches with
267
+ batch size batch_size. A reasonable batch size
268
+ depends on the hardware.
269
+ -- dims : Dimensionality of features returned by Inception
270
+ -- device : Device to run calculations
271
+ -- num_workers : Number of parallel dataloader workers
272
+
273
+ Returns:
274
+ -- mu : The mean over samples of the activations of the pool_3 layer of
275
+ the inception model.
276
+ -- sigma : The covariance matrix of the activations of the pool_3 layer of
277
+ the inception model.
278
+ """
279
+ act = get_activations_images(images, model, batch_size, dims, device, num_workers)
280
+ mu = np.mean(act, axis=0)
281
+ sigma = np.cov(act, rowvar=False)
282
+ return mu, sigma
283
+
284
+
285
+ def compute_statistics(images, model, batch_size, dims, device, num_workers=1):
286
+ m, s = calculate_activation_statistics(
287
+ images, model, batch_size, dims, device, num_workers
288
+ )
289
+ return m, s
290
+
291
+ def calculate_fid(pred_images, gt_images, batch_size=16, device="cuda", dims=2048, num_workers=1):
292
+ """Calculates the FID of two paths"""
293
+
294
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
295
+
296
+ model = InceptionV3([block_idx]).to(device)
297
+
298
+ m1, s1 = compute_statistics(
299
+ pred_images, model, batch_size, dims, device, num_workers
300
+ )
301
+ m2, s2 = compute_statistics(
302
+ gt_images, model, batch_size, dims, device, num_workers
303
+ )
304
+ fid_value = calculate_frechet_distance(m1, s1, m2, s2)
305
+
306
+ return fid_value
307
+
308
+ def calculate_fid_given_paths(paths, batch_size, device, dims, num_workers=1):
309
+ """Calculates the FID of two paths"""
310
+ for p in paths:
311
+ if not os.path.exists(p):
312
+ raise RuntimeError("Invalid path: %s" % p)
313
+
314
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
315
+
316
+ model = InceptionV3([block_idx]).to(device)
317
+
318
+ m1, s1 = compute_statistics_of_path(
319
+ paths[0], model, batch_size, dims, device, num_workers
320
+ )
321
+ m2, s2 = compute_statistics_of_path(
322
+ paths[1], model, batch_size, dims, device, num_workers
323
+ )
324
+ fid_value = calculate_frechet_distance(m1, s1, m2, s2)
325
+
326
+ return fid_value
327
+
328
+
329
+ def save_fid_stats(paths, batch_size, device, dims, num_workers=1):
330
+ """Saves FID statistics of one path"""
331
+ if not os.path.exists(paths[0]):
332
+ raise RuntimeError("Invalid path: %s" % paths[0])
333
+
334
+ if os.path.exists(paths[1]):
335
+ raise RuntimeError("Existing output file: %s" % paths[1])
336
+
337
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
338
+
339
+ model = InceptionV3([block_idx]).to(device)
340
+
341
+ print(f"Saving statistics for {paths[0]}")
342
+
343
+ m1, s1 = compute_statistics_of_path(
344
+ paths[0], model, batch_size, dims, device, num_workers
345
+ )
346
+
347
+ np.savez_compressed(paths[1], mu=m1, sigma=s1)
348
+
349
+
350
+ def main():
351
+ args = parser.parse_args()
352
+
353
+ if args.device is None:
354
+ device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")
355
+ else:
356
+ device = torch.device(args.device)
357
+
358
+ if args.num_workers is None:
359
+ try:
360
+ num_cpus = len(os.sched_getaffinity(0))
361
+ except AttributeError:
362
+ # os.sched_getaffinity is not available under Windows, use
363
+ # os.cpu_count instead (which may not return the *available* number
364
+ # of CPUs).
365
+ num_cpus = os.cpu_count()
366
+
367
+ num_workers = min(num_cpus, 8) if num_cpus is not None else 0
368
+ else:
369
+ num_workers = args.num_workers
370
+
371
+ if args.save_stats:
372
+ save_fid_stats(args.path, args.batch_size, device, args.dims, num_workers)
373
+ return
374
+
375
+ fid_value = calculate_fid_given_paths(
376
+ args.path, args.batch_size, device, args.dims, num_workers
377
+ )
378
+ print("FID: ", fid_value)
379
+
380
+
381
+ if __name__ == "__main__":
382
+ main()
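For the in-memory entry point, `calculate_fid` takes video batches of shape `(B, T, C, H, W)` with values in `[0, 1]` and pools all frames into a single image set before extracting Inception features. A minimal sketch (dummy data; real FID estimates need far more samples, and the InceptionV3 weights are downloaded on first use):

```python
import torch

from common.fid_score import calculate_fid

# Dummy "videos": 4 clips of 8 RGB frames at 256x256, values in [0, 1].
pred = torch.rand(4, 8, 3, 256, 256)
real = torch.rand(4, 8, 3, 256, 256)

print("FID:", calculate_fid(pred, real, batch_size=16, device="cuda", dims=2048))
```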
common/fvd/styleganv/fvd.py ADDED
@@ -0,0 +1,90 @@
1
+ import torch
2
+ import os
3
+ import math
4
+ import torch.nn.functional as F
5
+
6
+ # https://github.com/universome/fvd-comparison
7
+
8
+
9
+ def load_i3d_pretrained(device=torch.device('cpu')):
10
+ i3D_WEIGHTS_URL = "https://www.dropbox.com/s/ge9e5ujwgetktms/i3d_torchscript.pt"
11
+ filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'i3d_torchscript.pt')
12
+ print(filepath)
13
+ if not os.path.exists(filepath):
14
+ print(f"Downloading {i3D_WEIGHTS_URL}; you can also download it manually.")
15
+ os.system(f"wget {i3D_WEIGHTS_URL} -O {filepath}")
16
+ i3d = torch.jit.load(filepath).eval().to(device)
17
+ i3d = torch.nn.DataParallel(i3d)
18
+ return i3d
19
+
20
+
21
+ def get_feats(videos, detector, device, bs=10):
22
+ # videos : torch.tensor BCTHW [0, 1]
23
+ detector_kwargs = dict(rescale=False, resize=False, return_features=True) # Return raw features before the softmax layer.
24
+ feats = np.empty((0, 400))
25
+ with torch.no_grad():
26
+ for i in range((len(videos)-1)//bs + 1):
27
+ feats = np.vstack([feats, detector(torch.stack([preprocess_single(video) for video in videos[i*bs:(i+1)*bs]]).to(device), **detector_kwargs).detach().cpu().numpy()])
28
+ return feats
29
+
30
+
31
+ def get_fvd_feats(videos, i3d, device, bs=10):
32
+ # videos in [0, 1] as torch tensor BCTHW
33
+ # videos = [preprocess_single(video) for video in videos]
34
+ embeddings = get_feats(videos, i3d, device, bs)
35
+ return embeddings
36
+
37
+
38
+ def preprocess_single(video, resolution=224, sequence_length=None):
39
+ # video: CTHW, [0, 1]
40
+ c, t, h, w = video.shape
41
+
42
+ # temporal crop
43
+ if sequence_length is not None:
44
+ assert sequence_length <= t
45
+ video = video[:, :sequence_length]
46
+
47
+ # scale shorter side to resolution
48
+ scale = resolution / min(h, w)
49
+ if h < w:
50
+ target_size = (resolution, math.ceil(w * scale))
51
+ else:
52
+ target_size = (math.ceil(h * scale), resolution)
53
+ video = F.interpolate(video, size=target_size, mode='bilinear', align_corners=False)
54
+
55
+ # center crop
56
+ c, t, h, w = video.shape
57
+ w_start = (w - resolution) // 2
58
+ h_start = (h - resolution) // 2
59
+ video = video[:, :, h_start:h_start + resolution, w_start:w_start + resolution]
60
+
61
+ # [0, 1] -> [-1, 1]
62
+ video = (video - 0.5) * 2
63
+
64
+ return video.contiguous()
65
+
66
+
67
+ """
68
+ Copy-pasted from https://github.com/cvpr2022-stylegan-v/stylegan-v/blob/main/src/metrics/frechet_video_distance.py
69
+ """
70
+ from typing import Tuple
71
+ from scipy.linalg import sqrtm
72
+ import numpy as np
73
+
74
+
75
+ def compute_stats(feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
76
+ mu = feats.mean(axis=0) # [d]
77
+ sigma = np.cov(feats, rowvar=False) # [d, d]
78
+ return mu, sigma
79
+
80
+
81
+ def frechet_distance(feats_fake: np.ndarray, feats_real: np.ndarray) -> float:
82
+ mu_gen, sigma_gen = compute_stats(feats_fake)
83
+ mu_real, sigma_real = compute_stats(feats_real)
84
+ m = np.square(mu_gen - mu_real).sum()
85
+ if feats_fake.shape[0]>1:
86
+ s, _ = sqrtm(np.dot(sigma_gen, sigma_real), disp=False) # pylint: disable=no-member
87
+ fid = np.real(m + np.trace(sigma_gen + sigma_real - s * 2))
88
+ else:
89
+ fid = np.real(m)
90
+ return float(fid)
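These helpers can also be called directly, bypassing `common/calculate_fvd.py`. A minimal sketch with dummy clips in `[0, 1]`, layout `(B, C, T, H, W)`, relying on the bundled `i3d_torchscript.pt`:

```python
import torch

from common.fvd.styleganv.fvd import (
    frechet_distance,
    get_fvd_feats,
    load_i3d_pretrained,
)

device = "cuda"
i3d = load_i3d_pretrained(device=device)

# Dummy clips: (B, C, T, H, W) floats in [0, 1]; frames are resized to 224 internally.
fake = torch.rand(8, 3, 16, 64, 64)
real = torch.rand(8, 3, 16, 64, 64)

feats_fake = get_fvd_feats(fake, i3d=i3d, device=device)
feats_real = get_fvd_feats(real, i3d=i3d, device=device)
print("FVD:", frechet_distance(feats_fake, feats_real))
```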
common/fvd/styleganv/i3d_torchscript.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bec6519f66ea534e953026b4ae2c65553c17bf105611c746d904657e5860a5e2
3
+ size 51235320
common/fvd/videogpt/fvd.py ADDED
@@ -0,0 +1,137 @@
1
+ import torch
2
+ import os
3
+ import math
4
+ import torch.nn.functional as F
5
+ import numpy as np
6
+ import einops
7
+
8
+ def load_i3d_pretrained(device=torch.device('cpu')):
9
+ i3D_WEIGHTS_URL = "https://onedrive.live.com/download?cid=78EEF3EB6AE7DBCB&resid=78EEF3EB6AE7DBCB%21199&authkey=AApKdFHPXzWLNyI"
10
+ filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'i3d_pretrained_400.pt')
11
+ print(filepath)
12
+ if not os.path.exists(filepath):
13
+ print(f"Downloading {i3D_WEIGHTS_URL}; you can also download it manually.")
14
+ os.system(f"wget {i3D_WEIGHTS_URL} -O {filepath}")
15
+ from .pytorch_i3d import InceptionI3d
16
+ i3d = InceptionI3d(400, in_channels=3).eval().to(device)
17
+ i3d.load_state_dict(torch.load(filepath, map_location=device))
18
+ i3d = torch.nn.DataParallel(i3d)
19
+ return i3d
20
+
21
+ def preprocess_single(video, resolution, sequence_length=None):
22
+ # video: THWC, {0, ..., 255}
23
+ video = video.permute(0, 3, 1, 2).float() / 255. # TCHW
24
+ t, c, h, w = video.shape
25
+
26
+ # temporal crop
27
+ if sequence_length is not None:
28
+ assert sequence_length <= t
29
+ video = video[:sequence_length]
30
+
31
+ # scale shorter side to resolution
32
+ scale = resolution / min(h, w)
33
+ if h < w:
34
+ target_size = (resolution, math.ceil(w * scale))
35
+ else:
36
+ target_size = (math.ceil(h * scale), resolution)
37
+ video = F.interpolate(video, size=target_size, mode='bilinear',
38
+ align_corners=False)
39
+
40
+ # center crop
41
+ t, c, h, w = video.shape
42
+ w_start = (w - resolution) // 2
43
+ h_start = (h - resolution) // 2
44
+ video = video[:, :, h_start:h_start + resolution, w_start:w_start + resolution]
45
+ video = video.permute(1, 0, 2, 3).contiguous() # CTHW
46
+
47
+ video -= 0.5
48
+
49
+ return video
50
+
51
+ def preprocess(videos, target_resolution=224):
52
+ # transform videos in [0, 1], shape [b c t h w], torch.float
53
+ # -> videos in {0, ..., 255} [b t h w c] as np.uint8 array
54
+ videos = einops.rearrange(videos, 'b c t h w -> b t h w c')
55
+ videos = (videos*255).numpy().astype(np.uint8)
56
+
57
+ b, t, h, w, c = videos.shape
58
+ videos = torch.from_numpy(videos)
59
+ videos = torch.stack([preprocess_single(video, target_resolution) for video in videos])
60
+ return videos * 2 # [-0.5, 0.5] -> [-1, 1]
61
+
62
+ def get_fvd_logits(videos, i3d, device, bs=10):
63
+ videos = preprocess(videos)
64
+ embeddings = get_logits(i3d, videos, device, bs=bs)
65
+ return embeddings
66
+
67
+ # https://github.com/tensorflow/gan/blob/de4b8da3853058ea380a6152bd3bd454013bf619/tensorflow_gan/python/eval/classifier_metrics.py#L161
68
+ def _symmetric_matrix_square_root(mat, eps=1e-10):
69
+ u, s, v = torch.svd(mat)
70
+ si = torch.where(s < eps, s, torch.sqrt(s))
71
+ return torch.matmul(torch.matmul(u, torch.diag(si)), v.t())
72
+
73
+ # https://github.com/tensorflow/gan/blob/de4b8da3853058ea380a6152bd3bd454013bf619/tensorflow_gan/python/eval/classifier_metrics.py#L400
74
+ def trace_sqrt_product(sigma, sigma_v):
75
+ sqrt_sigma = _symmetric_matrix_square_root(sigma)
76
+ sqrt_a_sigmav_a = torch.matmul(sqrt_sigma, torch.matmul(sigma_v, sqrt_sigma))
77
+ return torch.trace(_symmetric_matrix_square_root(sqrt_a_sigmav_a))
78
+
79
+ # https://discuss.pytorch.org/t/covariance-and-gradient-support/16217/2
80
+ def cov(m, rowvar=False):
81
+ '''Estimate a covariance matrix given data.
82
+
83
+ Covariance indicates the level to which two variables vary together.
84
+ If we examine N-dimensional samples, `X = [x_1, x_2, ... x_N]^T`,
85
+ then the covariance matrix element `C_{ij}` is the covariance of
86
+ `x_i` and `x_j`. The element `C_{ii}` is the variance of `x_i`.
87
+
88
+ Args:
89
+ m: A 1-D or 2-D array containing multiple variables and observations.
90
+ Each row of `m` represents a variable, and each column a single
91
+ observation of all those variables.
92
+ rowvar: If `rowvar` is True, then each row represents a
93
+ variable, with observations in the columns. Otherwise, the
94
+ relationship is transposed: each column represents a variable,
95
+ while the rows contain observations.
96
+
97
+ Returns:
98
+ The covariance matrix of the variables.
99
+ '''
100
+ if m.dim() > 2:
101
+ raise ValueError('m has more than 2 dimensions')
102
+ if m.dim() < 2:
103
+ m = m.view(1, -1)
104
+ if not rowvar and m.size(0) != 1:
105
+ m = m.t()
106
+
107
+ fact = 1.0 / (m.size(1) - 1) # unbiased estimate
108
+ m -= torch.mean(m, dim=1, keepdim=True)
109
+ mt = m.t() # if complex: mt = m.t().conj()
110
+ return fact * m.matmul(mt).squeeze()
111
+
112
+
113
+ def frechet_distance(x1, x2):
114
+ x1 = x1.flatten(start_dim=1)
115
+ x2 = x2.flatten(start_dim=1)
116
+ m, m_w = x1.mean(dim=0), x2.mean(dim=0)
117
+ sigma, sigma_w = cov(x1, rowvar=False), cov(x2, rowvar=False)
118
+ mean = torch.sum((m - m_w) ** 2)
119
+ if x1.shape[0]>1:
120
+ sqrt_trace_component = trace_sqrt_product(sigma, sigma_w)
121
+ trace = torch.trace(sigma + sigma_w) - 2.0 * sqrt_trace_component
122
+ fd = trace + mean
123
+ else:
124
+ fd = np.real(mean)
125
+ return float(fd)
126
+
127
+
128
+ def get_logits(i3d, videos, device, bs=10):
129
+ # assert videos.shape[0] % 16 == 0
130
+ with torch.no_grad():
131
+ logits = []
132
+ for i in range(0, videos.shape[0], bs):
133
+ batch = videos[i:i + bs].to(device)
134
+ # logits.append(i3d.module.extract_features(batch)) # wrong
135
+ logits.append(i3d(batch)) # right
136
+ logits = torch.cat(logits, dim=0)
137
+ return logits
common/fvd/videogpt/i3d_pretrained_400.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55095f049e706479d48e221adcdb145b2b9dc930ba28b081ed72367ffaa32343
3
+ size 50939526
common/fvd/videogpt/pytorch_i3d.py ADDED
@@ -0,0 +1,322 @@
1
+ # Original code from https://github.com/piergiaj/pytorch-i3d
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import numpy as np
6
+
7
+ class MaxPool3dSamePadding(nn.MaxPool3d):
8
+
9
+ def compute_pad(self, dim, s):
10
+ if s % self.stride[dim] == 0:
11
+ return max(self.kernel_size[dim] - self.stride[dim], 0)
12
+ else:
13
+ return max(self.kernel_size[dim] - (s % self.stride[dim]), 0)
14
+
15
+ def forward(self, x):
16
+ # compute 'same' padding
17
+ (batch, channel, t, h, w) = x.size()
18
+ out_t = np.ceil(float(t) / float(self.stride[0]))
19
+ out_h = np.ceil(float(h) / float(self.stride[1]))
20
+ out_w = np.ceil(float(w) / float(self.stride[2]))
21
+ pad_t = self.compute_pad(0, t)
22
+ pad_h = self.compute_pad(1, h)
23
+ pad_w = self.compute_pad(2, w)
24
+
25
+ pad_t_f = pad_t // 2
26
+ pad_t_b = pad_t - pad_t_f
27
+ pad_h_f = pad_h // 2
28
+ pad_h_b = pad_h - pad_h_f
29
+ pad_w_f = pad_w // 2
30
+ pad_w_b = pad_w - pad_w_f
31
+
32
+ pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
33
+ x = F.pad(x, pad)
34
+ return super(MaxPool3dSamePadding, self).forward(x)
35
+
36
+
37
+ class Unit3D(nn.Module):
38
+
39
+ def __init__(self, in_channels,
40
+ output_channels,
41
+ kernel_shape=(1, 1, 1),
42
+ stride=(1, 1, 1),
43
+ padding=0,
44
+ activation_fn=F.relu,
45
+ use_batch_norm=True,
46
+ use_bias=False,
47
+ name='unit_3d'):
48
+
49
+ """Initializes Unit3D module."""
50
+ super(Unit3D, self).__init__()
51
+
52
+ self._output_channels = output_channels
53
+ self._kernel_shape = kernel_shape
54
+ self._stride = stride
55
+ self._use_batch_norm = use_batch_norm
56
+ self._activation_fn = activation_fn
57
+ self._use_bias = use_bias
58
+ self.name = name
59
+ self.padding = padding
60
+
61
+ self.conv3d = nn.Conv3d(in_channels=in_channels,
62
+ out_channels=self._output_channels,
63
+ kernel_size=self._kernel_shape,
64
+ stride=self._stride,
65
+ padding=0, # we always want padding to be 0 here. We will dynamically pad based on input size in forward function
66
+ bias=self._use_bias)
67
+
68
+ if self._use_batch_norm:
69
+ self.bn = nn.BatchNorm3d(self._output_channels, eps=1e-5, momentum=0.001)
70
+
71
+ def compute_pad(self, dim, s):
72
+ if s % self._stride[dim] == 0:
73
+ return max(self._kernel_shape[dim] - self._stride[dim], 0)
74
+ else:
75
+ return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0)
76
+
77
+
78
+ def forward(self, x):
79
+ # compute 'same' padding
80
+ (batch, channel, t, h, w) = x.size()
81
+ out_t = np.ceil(float(t) / float(self._stride[0]))
82
+ out_h = np.ceil(float(h) / float(self._stride[1]))
83
+ out_w = np.ceil(float(w) / float(self._stride[2]))
84
+ pad_t = self.compute_pad(0, t)
85
+ pad_h = self.compute_pad(1, h)
86
+ pad_w = self.compute_pad(2, w)
87
+
88
+ pad_t_f = pad_t // 2
89
+ pad_t_b = pad_t - pad_t_f
90
+ pad_h_f = pad_h // 2
91
+ pad_h_b = pad_h - pad_h_f
92
+ pad_w_f = pad_w // 2
93
+ pad_w_b = pad_w - pad_w_f
94
+
95
+ pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
96
+ x = F.pad(x, pad)
97
+
98
+ x = self.conv3d(x)
99
+ if self._use_batch_norm:
100
+ x = self.bn(x)
101
+ if self._activation_fn is not None:
102
+ x = self._activation_fn(x)
103
+ return x
104
+
105
+
106
+
107
+ class InceptionModule(nn.Module):
108
+ def __init__(self, in_channels, out_channels, name):
109
+ super(InceptionModule, self).__init__()
110
+
111
+ self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0,
112
+ name=name+'/Branch_0/Conv3d_0a_1x1')
113
+ self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0,
114
+ name=name+'/Branch_1/Conv3d_0a_1x1')
115
+ self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3],
116
+ name=name+'/Branch_1/Conv3d_0b_3x3')
117
+ self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0,
118
+ name=name+'/Branch_2/Conv3d_0a_1x1')
119
+ self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3],
120
+ name=name+'/Branch_2/Conv3d_0b_3x3')
121
+ self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3],
122
+ stride=(1, 1, 1), padding=0)
123
+ self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0,
124
+ name=name+'/Branch_3/Conv3d_0b_1x1')
125
+ self.name = name
126
+
127
+ def forward(self, x):
128
+ b0 = self.b0(x)
129
+ b1 = self.b1b(self.b1a(x))
130
+ b2 = self.b2b(self.b2a(x))
131
+ b3 = self.b3b(self.b3a(x))
132
+ return torch.cat([b0,b1,b2,b3], dim=1)
133
+
134
+
135
+ class InceptionI3d(nn.Module):
136
+ """Inception-v1 I3D architecture.
137
+ The model is introduced in:
138
+ Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
139
+ Joao Carreira, Andrew Zisserman
140
+ https://arxiv.org/pdf/1705.07750v1.pdf.
141
+ See also the Inception architecture, introduced in:
142
+ Going deeper with convolutions
143
+ Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
144
+ Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
145
+ http://arxiv.org/pdf/1409.4842v1.pdf.
146
+ """
147
+
148
+ # Endpoints of the model in order. During construction, all the endpoints up
149
+ # to a designated `final_endpoint` are returned in a dictionary as the
150
+ # second return value.
151
+ VALID_ENDPOINTS = (
152
+ 'Conv3d_1a_7x7',
153
+ 'MaxPool3d_2a_3x3',
154
+ 'Conv3d_2b_1x1',
155
+ 'Conv3d_2c_3x3',
156
+ 'MaxPool3d_3a_3x3',
157
+ 'Mixed_3b',
158
+ 'Mixed_3c',
159
+ 'MaxPool3d_4a_3x3',
160
+ 'Mixed_4b',
161
+ 'Mixed_4c',
162
+ 'Mixed_4d',
163
+ 'Mixed_4e',
164
+ 'Mixed_4f',
165
+ 'MaxPool3d_5a_2x2',
166
+ 'Mixed_5b',
167
+ 'Mixed_5c',
168
+ 'Logits',
169
+ 'Predictions',
170
+ )
171
+
172
+ def __init__(self, num_classes=400, spatial_squeeze=True,
173
+ final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5):
174
+ """Initializes I3D model instance.
175
+ Args:
176
+ num_classes: The number of outputs in the logit layer (default 400, which
177
+ matches the Kinetics dataset).
178
+ spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
179
+ before returning (default True).
180
+ final_endpoint: The model contains many possible endpoints.
181
+ `final_endpoint` specifies the last endpoint for the model to be built
182
+ up to. In addition to the output at `final_endpoint`, all the outputs
183
+ at endpoints up to `final_endpoint` will also be returned, in a
184
+ dictionary. `final_endpoint` must be one of
185
+ InceptionI3d.VALID_ENDPOINTS (default 'Logits').
186
+ name: A string (optional). The name of this module.
187
+ Raises:
188
+ ValueError: if `final_endpoint` is not recognized.
189
+ """
190
+
191
+ if final_endpoint not in self.VALID_ENDPOINTS:
192
+ raise ValueError('Unknown final endpoint %s' % final_endpoint)
193
+
194
+ super(InceptionI3d, self).__init__()
195
+ self._num_classes = num_classes
196
+ self._spatial_squeeze = spatial_squeeze
197
+ self._final_endpoint = final_endpoint
198
+ self.logits = None
199
+
200
+ if self._final_endpoint not in self.VALID_ENDPOINTS:
201
+ raise ValueError('Unknown final endpoint %s' % self._final_endpoint)
202
+
203
+ self.end_points = {}
204
+ end_point = 'Conv3d_1a_7x7'
205
+ self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7],
206
+ stride=(2, 2, 2), padding=(3,3,3), name=name+end_point)
207
+ if self._final_endpoint == end_point: return
208
+
209
+ end_point = 'MaxPool3d_2a_3x3'
210
+ self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2),
211
+ padding=0)
212
+ if self._final_endpoint == end_point: return
213
+
214
+ end_point = 'Conv3d_2b_1x1'
215
+ self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0,
216
+ name=name+end_point)
217
+ if self._final_endpoint == end_point: return
218
+
219
+ end_point = 'Conv3d_2c_3x3'
220
+ self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1,
221
+ name=name+end_point)
222
+ if self._final_endpoint == end_point: return
223
+
224
+ end_point = 'MaxPool3d_3a_3x3'
225
+ self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2),
226
+ padding=0)
227
+ if self._final_endpoint == end_point: return
228
+
229
+ end_point = 'Mixed_3b'
230
+ self.end_points[end_point] = InceptionModule(192, [64,96,128,16,32,32], name+end_point)
231
+ if self._final_endpoint == end_point: return
232
+
233
+ end_point = 'Mixed_3c'
234
+ self.end_points[end_point] = InceptionModule(256, [128,128,192,32,96,64], name+end_point)
235
+ if self._final_endpoint == end_point: return
236
+
237
+ end_point = 'MaxPool3d_4a_3x3'
238
+ self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2),
239
+ padding=0)
240
+ if self._final_endpoint == end_point: return
241
+
242
+ end_point = 'Mixed_4b'
243
+ self.end_points[end_point] = InceptionModule(128+192+96+64, [192,96,208,16,48,64], name+end_point)
244
+ if self._final_endpoint == end_point: return
245
+
246
+ end_point = 'Mixed_4c'
247
+ self.end_points[end_point] = InceptionModule(192+208+48+64, [160,112,224,24,64,64], name+end_point)
248
+ if self._final_endpoint == end_point: return
249
+
250
+ end_point = 'Mixed_4d'
251
+ self.end_points[end_point] = InceptionModule(160+224+64+64, [128,128,256,24,64,64], name+end_point)
252
+ if self._final_endpoint == end_point: return
253
+
254
+ end_point = 'Mixed_4e'
255
+ self.end_points[end_point] = InceptionModule(128+256+64+64, [112,144,288,32,64,64], name+end_point)
256
+ if self._final_endpoint == end_point: return
257
+
258
+ end_point = 'Mixed_4f'
259
+ self.end_points[end_point] = InceptionModule(112+288+64+64, [256,160,320,32,128,128], name+end_point)
260
+ if self._final_endpoint == end_point: return
261
+
262
+ end_point = 'MaxPool3d_5a_2x2'
263
+ self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2),
264
+ padding=0)
265
+ if self._final_endpoint == end_point: return
266
+
267
+ end_point = 'Mixed_5b'
268
+ self.end_points[end_point] = InceptionModule(256+320+128+128, [256,160,320,32,128,128], name+end_point)
269
+ if self._final_endpoint == end_point: return
270
+
271
+ end_point = 'Mixed_5c'
272
+ self.end_points[end_point] = InceptionModule(256+320+128+128, [384,192,384,48,128,128], name+end_point)
273
+ if self._final_endpoint == end_point: return
274
+
275
+ end_point = 'Logits'
276
+ self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7],
277
+ stride=(1, 1, 1))
278
+ self.dropout = nn.Dropout(dropout_keep_prob)
279
+ self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes,
280
+ kernel_shape=[1, 1, 1],
281
+ padding=0,
282
+ activation_fn=None,
283
+ use_batch_norm=False,
284
+ use_bias=True,
285
+ name='logits')
286
+
287
+ self.build()
288
+
289
+
290
+ def replace_logits(self, num_classes):
291
+ self._num_classes = num_classes
292
+ self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes,
293
+ kernel_shape=[1, 1, 1],
294
+ padding=0,
295
+ activation_fn=None,
296
+ use_batch_norm=False,
297
+ use_bias=True,
298
+ name='logits')
299
+
300
+
301
+ def build(self):
302
+ for k in self.end_points.keys():
303
+ self.add_module(k, self.end_points[k])
304
+
305
+ def forward(self, x):
306
+ for end_point in self.VALID_ENDPOINTS:
307
+ if end_point in self.end_points:
308
+ x = self._modules[end_point](x) # use _modules to work with dataparallel
309
+
310
+ x = self.logits(self.dropout(self.avg_pool(x)))
311
+ if self._spatial_squeeze:
312
+ logits = x.squeeze(3).squeeze(3)
313
+ logits = logits.mean(dim=2)
314
+ # averaging the per-frame logits over time yields a batch X classes tensor
315
+ return logits
316
+
317
+
318
+ def extract_features(self, x):
319
+ for end_point in self.VALID_ENDPOINTS:
320
+ if end_point in self.end_points:
321
+ x = self._modules[end_point](x)
322
+ return self.avg_pool(x)
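
The `InceptionI3d` above is the backbone that the FVD utilities in `common/fvd/videogpt/fvd.py` build on; `extract_features` returns the pooled Mixed_5c activations rather than class logits. Below is a minimal sketch of driving it directly, assuming the bundled `i3d_pretrained_400.pt` checkpoint matches this 400-class architecture and that clips arrive as `(batch, channels, time, height, width)` tensors; the preprocessing here is illustrative, not the repo's exact pipeline.

```python
import torch

from common.fvd.videogpt.pytorch_i3d import InceptionI3d  # module path assumed from this commit

# Build the 400-class Kinetics I3D and load the checkpoint shipped in this repo.
i3d = InceptionI3d(num_classes=400, in_channels=3)
i3d.load_state_dict(torch.load("common/fvd/videogpt/i3d_pretrained_400.pt", map_location="cpu"))
i3d.eval()

# Dummy clip batch: 2 videos, 16 RGB frames at 224x224 (value range is an assumption).
videos = torch.rand(2, 3, 16, 224, 224)
with torch.no_grad():
    feats = i3d.extract_features(videos)  # (batch, 1024, T', 1, 1) pooled Mixed_5c features
print(feats.flatten(start_dim=1).shape)   # flattened per-video feature vectors for FVD statistics
```
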
common/inception.py ADDED
@@ -0,0 +1,344 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torchvision
5
+
6
+ try:
7
+ from torchvision.models.utils import load_state_dict_from_url
8
+ except ImportError:
9
+ from torch.utils.model_zoo import load_url as load_state_dict_from_url
10
+
11
+ # Inception weights ported to Pytorch from
12
+ # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
13
+ FID_WEIGHTS_URL = "https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth" # noqa: E501
14
+
15
+
16
+ class InceptionV3(nn.Module):
17
+ """Pretrained InceptionV3 network returning feature maps"""
18
+
19
+ # Index of default block of inception to return,
20
+ # corresponds to output of final average pooling
21
+ DEFAULT_BLOCK_INDEX = 3
22
+
23
+ # Maps feature dimensionality to their output blocks indices
24
+ BLOCK_INDEX_BY_DIM = {
25
+ 64: 0, # First max pooling features
26
+ 192: 1, # Second max pooling features
27
+ 768: 2, # Pre-aux classifier features
28
+ 2048: 3, # Final average pooling features
29
+ }
30
+
31
+ def __init__(
32
+ self,
33
+ output_blocks=(DEFAULT_BLOCK_INDEX,),
34
+ resize_input=True,
35
+ normalize_input=True,
36
+ requires_grad=False,
37
+ use_fid_inception=True,
38
+ ):
39
+ """Build pretrained InceptionV3
40
+
41
+ Parameters
42
+ ----------
43
+ output_blocks : list of int
44
+ Indices of blocks to return features of. Possible values are:
45
+ - 0: corresponds to output of first max pooling
46
+ - 1: corresponds to output of second max pooling
47
+ - 2: corresponds to output which is fed to aux classifier
48
+ - 3: corresponds to output of final average pooling
49
+ resize_input : bool
50
+ If true, bilinearly resizes input to width and height 299 before
51
+ feeding input to model. As the network without fully connected
52
+ layers is fully convolutional, it should be able to handle inputs
53
+ of arbitrary size, so resizing might not be strictly needed
54
+ normalize_input : bool
55
+ If true, scales the input from range (0, 1) to the range the
56
+ pretrained Inception network expects, namely (-1, 1)
57
+ requires_grad : bool
58
+ If true, parameters of the model require gradients. Possibly useful
59
+ for finetuning the network
60
+ use_fid_inception : bool
61
+ If true, uses the pretrained Inception model used in Tensorflow's
62
+ FID implementation. If false, uses the pretrained Inception model
63
+ available in torchvision. The FID Inception model has different
64
+ weights and a slightly different structure from torchvision's
65
+ Inception model. If you want to compute FID scores, you are
66
+ strongly advised to set this parameter to true to get comparable
67
+ results.
68
+ """
69
+ super(InceptionV3, self).__init__()
70
+
71
+ self.resize_input = resize_input
72
+ self.normalize_input = normalize_input
73
+ self.output_blocks = sorted(output_blocks)
74
+ self.last_needed_block = max(output_blocks)
75
+
76
+ assert self.last_needed_block <= 3, "Last possible output block index is 3"
77
+
78
+ self.blocks = nn.ModuleList()
79
+
80
+ if use_fid_inception:
81
+ inception = fid_inception_v3()
82
+ else:
83
+ inception = _inception_v3(weights="DEFAULT")
84
+
85
+ # Block 0: input to maxpool1
86
+ block0 = [
87
+ inception.Conv2d_1a_3x3,
88
+ inception.Conv2d_2a_3x3,
89
+ inception.Conv2d_2b_3x3,
90
+ nn.MaxPool2d(kernel_size=3, stride=2),
91
+ ]
92
+ self.blocks.append(nn.Sequential(*block0))
93
+
94
+ # Block 1: maxpool1 to maxpool2
95
+ if self.last_needed_block >= 1:
96
+ block1 = [
97
+ inception.Conv2d_3b_1x1,
98
+ inception.Conv2d_4a_3x3,
99
+ nn.MaxPool2d(kernel_size=3, stride=2),
100
+ ]
101
+ self.blocks.append(nn.Sequential(*block1))
102
+
103
+ # Block 2: maxpool2 to aux classifier
104
+ if self.last_needed_block >= 2:
105
+ block2 = [
106
+ inception.Mixed_5b,
107
+ inception.Mixed_5c,
108
+ inception.Mixed_5d,
109
+ inception.Mixed_6a,
110
+ inception.Mixed_6b,
111
+ inception.Mixed_6c,
112
+ inception.Mixed_6d,
113
+ inception.Mixed_6e,
114
+ ]
115
+ self.blocks.append(nn.Sequential(*block2))
116
+
117
+ # Block 3: aux classifier to final avgpool
118
+ if self.last_needed_block >= 3:
119
+ block3 = [
120
+ inception.Mixed_7a,
121
+ inception.Mixed_7b,
122
+ inception.Mixed_7c,
123
+ nn.AdaptiveAvgPool2d(output_size=(1, 1)),
124
+ ]
125
+ self.blocks.append(nn.Sequential(*block3))
126
+
127
+ for param in self.parameters():
128
+ param.requires_grad = requires_grad
129
+
130
+ def forward(self, inp):
131
+ """Get Inception feature maps
132
+
133
+ Parameters
134
+ ----------
135
+ inp : torch.autograd.Variable
136
+ Input tensor of shape Bx3xHxW. Values are expected to be in
137
+ range (0, 1)
138
+
139
+ Returns
140
+ -------
141
+ List of torch.autograd.Variable, corresponding to the selected output
142
+ block, sorted ascending by index
143
+ """
144
+ outp = []
145
+ x = inp
146
+
147
+ if self.resize_input:
148
+ x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=False)
149
+
150
+ if self.normalize_input:
151
+ x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
152
+
153
+ for idx, block in enumerate(self.blocks):
154
+ x = block(x)
155
+ if idx in self.output_blocks:
156
+ outp.append(x)
157
+
158
+ if idx == self.last_needed_block:
159
+ break
160
+
161
+ return outp
162
+
163
+
164
+ def _inception_v3(*args, **kwargs):
165
+ """Wraps `torchvision.models.inception_v3`"""
166
+ try:
167
+ version = tuple(map(int, torchvision.__version__.split(".")[:2]))
168
+ except ValueError:
169
+ # Just a caution against weird version strings
170
+ version = (0,)
171
+
172
+ # Skips default weight initialization if supported by torchvision
173
+ # version. See https://github.com/mseitzer/pytorch-fid/issues/28.
174
+ if version >= (0, 6):
175
+ kwargs["init_weights"] = False
176
+
177
+ # Backwards compatibility: `weights` argument was handled by `pretrained`
178
+ # argument prior to version 0.13.
179
+ if version < (0, 13) and "weights" in kwargs:
180
+ if kwargs["weights"] == "DEFAULT":
181
+ kwargs["pretrained"] = True
182
+ elif kwargs["weights"] is None:
183
+ kwargs["pretrained"] = False
184
+ else:
185
+ raise ValueError(
186
+ "weights=={} not supported in torchvision {}".format(
187
+ kwargs["weights"], torchvision.__version__
188
+ )
189
+ )
190
+ del kwargs["weights"]
191
+
192
+ return torchvision.models.inception_v3(*args, **kwargs)
193
+
194
+
195
+ def fid_inception_v3():
196
+ """Build pretrained Inception model for FID computation
197
+
198
+ The Inception model for FID computation uses a different set of weights
199
+ and has a slightly different structure than torchvision's Inception.
200
+
201
+ This method first constructs torchvision's Inception and then patches the
202
+ necessary parts that are different in the FID Inception model.
203
+ """
204
+ inception = _inception_v3(num_classes=1008, aux_logits=False, weights=None)
205
+ inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
206
+ inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
207
+ inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
208
+ inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
209
+ inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
210
+ inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
211
+ inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
212
+ inception.Mixed_7b = FIDInceptionE_1(1280)
213
+ inception.Mixed_7c = FIDInceptionE_2(2048)
214
+
215
+ state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
216
+ inception.load_state_dict(state_dict)
217
+ return inception
218
+
219
+
220
+ class FIDInceptionA(torchvision.models.inception.InceptionA):
221
+ """InceptionA block patched for FID computation"""
222
+
223
+ def __init__(self, in_channels, pool_features):
224
+ super(FIDInceptionA, self).__init__(in_channels, pool_features)
225
+
226
+ def forward(self, x):
227
+ branch1x1 = self.branch1x1(x)
228
+
229
+ branch5x5 = self.branch5x5_1(x)
230
+ branch5x5 = self.branch5x5_2(branch5x5)
231
+
232
+ branch3x3dbl = self.branch3x3dbl_1(x)
233
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
234
+ branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
235
+
236
+ # Patch: Tensorflow's average pool does not use the padded zeros in
237
+ # its average calculation
238
+ branch_pool = F.avg_pool2d(
239
+ x, kernel_size=3, stride=1, padding=1, count_include_pad=False
240
+ )
241
+ branch_pool = self.branch_pool(branch_pool)
242
+
243
+ outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
244
+ return torch.cat(outputs, 1)
245
+
246
+
247
+ class FIDInceptionC(torchvision.models.inception.InceptionC):
248
+ """InceptionC block patched for FID computation"""
249
+
250
+ def __init__(self, in_channels, channels_7x7):
251
+ super(FIDInceptionC, self).__init__(in_channels, channels_7x7)
252
+
253
+ def forward(self, x):
254
+ branch1x1 = self.branch1x1(x)
255
+
256
+ branch7x7 = self.branch7x7_1(x)
257
+ branch7x7 = self.branch7x7_2(branch7x7)
258
+ branch7x7 = self.branch7x7_3(branch7x7)
259
+
260
+ branch7x7dbl = self.branch7x7dbl_1(x)
261
+ branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
262
+ branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
263
+ branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
264
+ branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
265
+
266
+ # Patch: Tensorflow's average pool does not use the padded zeros in
267
+ # its average calculation
268
+ branch_pool = F.avg_pool2d(
269
+ x, kernel_size=3, stride=1, padding=1, count_include_pad=False
270
+ )
271
+ branch_pool = self.branch_pool(branch_pool)
272
+
273
+ outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
274
+ return torch.cat(outputs, 1)
275
+
276
+
277
+ class FIDInceptionE_1(torchvision.models.inception.InceptionE):
278
+ """First InceptionE block patched for FID computation"""
279
+
280
+ def __init__(self, in_channels):
281
+ super(FIDInceptionE_1, self).__init__(in_channels)
282
+
283
+ def forward(self, x):
284
+ branch1x1 = self.branch1x1(x)
285
+
286
+ branch3x3 = self.branch3x3_1(x)
287
+ branch3x3 = [
288
+ self.branch3x3_2a(branch3x3),
289
+ self.branch3x3_2b(branch3x3),
290
+ ]
291
+ branch3x3 = torch.cat(branch3x3, 1)
292
+
293
+ branch3x3dbl = self.branch3x3dbl_1(x)
294
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
295
+ branch3x3dbl = [
296
+ self.branch3x3dbl_3a(branch3x3dbl),
297
+ self.branch3x3dbl_3b(branch3x3dbl),
298
+ ]
299
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
300
+
301
+ # Patch: Tensorflow's average pool does not use the padded zeros in
302
+ # its average calculation
303
+ branch_pool = F.avg_pool2d(
304
+ x, kernel_size=3, stride=1, padding=1, count_include_pad=False
305
+ )
306
+ branch_pool = self.branch_pool(branch_pool)
307
+
308
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
309
+ return torch.cat(outputs, 1)
310
+
311
+
312
+ class FIDInceptionE_2(torchvision.models.inception.InceptionE):
313
+ """Second InceptionE block patched for FID computation"""
314
+
315
+ def __init__(self, in_channels):
316
+ super(FIDInceptionE_2, self).__init__(in_channels)
317
+
318
+ def forward(self, x):
319
+ branch1x1 = self.branch1x1(x)
320
+
321
+ branch3x3 = self.branch3x3_1(x)
322
+ branch3x3 = [
323
+ self.branch3x3_2a(branch3x3),
324
+ self.branch3x3_2b(branch3x3),
325
+ ]
326
+ branch3x3 = torch.cat(branch3x3, 1)
327
+
328
+ branch3x3dbl = self.branch3x3dbl_1(x)
329
+ branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
330
+ branch3x3dbl = [
331
+ self.branch3x3dbl_3a(branch3x3dbl),
332
+ self.branch3x3dbl_3b(branch3x3dbl),
333
+ ]
334
+ branch3x3dbl = torch.cat(branch3x3dbl, 1)
335
+
336
+ # Patch: The FID Inception model uses max pooling instead of average
337
+ # pooling. This is likely an error in this specific Inception
338
+ # implementation, as other Inception models use average pooling here
339
+ # (which matches the description in the paper).
340
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
341
+ branch_pool = self.branch_pool(branch_pool)
342
+
343
+ outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
344
+ return torch.cat(outputs, 1)
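
`InceptionV3` wraps torchvision's Inception into the block structure the FID computation in `common/fid_score.py` expects; with the default `use_fid_inception=True` it fetches the ported TF FID weights from `FID_WEIGHTS_URL` on first use. A minimal sketch of pulling the 2048-d final-pool features used for the FID statistics (batch size and image size here are arbitrary):

```python
import torch

from common.inception import InceptionV3  # module path assumed from this commit

# Select the final-average-pool block, which yields 2048-d features.
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[2048]
model = InceptionV3(output_blocks=[block_idx]).eval()

# Frames are expected in [0, 1]; they are resized to 299x299 and rescaled to [-1, 1] internally.
frames = torch.rand(8, 3, 256, 256)
with torch.no_grad():
    feats = model(frames)[0]           # (8, 2048, 1, 1) from the selected block
feats = feats.squeeze(-1).squeeze(-1)  # (8, 2048) vectors to accumulate FID mean/covariance
```
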
common/plot/__init__.py ADDED
File without changes
common/plot/aggregated_output.csv ADDED
@@ -0,0 +1,18 @@
1
+ name,bridge_data_v2/teacher_force_psnr,bridge_data_v2/teacher_force_psnr_delta,bridge_data_v2/teacher_force_ssim,bridge_data_v2/teacher_force_pred_lpips,bridge_data_v2/teacher_force_loss,bridge_data_v2/num_examples,fractal20220817_data/teacher_force_psnr,fractal20220817_data/teacher_force_psnr_delta,fractal20220817_data/teacher_force_ssim,fractal20220817_data/teacher_force_pred_lpips,fractal20220817_data/teacher_force_loss,fractal20220817_data/num_examples,language_table/teacher_force_psnr,language_table/teacher_force_psnr_delta,language_table/teacher_force_ssim,language_table/teacher_force_pred_lpips,language_table/teacher_force_loss,language_table/num_examples,ucsd_pick_and_place_dataset_converted_externally_to_rlds/teacher_force_psnr,ucsd_pick_and_place_dataset_converted_externally_to_rlds/teacher_force_psnr_delta,ucsd_pick_and_place_dataset_converted_externally_to_rlds/teacher_force_ssim,ucsd_pick_and_place_dataset_converted_externally_to_rlds/teacher_force_pred_lpips,ucsd_pick_and_place_dataset_converted_externally_to_rlds/teacher_force_loss,ucsd_pick_and_place_dataset_converted_externally_to_rlds/num_examples,kaist_nonprehensile_converted_externally_to_rlds/teacher_force_psnr,kaist_nonprehensile_converted_externally_to_rlds/teacher_force_psnr_delta,kaist_nonprehensile_converted_externally_to_rlds/teacher_force_ssim,kaist_nonprehensile_converted_externally_to_rlds/teacher_force_pred_lpips,kaist_nonprehensile_converted_externally_to_rlds/teacher_force_loss,kaist_nonprehensile_converted_externally_to_rlds/num_examples,ucsd_kitchen_dataset_converted_externally_to_rlds/teacher_force_psnr,ucsd_kitchen_dataset_converted_externally_to_rlds/teacher_force_psnr_delta,ucsd_kitchen_dataset_converted_externally_to_rlds/teacher_force_ssim,ucsd_kitchen_dataset_converted_externally_to_rlds/teacher_force_pred_lpips,ucsd_kitchen_dataset_converted_externally_to_rlds/teacher_force_loss,ucsd_kitchen_dataset_converted_externally_to_rlds/num_examples,utokyo_xarm_bimanual_converted_externally_to_rlds/teacher_force_psnr,utokyo_xarm_bimanual_converted_externally_to_rlds/teacher_force_psnr_delta,utokyo_xarm_bimanual_converted_externally_to_rlds/teacher_force_ssim,utokyo_xarm_bimanual_converted_externally_to_rlds/teacher_force_pred_lpips,utokyo_xarm_bimanual_converted_externally_to_rlds/teacher_force_loss,utokyo_xarm_bimanual_converted_externally_to_rlds/num_examples,stanford_hydra_dataset_converted_externally_to_rlds/teacher_force_psnr,stanford_hydra_dataset_converted_externally_to_rlds/teacher_force_psnr_delta,stanford_hydra_dataset_converted_externally_to_rlds/teacher_force_ssim,stanford_hydra_dataset_converted_externally_to_rlds/teacher_force_pred_lpips,stanford_hydra_dataset_converted_externally_to_rlds/teacher_force_loss,stanford_hydra_dataset_converted_externally_to_rlds/num_examples,austin_sirius_dataset_converted_externally_to_rlds/teacher_force_psnr,austin_sirius_dataset_converted_externally_to_rlds/teacher_force_psnr_delta,austin_sirius_dataset_converted_externally_to_rlds/teacher_force_ssim,austin_sirius_dataset_converted_externally_to_rlds/teacher_force_pred_lpips,austin_sirius_dataset_converted_externally_to_rlds/teacher_force_loss,austin_sirius_dataset_converted_externally_to_rlds/num_examples,berkeley_fanuc_manipulation/teacher_force_psnr,berkeley_fanuc_manipulation/teacher_force_psnr_delta,berkeley_fanuc_manipulation/teacher_force_ssim,berkeley_fanuc_manipulation/teacher_force_pred_lpips,berkeley_fanuc_manipulation/teacher_force_loss,berkeley_fanuc_manipulation/num_examples,berkeley_mvp_convert
ed_externally_to_rlds/teacher_force_psnr,berkeley_mvp_converted_externally_to_rlds/teacher_force_psnr_delta,berkeley_mvp_converted_externally_to_rlds/teacher_force_ssim,berkeley_mvp_converted_externally_to_rlds/teacher_force_pred_lpips,berkeley_mvp_converted_externally_to_rlds/teacher_force_loss,berkeley_mvp_converted_externally_to_rlds/num_examples,berkeley_rpt_converted_externally_to_rlds/teacher_force_psnr,berkeley_rpt_converted_externally_to_rlds/teacher_force_psnr_delta,berkeley_rpt_converted_externally_to_rlds/teacher_force_ssim,berkeley_rpt_converted_externally_to_rlds/teacher_force_pred_lpips,berkeley_rpt_converted_externally_to_rlds/teacher_force_loss,berkeley_rpt_converted_externally_to_rlds/num_examples,cmu_play_fusion/teacher_force_psnr,cmu_play_fusion/teacher_force_psnr_delta,cmu_play_fusion/teacher_force_ssim,cmu_play_fusion/teacher_force_pred_lpips,cmu_play_fusion/teacher_force_loss,cmu_play_fusion/num_examples,iamlab_cmu_pickup_insert_converted_externally_to_rlds/teacher_force_psnr,iamlab_cmu_pickup_insert_converted_externally_to_rlds/teacher_force_psnr_delta,iamlab_cmu_pickup_insert_converted_externally_to_rlds/teacher_force_ssim,iamlab_cmu_pickup_insert_converted_externally_to_rlds/teacher_force_pred_lpips,iamlab_cmu_pickup_insert_converted_externally_to_rlds/teacher_force_loss,iamlab_cmu_pickup_insert_converted_externally_to_rlds/num_examples,qut_dexterous_manpulation/teacher_force_psnr,qut_dexterous_manpulation/teacher_force_psnr_delta,qut_dexterous_manpulation/teacher_force_ssim,qut_dexterous_manpulation/teacher_force_pred_lpips,qut_dexterous_manpulation/teacher_force_loss,qut_dexterous_manpulation/num_examples,robo_net/teacher_force_psnr,robo_net/teacher_force_psnr_delta,robo_net/teacher_force_ssim,robo_net/teacher_force_pred_lpips,robo_net/teacher_force_loss,robo_net/num_examples,furniture_bench_dataset_converted_externally_to_rlds/teacher_force_psnr,furniture_bench_dataset_converted_externally_to_rlds/teacher_force_psnr_delta,furniture_bench_dataset_converted_externally_to_rlds/teacher_force_ssim,furniture_bench_dataset_converted_externally_to_rlds/teacher_force_pred_lpips,furniture_bench_dataset_converted_externally_to_rlds/teacher_force_loss,furniture_bench_dataset_converted_externally_to_rlds/num_examples,dlr_sara_grid_clamp_converted_externally_to_rlds/teacher_force_psnr,dlr_sara_grid_clamp_converted_externally_to_rlds/teacher_force_psnr_delta,dlr_sara_grid_clamp_converted_externally_to_rlds/teacher_force_ssim,dlr_sara_grid_clamp_converted_externally_to_rlds/teacher_force_pred_lpips,dlr_sara_grid_clamp_converted_externally_to_rlds/teacher_force_loss,dlr_sara_grid_clamp_converted_externally_to_rlds/num_examples,cmu_stretch/teacher_force_psnr,cmu_stretch/teacher_force_psnr_delta,cmu_stretch/teacher_force_ssim,cmu_stretch/teacher_force_pred_lpips,cmu_stretch/teacher_force_loss,cmu_stretch/num_examples,spoc/teacher_force_psnr,spoc/teacher_force_psnr_delta,spoc/teacher_force_ssim,spoc/teacher_force_pred_lpips,spoc/teacher_force_loss,spoc/num_examples,columbia_cairlab_pusht_real/teacher_force_psnr,columbia_cairlab_pusht_real/teacher_force_psnr_delta,columbia_cairlab_pusht_real/teacher_force_ssim,columbia_cairlab_pusht_real/teacher_force_pred_lpips,columbia_cairlab_pusht_real/teacher_force_loss,columbia_cairlab_pusht_real/num_examples,droid/teacher_force_psnr,droid/teacher_force_psnr_delta,droid/teacher_force_ssim,droid/teacher_force_pred_lpips,droid/teacher_force_loss,droid/num_examples,toto/teacher_force_psnr,toto/teacher_force_psnr_delta,toto/teacher_force_ssim,toto/
teacher_force_pred_lpips,toto/teacher_force_loss,toto/num_examples,io_ai_tech/teacher_force_psnr,io_ai_tech/teacher_force_psnr_delta,io_ai_tech/teacher_force_ssim,io_ai_tech/teacher_force_pred_lpips,io_ai_tech/teacher_force_loss,io_ai_tech/num_examples,conq_hose_manipulation/teacher_force_psnr,conq_hose_manipulation/teacher_force_psnr_delta,conq_hose_manipulation/teacher_force_ssim,conq_hose_manipulation/teacher_force_pred_lpips,conq_hose_manipulation/teacher_force_loss,conq_hose_manipulation/num_examples,dobbe/teacher_force_psnr,dobbe/teacher_force_psnr_delta,dobbe/teacher_force_ssim,dobbe/teacher_force_pred_lpips,dobbe/teacher_force_loss,dobbe/num_examples,berkeley_gnm_cory_hall/teacher_force_psnr,berkeley_gnm_cory_hall/teacher_force_psnr_delta,berkeley_gnm_cory_hall/teacher_force_ssim,berkeley_gnm_cory_hall/teacher_force_pred_lpips,berkeley_gnm_cory_hall/teacher_force_loss,berkeley_gnm_cory_hall/num_examples,plex_robosuite/teacher_force_psnr,plex_robosuite/teacher_force_psnr_delta,plex_robosuite/teacher_force_ssim,plex_robosuite/teacher_force_pred_lpips,plex_robosuite/teacher_force_loss,plex_robosuite/num_examples,usc_cloth_sim_converted_externally_to_rlds/teacher_force_psnr,usc_cloth_sim_converted_externally_to_rlds/teacher_force_psnr_delta,usc_cloth_sim_converted_externally_to_rlds/teacher_force_ssim,usc_cloth_sim_converted_externally_to_rlds/teacher_force_pred_lpips,usc_cloth_sim_converted_externally_to_rlds/teacher_force_loss,usc_cloth_sim_converted_externally_to_rlds/num_examples,berkeley_cable_routing/teacher_force_psnr,berkeley_cable_routing/teacher_force_psnr_delta,berkeley_cable_routing/teacher_force_ssim,berkeley_cable_routing/teacher_force_pred_lpips,berkeley_cable_routing/teacher_force_loss,berkeley_cable_routing/num_examples,imperial_wrist_dataset/teacher_force_psnr,imperial_wrist_dataset/teacher_force_psnr_delta,imperial_wrist_dataset/teacher_force_ssim,imperial_wrist_dataset/teacher_force_pred_lpips,imperial_wrist_dataset/teacher_force_loss,imperial_wrist_dataset/num_examples,bc_z/teacher_force_psnr,bc_z/teacher_force_psnr_delta,bc_z/teacher_force_ssim,bc_z/teacher_force_pred_lpips,bc_z/teacher_force_loss,bc_z/num_examples,kuka/teacher_force_psnr,kuka/teacher_force_psnr_delta,kuka/teacher_force_ssim,kuka/teacher_force_pred_lpips,kuka/teacher_force_loss,kuka/num_examples,roboturk/teacher_force_psnr,roboturk/teacher_force_psnr_delta,roboturk/teacher_force_ssim,roboturk/teacher_force_pred_lpips,roboturk/teacher_force_loss,roboturk/num_examples,metaworld/teacher_force_psnr,metaworld/teacher_force_psnr_delta,metaworld/teacher_force_ssim,metaworld/teacher_force_pred_lpips,metaworld/teacher_force_loss,metaworld/num_examples,robomimic/teacher_force_psnr,robomimic/teacher_force_psnr_delta,robomimic/teacher_force_ssim,robomimic/teacher_force_pred_lpips,robomimic/teacher_force_loss,robomimic/num_examples,epic_kitchen/teacher_force_psnr,epic_kitchen/teacher_force_psnr_delta,epic_kitchen/teacher_force_ssim,epic_kitchen/teacher_force_pred_lpips,epic_kitchen/teacher_force_loss,epic_kitchen/num_examples,ego4d/teacher_force_psnr,ego4d/teacher_force_psnr_delta,ego4d/teacher_force_ssim,ego4d/teacher_force_pred_lpips,ego4d/teacher_force_loss,ego4d/num_examples,nyu_door_opening_surprising_effectiveness/teacher_force_psnr,nyu_door_opening_surprising_effectiveness/teacher_force_psnr_delta,nyu_door_opening_surprising_effectiveness/teacher_force_ssim,nyu_door_opening_surprising_effectiveness/teacher_force_pred_lpips,nyu_door_opening_surprising_effectiveness/teacher_force_loss,nyu_door_opening_surp
rising_effectiveness/num_examples
2
+ 20.11654281616211,0.2406988888978958,0.6522064805030823,0.16225013136863708,4.879761219024658,500,22.18416404724121,0.35506966710090637,0.6869667768478394,0.15024960041046143,4.770835876464844,500,21.715396881103516,0.3029274344444275,0.6992139220237732,0.16471771895885468,5.294665813446045,198,21.16069793701172,0.8068287372589111,0.856053352355957,0.13014769554138184,6.175273895263672,268,20.990825653076172,-0.10340484231710434,0.7044205069541931,0.18493501842021942,7.725522518157959,215,15.698708534240723,0.0012377961538732052,0.5898451209068298,0.17795829474925995,5.001875400543213,44,19.282543182373047,0.25010883808135986,0.6505519151687622,0.17733542621135712,5.174380779266357,26,17.12480926513672,-0.2587044835090637,0.705141007900238,0.19221702218055725,5.820067882537842,500,19.765209197998047,0.520576000213623,0.7926568984985352,0.1714717447757721,6.499162673950195,500,24.512998580932617,0.05698919668793678,0.7914198637008667,0.10842812061309814,3.0430972576141357,197,25.59234619140625,0.10779287666082382,0.8336195349693298,0.19609792530536652,9.673583984375,145,19.04483985900879,-1.549579381942749,0.8432617783546448,0.23590296506881714,9.34917163848877,500,25.48477554321289,-0.2415604293346405,0.808574378490448,0.14057636260986328,3.712907552719116,500,20.40241241455078,-1.7759240865707397,0.695418119430542,0.18546271324157715,5.312252044677734,500,18.150922775268555,0.07440949231386185,0.623611569404602,0.19956742227077484,2.496238946914673,500,19.270723342895508,0.13051848113536835,0.6409730315208435,0.21255843341350555,5.373872756958008,400,14.527210235595703,-1.1576420068740845,0.6855177283287048,0.2406100034713745,6.9484028816223145,500,22.118375778198242,0.12427309900522232,0.760040819644928,0.13304449617862701,2.0709831714630127,142,27.101343154907227,0.002104964340105653,0.8569799661636353,0.11864025890827179,4.056473731994629,304,14.499595642089844,-2.287384033203125,0.6487097144126892,0.5801433324813843,11.471040725708008,500,19.862470626831055,0.004642155021429062,0.8116016387939453,0.15320290625095367,7.407879829406738,290,18.82098960876465,1.036180019378662,0.6922352910041809,0.18524041771888733,5.3734941482543945,500,19.008501052856445,0.08843769133090973,0.654740035533905,0.19600719213485718,5.878746032714844,500,25.3173885345459,-1.4838885068893433,0.8376902341842651,0.091035857796669,6.641345977783203,500,0,0,0,0,0,0,23.15846061706543,0.19177615642547607,0.7468024492263794,0.14149916172027588,7.459285259246826,500,20.05571746826172,-0.6787086129188538,0.8033839464187622,0.23725976049900055,9.829147338867188,185,25.470836639404297,-0.04034167900681496,0.8336919546127319,0.13884975016117096,2.244248867034912,307,27.915611267089844,2.5503063201904297,0.9527876377105713,0.09793127328157425,1.2040963172912598,200,20.912508010864258,0.591270387172699,0.8214857578277588,0.2098020762205124,4.666227340698242,3,23.71303939819336,-0.19690543413162231,0.6411390900611877,0.09051118791103363,8.156048774719238,58,23.800819396972656,0.357938289642334,0.7015560269355774,0.16095396876335144,5.172791004180908,500,19.602933883666992,0.08206112682819366,0.6492921710014343,0.20972613990306854,5.722141265869141,136,15.88235092163086,0.034370679408311844,0.6604549288749695,0.26275190711021423,7.8473687171936035,475,0,0,0,0,0,0,20.04568099975586,-0.6712338328361511,0.8586370944976807,0.14845611155033112,4.173150062561035,60,0,0,0,0,0,0,0,0,0,0,0,0,16.780973434448242,0.36480912566185,0.5418305993080139,0.23359735310077667,9.743389129638672,63,0
3
+ 20.11654281616211,0.2406988888978958,0.6522064805030823,0.16225013136863708,4.879761219024658,500,22.18416404724121,0.35506966710090637,0.6869667768478394,0.15024960041046143,4.770835876464844,500,21.715396881103516,0.3029274344444275,0.6992139220237732,0.16471771895885468,5.294665813446045,198,21.16069793701172,0.8068287372589111,0.856053352355957,0.13014769554138184,6.175273895263672,268,20.990825653076172,-0.10340484231710434,0.7044205069541931,0.18493501842021942,7.725522518157959,215,15.698708534240723,0.0012377961538732052,0.5898451209068298,0.17795829474925995,5.001875400543213,44,19.282543182373047,0.25010883808135986,0.6505519151687622,0.17733542621135712,5.174380779266357,26,17.12480926513672,-0.2587044835090637,0.705141007900238,0.19221702218055725,5.820067882537842,500,19.765209197998047,0.520576000213623,0.7926568984985352,0.1714717447757721,6.499162673950195,500,24.512998580932617,0.05698919668793678,0.7914198637008667,0.10842812061309814,3.0430972576141357,197,25.59234619140625,0.10779287666082382,0.8336195349693298,0.19609792530536652,9.673583984375,145,19.04483985900879,-1.549579381942749,0.8432617783546448,0.23590296506881714,9.34917163848877,500,25.48477554321289,-0.2415604293346405,0.808574378490448,0.14057636260986328,3.712907552719116,500,20.40241241455078,-1.7759240865707397,0.695418119430542,0.18546271324157715,5.312252044677734,500,18.150922775268555,0.07440949231386185,0.623611569404602,0.19956742227077484,2.496238946914673,500,19.270723342895508,0.13051848113536835,0.6409730315208435,0.21255843341350555,5.373872756958008,400,14.527210235595703,-1.1576420068740845,0.6855177283287048,0.2406100034713745,6.9484028816223145,500,22.118375778198242,0.12427309900522232,0.760040819644928,0.13304449617862701,2.0709831714630127,142,27.101343154907227,0.002104964340105653,0.8569799661636353,0.11864025890827179,4.056473731994629,304,14.499595642089844,-2.287384033203125,0.6487097144126892,0.5801433324813843,11.471040725708008,500,19.862470626831055,0.004642155021429062,0.8116016387939453,0.15320290625095367,7.407879829406738,290,18.82098960876465,1.036180019378662,0.6922352910041809,0.18524041771888733,5.3734941482543945,500,19.008501052856445,0.08843769133090973,0.654740035533905,0.19600719213485718,5.878746032714844,500,25.3173885345459,-1.4838885068893433,0.8376902341842651,0.091035857796669,6.641345977783203,500,0,0,0,0,0,0,23.15846061706543,0.19177615642547607,0.7468024492263794,0.14149916172027588,7.459285259246826,500,20.05571746826172,-0.6787086129188538,0.8033839464187622,0.23725976049900055,9.829147338867188,185,25.470836639404297,-0.04034167900681496,0.8336919546127319,0.13884975016117096,2.244248867034912,307,27.915611267089844,2.5503063201904297,0.9527876377105713,0.09793127328157425,1.2040963172912598,200,20.912508010864258,0.591270387172699,0.8214857578277588,0.2098020762205124,4.666227340698242,3,23.71303939819336,-0.19690543413162231,0.6411390900611877,0.09051118791103363,8.156048774719238,58,23.800819396972656,0.357938289642334,0.7015560269355774,0.16095396876335144,5.172791004180908,500,19.602933883666992,0.08206112682819366,0.6492921710014343,0.20972613990306854,5.722141265869141,136,15.88235092163086,0.034370679408311844,0.6604549288749695,0.26275190711021423,7.8473687171936035,475,0,0,0,0,0,0,20.04568099975586,-0.6712338328361511,0.8586370944976807,0.14845611155033112,4.173150062561035,60,0,0,0,0,0,0,0,0,0,0,0,0,16.780973434448242,0.36480912566185,0.5418305993080139,0.23359735310077667,9.743389129638672,63,0
4
+ 20.22026252746582,0.38262513279914856,0.6533553600311279,0.1599833220243454,4.625724792480469,500,22.20106315612793,1.1525933742523193,0.6865711212158203,0.1498977541923523,4.709808826446533,500,21.663299560546875,0.315200537443161,0.6973788142204285,0.16729794442653656,5.099008083343506,198,21.049013137817383,1.241530179977417,0.8553103804588318,0.1382942497730255,6.023053169250488,268,20.187990188598633,0.2288614809513092,0.6624469757080078,0.21102775633335114,8.721923828125,215,14.019247055053711,0.1637289673089981,0.5697340369224548,0.2109413594007492,5.538760662078857,44,19.34811782836914,1.7672139406204224,0.65287184715271,0.17289860546588898,4.801314830780029,26,17.78873634338379,0.7138077020645142,0.7086768746376038,0.17418105900287628,5.320345878601074,500,19.624094009399414,0.8938031792640686,0.7965993285179138,0.19896067678928375,6.5370965003967285,500,24.533777236938477,0.036842938512563705,0.7917919754981995,0.10872980952262878,2.8953208923339844,197,25.017925262451172,0.39651936292648315,0.8296465873718262,0.19340361654758453,9.546339988708496,145,19.407617568969727,0.20671948790550232,0.8511928915977478,0.21369731426239014,8.630515098571777,500,25.74650001525879,0.004801941104233265,0.8135946989059448,0.13683146238327026,3.3414487838745117,500,18.16292953491211,-1.5063064098358154,0.6798076033592224,0.2016068398952484,4.924650192260742,500,17.171728134155273,0.12000428140163422,0.6030166149139404,0.21893559396266937,2.7387208938598633,200,19.084640502929688,0.17159269750118256,0.6400240659713745,0.21725738048553467,5.156170845031738,200,14.959861755371094,-0.08570398390293121,0.6782578825950623,0.24334610998630524,6.023822784423828,200,20.60854148864746,1.239460825920105,0.7399578094482422,0.15486611425876617,2.6809659004211426,142,27.038379669189453,-0.04835420474410057,0.8582215905189514,0.11914262175559998,3.703599214553833,200,16.528011322021484,0.07733757048845291,0.6863109469413757,0.43281883001327515,10.088181495666504,200,20.27916717529297,0.39283424615859985,0.8197209239006042,0.1518518626689911,7.038107872009277,200,19.77338218688965,-0.10181392729282379,0.7324086427688599,0.1659238040447235,5.009759426116943,200,16.711793899536133,-0.7416815161705017,0.5757627487182617,0.25752225518226624,7.026858329772949,200,26.67308807373047,-0.19710086286067963,0.849867582321167,0.08147656917572021,6.09824275970459,200,0,0,0,0,0,0,22.918180465698242,0.21646614372730255,0.7544977068901062,0.13648433983325958,6.675195693969727,200,20.61359977722168,-0.2607249617576599,0.8171404600143433,0.2431134730577469,9.399118423461914,185,25.659948348999023,-0.007021207828074694,0.8343022465705872,0.13590268790721893,2.109799385070801,200,28.551301956176758,0.999920666217804,0.9539777040481567,0.09078202396631241,1.1626181602478027,200,20.705917358398438,0.19353322684764862,0.822070300579071,0.2162921279668808,4.443334579467773,3,23.890663146972656,0.042181383818387985,0.6438125371932983,0.09075239300727844,7.78416109085083,58,23.974327087402344,0.4367068409919739,0.6935603022575378,0.15629488229751587,4.970770835876465,200,19.62123680114746,0.2064054161310196,0.650097131729126,0.21027418971061707,5.509530067443848,136,15.70105266571045,-0.008274257183074951,0.6658218502998352,0.26436904072761536,7.878448009490967,200,0,0,0,0,0,0,21.392261505126953,0.8078638315200806,0.8658612966537476,0.1169213280081749,4.069273471832275,60,0,0,0,0,0,0,0,0,0,0,0,0,16.521770477294922,0.4119645357131958,0.5416176319122314,0.2529039680957794,9.572311401367188,63,0
5
+ 20.192195892333984,0.0013111135922372341,0.6530546545982361,0.16100579500198364,4.674829006195068,500,22.106380462646484,0.002372157759964466,0.6869567632675171,0.14720509946346283,4.874122619628906,500,21.668710708618164,0.010731762275099754,0.6994706392288208,0.1702597439289093,5.343525409698486,198,20.726055145263672,0.004278174135833979,0.8526590466499329,0.13251639902591705,6.267956256866455,268,20.887666702270508,0.3664732277393341,0.6933053135871887,0.18257129192352295,7.749646186828613,200,15.50479507446289,0.011554110795259476,0.5916057229042053,0.18086765706539154,5.101010799407959,44,19.26924705505371,-0.0402817502617836,0.6491515636444092,0.17109069228172302,5.088503360748291,26,18.38506317138672,-0.01206012349575758,0.7032961249351501,0.17353220283985138,4.688672065734863,200,19.471759796142578,-0.0025741406716406345,0.790705680847168,0.18272539973258972,6.433710098266602,200,24.408714294433594,-0.006203812547028065,0.7900201678276062,0.1093222051858902,3.0680618286132812,197,25.618268966674805,-0.012119447812438011,0.8320527672767639,0.189222514629364,9.541532516479492,145,20.02849769592285,0.0040948037058115005,0.8293033838272095,0.22746650874614716,8.622618675231934,200,25.799362182617188,0.002677352400496602,0.8147953748703003,0.1341477930545807,3.4496517181396484,200,22.345197677612305,0.27116137742996216,0.7258548736572266,0.1409902721643448,3.6172609329223633,200,16.546030044555664,-0.10674090683460236,0.5869951844215393,0.23679934442043304,3.374926805496216,200,19.099502563476562,-0.007533765863627195,0.6395227909088135,0.21552789211273193,5.256075382232666,200,16.655963897705078,0.0018004697049036622,0.6986208558082581,0.1904933899641037,5.870639801025391,200,22.06020736694336,0.564301609992981,0.7581813931465149,0.13439540565013885,3.1246254444122314,142,27.067359924316406,0.00568796694278717,0.857913076877594,0.11922232806682587,4.030396461486816,200,16.69654655456543,0.013287489302456379,0.6759082078933716,0.41035643219947815,10.485578536987305,200,20.1484317779541,-0.0037386345211416483,0.8214008808135986,0.1595878303050995,7.100029945373535,200,19.99980926513672,0.002995690330862999,0.7338225841522217,0.16079355776309967,5.090161323547363,200,17.28361701965332,-0.0767323225736618,0.5945534706115723,0.23762698471546173,6.790170192718506,200,26.707612991333008,-0.010888484306633472,0.8517170548439026,0.08222655206918716,6.144393444061279,200,0,0,0,0,0,0,22.81102752685547,0.004911952186375856,0.753410816192627,0.13840247690677643,6.590641021728516,200,20.56325340270996,-0.0036200578324496746,0.8122907876968384,0.2328769862651825,9.606425285339355,185,25.634384155273438,0.014794806018471718,0.8339281678199768,0.13365812599658966,2.2014026641845703,200,28.079368591308594,-0.27046048641204834,0.9531135559082031,0.09380464255809784,1.130263090133667,200,20.635210037231445,0.014526949264109135,0.8181746006011963,0.21357515454292297,4.678038597106934,3,23.946765899658203,0.026584567502141,0.6427099704742432,0.09646405279636383,8.022929191589355,58,23.835311889648438,-0.00212214607745409,0.6980120539665222,0.15597480535507202,5.109379768371582,200,19.57789421081543,-0.008618982508778572,0.6491334438323975,0.21285711228847504,5.622872829437256,136,15.693717956542969,-0.0042601898312568665,0.6714515686035156,0.2660149037837982,7.933395862579346,200,0,0,0,0,0,0,20.719491958618164,0.012509683147072792,0.8595790266990662,0.11407399922609329,3.9665277004241943,60,0,0,0,0,0,0,0,0,0,0,0,0,16.573131561279297,-0.02216508239507675,0.5317869782447815,0.25623247027397156,9.64720249176
0254,63,0
6
+ 20.11272621154785,0.2109360694885254,0.6520361304283142,0.1624097228050232,4.9084296226501465,500,22.162460327148438,0.5982567667961121,0.6869907379150391,0.14949150383472443,4.766613483428955,500,21.70660972595215,0.24707387387752533,0.6991060376167297,0.16490808129310608,5.31504487991333,198,20.89645767211914,0.6825409531593323,0.8557612299919128,0.13019102811813354,6.398017406463623,200,21.12166976928711,0.08367721736431122,0.7037967443466187,0.1830134242773056,7.72568416595459,200,15.728557586669922,0.023094290867447853,0.5895631909370422,0.17670421302318573,4.983802318572998,44,19.269149780273438,0.21575377881526947,0.6504108905792236,0.17488287389278412,5.12293004989624,26,17.66543197631836,-0.3417667746543884,0.6988275647163391,0.1885838657617569,5.298708438873291,200,19.65871810913086,0.5588710904121399,0.7912994027137756,0.17227208614349365,6.4804582595825195,200,24.516550064086914,0.020750248804688454,0.7902164459228516,0.10835054516792297,3.0472919940948486,197,25.76390266418457,0.2111617475748062,0.8347102999687195,0.1926291286945343,9.65595817565918,145,18.151044845581055,-1.411789894104004,0.7994647026062012,0.26881223917007446,9.587594032287598,200,25.545425415039062,-0.2518838047981262,0.8101628422737122,0.13834546506404877,3.7476205825805664,200,20.21263313293457,-2.1581101417541504,0.6755650043487549,0.20787520706653595,5.349665641784668,200,17.979320526123047,0.029456928372383118,0.6195611953735352,0.2056853473186493,2.5060770511627197,200,19.21215057373047,0.09863721579313278,0.6396337747573853,0.21282075345516205,5.47512149810791,200,14.731433868408203,-1.0437582731246948,0.6715344190597534,0.24017825722694397,6.835475444793701,200,22.121034622192383,0.13931676745414734,0.7599001526832581,0.13316860795021057,2.0559332370758057,142,26.967485427856445,-0.04352593049407005,0.8562372326850891,0.11917727440595627,4.126804828643799,200,14.093514442443848,-2.401867628097534,0.6462867259979248,0.5737171769142151,11.239895820617676,200,20.1369686126709,0.03518101945519447,0.8176986575126648,0.15042880177497864,7.3873419761657715,200,19.936338424682617,1.2820953130722046,0.7327397465705872,0.16244173049926758,5.061605930328369,200,18.608089447021484,0.03232511132955551,0.6288440823554993,0.19996103644371033,6.023675918579102,200,25.265308380126953,-1.5252397060394287,0.8366735577583313,0.09203425794839859,6.713019847869873,200,0,0,0,0,0,0,22.606639862060547,0.23754052817821503,0.7469227313995361,0.14175154268741608,7.264929294586182,200,20.01392364501953,-0.7243794202804565,0.8044877052307129,0.24030713737010956,9.7893705368042,185,25.513294219970703,0.022836336866021156,0.8328461647033691,0.13896547257900238,2.2256522178649902,200,27.747732162475586,2.1492269039154053,0.9519210457801819,0.09740724414587021,1.2134103775024414,200,20.910673141479492,0.47492536902427673,0.8211309909820557,0.21091899275779724,4.660539150238037,3,23.682945251464844,-0.09257392585277557,0.6411334872245789,0.09143876284360886,8.146902084350586,58,23.916791915893555,0.2757120430469513,0.6961292028427124,0.15634751319885254,5.073945999145508,200,19.578807830810547,0.06890798360109329,0.648792028427124,0.2106357365846634,5.744231224060059,136,15.669443130493164,0.014397966675460339,0.662750780582428,0.26728084683418274,8.170258522033691,200,0,0,0,0,0,0,20.507707595825195,-0.1476416438817978,0.8596673011779785,0.13412821292877197,4.167920112609863,60,0,0,0,0,0,0,0,0,0,0,0,0,16.79804229736328,0.39872896671295166,0.5435536503791809,0.23285678029060364,9.751370429992676,63,0
7
+ 20.09503936767578,0.19324186444282532,0.6532765030860901,0.16777116060256958,4.9215545654296875,200,21.670238494873047,0.5962230563163757,0.6772172451019287,0.15711373090744019,4.973745822906494,200,21.32579231262207,0.23807521164417267,0.6979929804801941,0.17777009308338165,5.471674919128418,198,19.964054107666016,0.6898609399795532,0.8480191826820374,0.14881540834903717,6.650026798248291,200,19.73797035217285,0.028328483924269676,0.6416213512420654,0.2283506691455841,9.225159645080566,200,14.12078857421875,-0.07745029032230377,0.5660436749458313,0.21174412965774536,5.737281322479248,44,17.832124710083008,-0.6108675003051758,0.625728189945221,0.2204168289899826,5.574310302734375,26,17.75128746032715,0.5052798986434937,0.6958627700805664,0.1850184202194214,5.1611528396606445,200,18.28720474243164,3.523144483566284,0.77419114112854,0.22147363424301147,6.997479438781738,200,24.396211624145508,0.042355746030807495,0.7892429828643799,0.11021917313337326,3.1949589252471924,197,24.727394104003906,0.08382178097963333,0.831762433052063,0.21840521693229675,9.826996803283691,145,15.711186408996582,-1.1402037143707275,0.7454285621643066,0.4373578727245331,11.309600830078125,200,25.52533721923828,-0.00760778971016407,0.8099644184112549,0.13890613615512848,3.6684834957122803,200,19.276363372802734,-2.6745126247406006,0.6672981381416321,0.21036404371261597,4.968836784362793,200,15.376065254211426,0.2176371067762375,0.5345624089241028,0.28238123655319214,4.76574182510376,200,18.881689071655273,0.1820032000541687,0.6362051367759705,0.22268341481685638,5.450001239776611,200,15.536110877990723,0.5788084268569946,0.6868603229522705,0.2221454679965973,6.200928211212158,200,18.588090896606445,-0.1351645439863205,0.6845595240592957,0.20317442715168,4.309319496154785,142,26.693002700805664,-0.017797470092773438,0.85491943359375,0.12290634214878082,4.107079982757568,200,15.977954864501953,-0.2343152016401291,0.6691921949386597,0.4571930468082428,10.901474952697754,200,19.851110458374023,0.4773397743701935,0.8219828605651855,0.17668022215366364,7.3162055015563965,200,19.206226348876953,0.6738173961639404,0.7266687750816345,0.17411881685256958,5.285086631774902,200,15.480487823486328,-0.05200222134590149,0.5209740400314331,0.30362191796302795,8.292901992797852,200,25.806957244873047,-0.5103187561035156,0.8443582653999329,0.08975253254175186,6.479008197784424,200,0,0,0,0,0,0,21.907487869262695,-0.3219781517982483,0.7322761416435242,0.15371987223625183,7.148579120635986,200,19.59992027282715,-0.06399036198854446,0.8096731305122375,0.2715202569961548,9.681106567382812,185,25.1120662689209,-0.05029723793268204,0.829463541507721,0.14083868265151978,2.2757794857025146,200,28.31503677368164,1.267438292503357,0.9527142643928528,0.0927368700504303,1.3062008619308472,200,20.59347152709961,0.03865854814648628,0.8206701874732971,0.2205837219953537,4.7697625160217285,3,23.66497230529785,0.0852617546916008,0.6408542990684509,0.10270028561353683,8.262017250061035,58,23.514087677001953,0.18420732021331787,0.6972275972366333,0.1637846827507019,5.285017490386963,200,19.399612426757812,0.14159218966960907,0.6437479257583618,0.2207319140434265,6.007476806640625,136,15.523843765258789,0.04549547657370567,0.6679399609565735,0.2731015682220459,8.212705612182617,200,0,0,0,0,0,0,18.983938217163086,-0.7280352711677551,0.8515005111694336,0.16164684295654297,4.466729640960693,60,0,0,0,0,0,0,0,0,0,0,0,0,16.20790672302246,0.4906918406486511,0.5336827039718628,0.28068792819976807,10.018815994262695,63,0
8
+ 18.422204971313477,-0.12648740410804749,0.6362218260765076,0.19450049102306366,5.315305233001709,500,20.882783889770508,-0.07433691620826721,0.6690763831138611,0.1761208325624466,5.387838363647461,500,21.112810134887695,-0.03544004634022713,0.699590802192688,0.18499258160591125,5.969436168670654,198,18.350582122802734,0.12320471554994583,0.8303707838058472,0.1668887436389923,6.814785480499268,268,20.747133255004883,-0.008221889846026897,0.6795586943626404,0.18530716001987457,8.013651847839355,215,13.65688705444336,-0.014612981118261814,0.5605658888816833,0.21683458983898163,5.646177291870117,44,17.740320205688477,-0.01435495913028717,0.6188032031059265,0.20206505060195923,5.870429039001465,26,16.441152572631836,-0.09573374688625336,0.6951659917831421,0.20139959454536438,5.628950119018555,500,18.033527374267578,-0.049468427896499634,0.7758861184120178,0.20358926057815552,7.416662693023682,500,23.77448272705078,-0.004841374699026346,0.7814860939979553,0.11732590198516846,3.6382157802581787,197,23.89436149597168,-0.07056951522827148,0.8185535073280334,0.22474637627601624,10.225882530212402,145,19.539384841918945,-0.2427220195531845,0.8561981320381165,0.23518529534339905,9.096972465515137,500,24.909347534179688,-0.04073095694184303,0.8004875779151917,0.146201491355896,4.055650234222412,500,21.47697639465332,-0.03675007075071335,0.7250679135322571,0.1535453498363495,4.3143134117126465,500,18.370075225830078,-0.002068187575787306,0.6328396201133728,0.19349025189876556,2.4523301124572754,500,18.51810073852539,-0.029367070645093918,0.6309530138969421,0.23161163926124573,5.59820556640625,400,15.171514511108398,-0.08596291393041611,0.6846583485603333,0.2208338975906372,6.640722751617432,500,22.067739486694336,-0.010470133274793625,0.759013831615448,0.13397353887557983,2.6300439834594727,142,26.65119743347168,-0.02217714861035347,0.8541759252548218,0.12250284105539322,4.489205837249756,304,16.662843704223633,0.030067602172493935,0.6663406491279602,0.45978274941444397,10.977742195129395,500,18.961740493774414,-0.03515905141830444,0.8050862550735474,0.18823504447937012,8.212032318115234,290,17.795196533203125,-0.022556299343705177,0.6786172986030579,0.21285778284072876,5.880309581756592,500,18.647926330566406,-0.020784933120012283,0.6541447043418884,0.21047918498516083,6.204220771789551,500,25.266008377075195,-0.13009877502918243,0.8358787298202515,0.09196046739816666,6.571922779083252,500,0,0,0,0,0,0,21.6362247467041,0.043379079550504684,0.7133622169494629,0.16810670495033264,7.582168102264404,500,19.400468826293945,-0.06924106180667877,0.7891562581062317,0.2562311589717865,10.40406322479248,185,24.272430419921875,0.002408565254881978,0.822277307510376,0.14490512013435364,3.1252973079681396,307,21.627735137939453,0.28700390458106995,0.9307183623313904,0.15895913541316986,3.7723772525787354,200,20.32921028137207,0.07073872536420822,0.8144434094429016,0.21742278337478638,5.295686721801758,3,23.026090621948242,-0.008077435195446014,0.63054358959198,0.1302529126405716,8.783021926879883,58,22.135250091552734,-0.020904889330267906,0.6749143004417419,0.18395783007144928,5.7703423500061035,500,18.66996192932129,-0.03733401745557785,0.6296820640563965,0.2478480488061905,6.311908721923828,136,15.479337692260742,-0.004108104854822159,0.6553525328636169,0.28199443221092224,8.236729621887207,475,0,0,0,0,0,0,18.864093780517578,0.1739964336156845,0.8528752326965332,0.1548851728439331,5.3060221672058105,60,0,0,0,0,0,0,0,0,0,0,0,0,15.678476333618164,0.038419850170612335,0.5233398675918579,0.3035353720188141,10.510383
605957031,63,0
9
+ 18.889339447021484,-0.1648501455783844,0.6437658667564392,0.1826857179403305,5.060956954956055,500,21.38709831237793,0.1416708528995514,0.6777622103691101,0.16441290080547333,5.186842441558838,500,21.354217529296875,0.0018506277119740844,0.7000029683113098,0.17723077535629272,5.897092342376709,198,19.61304473876953,0.037835195660591125,0.8433040380477905,0.14655180275440216,6.7373223304748535,268,20.715747833251953,0.01130291074514389,0.678013265132904,0.17803651094436646,8.016233444213867,215,15.14285659790039,0.26080322265625,0.5890591144561768,0.18768545985221863,5.359984874725342,44,18.543058395385742,-0.08016975224018097,0.6347747445106506,0.18409821391105652,5.6670966148376465,26,16.97516441345215,-0.1479617804288864,0.702846884727478,0.18982268869876862,5.4662394523620605,500,18.89791488647461,0.24670402705669403,0.7856683135032654,0.18493321537971497,7.272751808166504,500,24.04220199584961,0.019914034754037857,0.7841129302978516,0.11466450989246368,3.5969300270080566,197,25.081993103027344,0.19039775431156158,0.8243004679679871,0.19027777016162872,10.178629875183105,145,20.056028366088867,-0.3057011365890503,0.8603973984718323,0.2097621113061905,9.087891578674316,500,25.227895736694336,-0.03144318237900734,0.8052465319633484,0.1430753916501999,3.845712900161743,500,21.70534896850586,-0.05035046488046646,0.7276560068130493,0.148365318775177,4.039795398712158,500,18.364177703857422,-2.2224783151614247e-06,0.6325255632400513,0.19385698437690735,2.432619333267212,500,18.738195419311523,-0.033635493367910385,0.6359359622001648,0.22341181337833405,5.412669658660889,400,16.01719856262207,0.0007864768267609179,0.7023988366127014,0.2010006606578827,6.480032920837402,500,22.102428436279297,-0.0012201054487377405,0.759409487247467,0.13381606340408325,2.5473294258117676,142,26.880590438842773,-0.02358187735080719,0.8565123081207275,0.12121907621622086,4.459400177001953,304,16.68733787536621,0.07558043301105499,0.6599492430686951,0.4475219249725342,11.352530479431152,500,19.459217071533203,-0.03683672845363617,0.809934139251709,0.16714370250701904,8.145930290222168,290,18.282310485839844,-0.0003248922876082361,0.6865655779838562,0.20002803206443787,5.662166595458984,500,18.83856964111328,-0.03184810280799866,0.6561446785926819,0.20359492301940918,6.031191349029541,500,25.960927963256836,-0.08454426378011703,0.8430694341659546,0.08546590059995651,6.4937214851379395,500,0,0,0,0,0,0,21.94908332824707,-0.3419593274593353,0.7211637496948242,0.16153308749198914,7.5465989112854,500,20.40264892578125,0.017866995185613632,0.8031715750694275,0.22785907983779907,10.408190727233887,185,24.751502990722656,-0.008820587769150734,0.8268305659294128,0.14042264223098755,2.8674750328063965,307,23.79220199584961,0.023781709372997284,0.9390408396720886,0.13440102338790894,3.189241647720337,200,20.65520668029785,-0.02165459282696247,0.8165206909179688,0.21889762580394745,5.296838283538818,3,23.0324764251709,0.130624458193779,0.6325167417526245,0.1328558623790741,8.84994888305664,58,23.088464736938477,0.06231540068984032,0.6918238997459412,0.16935458779335022,5.561310291290283,500,19.111549377441406,0.09548354148864746,0.6398555040359497,0.23092451691627502,5.997190952301025,136,15.769152641296387,0.028782520443201065,0.6602852940559387,0.26686036586761475,8.060165405273438,475,0,0,0,0,0,0,19.613920211791992,-0.03737274929881096,0.8565442562103271,0.13404110074043274,4.84287166595459,60,0,0,0,0,0,0,0,0,0,0,0,0,16.006196975708008,-0.16971516609191895,0.5292651653289795,0.2796056866645813,10.46243953704834,63,0
10
+ 19.012269973754883,-0.04718773066997528,0.6433225870132446,0.18231874704360962,5.107872009277344,500,21.322338104248047,0.0140613978728652,0.6770919561386108,0.16535094380378723,5.265127182006836,500,21.2813777923584,-0.021526703611016273,0.6984917521476746,0.1797674298286438,5.906203269958496,198,19.346487045288086,0.10074374079704285,0.8397706151008606,0.1516176462173462,6.717349529266357,268,20.726825714111328,0.025485752150416374,0.6783157587051392,0.18283355236053467,7.897275447845459,215,14.350356101989746,-0.07720185816287994,0.5780261754989624,0.20371182262897491,5.403542995452881,44,18.631084442138672,0.07769659161567688,0.6344185471534729,0.18611928820610046,5.661814212799072,26,17.056427001953125,-0.1821698099374771,0.7035198211669922,0.18846037983894348,5.462797164916992,500,18.501523971557617,0.08108856528997421,0.7818804383277893,0.19710351526737213,7.370274543762207,500,24.048419952392578,0.013744712807238102,0.7855278849601746,0.11609163880348206,3.6646697521209717,197,24.893909454345703,-0.17591805756092072,0.8268886208534241,0.20372600853443146,10.059407234191895,145,19.865501403808594,-0.22679699957370758,0.8597549796104431,0.2231135070323944,9.073450088500977,500,25.364770889282227,0.010617231950163841,0.8068088889122009,0.14136189222335815,4.029416084289551,500,21.7684383392334,-0.011133561842143536,0.7283687591552734,0.14749296009540558,4.200531482696533,500,18.374786376953125,-0.0007336796843446791,0.6325269341468811,0.1934446394443512,2.4958865642547607,500,18.839426040649414,-0.03957090154290199,0.6344597339630127,0.2224958837032318,5.42381477355957,400,15.805286407470703,0.08770310878753662,0.6977684497833252,0.20588622987270355,6.46218729019165,500,22.08108139038086,-0.01628425344824791,0.7589410543441772,0.13417819142341614,2.693875789642334,142,26.95757484436035,0.0022796352859586477,0.8572258353233337,0.12103652209043503,4.541953086853027,304,16.950056076049805,0.05445929989218712,0.6725068092346191,0.42972901463508606,10.8408203125,500,19.32314682006836,0.034813400357961655,0.810303807258606,0.1743694394826889,8.154923439025879,290,18.345930099487305,0.09873536974191666,0.6867643594741821,0.20015820860862732,5.688977241516113,500,18.867534637451172,0.008000208996236324,0.6561379432678223,0.20259518921375275,6.0598602294921875,500,25.886808395385742,-0.1851750612258911,0.843407154083252,0.08705346286296844,6.513862609863281,500,0,0,0,0,0,0,22.645614624023438,0.03813670575618744,0.7365572452545166,0.14904607832431793,7.366730213165283,500,20.234968185424805,-0.028652122244238853,0.8059276938438416,0.23735074698925018,10.289590835571289,185,24.92990493774414,-0.05894114822149277,0.8283792734146118,0.13925190269947052,3.199906349182129,307,23.42925453186035,-0.41295871138572693,0.9373413920402527,0.14144201576709747,3.7806153297424316,200,20.85039710998535,-0.004692706745117903,0.8196279406547546,0.2168300598859787,5.288863182067871,3,23.50503158569336,-0.028557421639561653,0.6380681991577148,0.1192559152841568,8.550318717956543,58,22.69325828552246,0.08483685553073883,0.6885713338851929,0.17399273812770844,5.622918605804443,500,18.995891571044922,-0.025361914187669754,0.6374244689941406,0.23452074825763702,6.074242115020752,136,15.669060707092285,0.025412963703274727,0.6598301529884338,0.2714788615703583,8.040945053100586,475,0,0,0,0,0,0,19.33572006225586,-0.150547593832016,0.8552142381668091,0.1460384875535965,5.214033126831055,60,0,0,0,0,0,0,0,0,0,0,0,0,16.531322479248047,0.06065846234560013,0.5375595688819885,0.25438711047172546,10.283487319946289,63,0
11
+ 18.73299217224121,-0.13161632418632507,0.6400277018547058,0.18755526840686798,5.157333850860596,500,21.10808753967285,-0.16676977276802063,0.672235906124115,0.1692410707473755,5.296654224395752,500,21.203834533691406,-0.024857260286808014,0.6980849504470825,0.18170055747032166,5.922959327697754,198,19.301345825195312,0.2718823254108429,0.8395335078239441,0.15078316628932953,6.726401329040527,268,20.7464542388916,0.0026871892623603344,0.67714923620224,0.18162284791469574,7.944480895996094,215,14.387572288513184,-0.0007205808651633561,0.5734782814979553,0.20236076414585114,5.430235385894775,44,18.500873565673828,0.08837074786424637,0.6331009864807129,0.18800735473632812,5.786571979522705,26,16.948963165283203,-0.07204114645719528,0.7015774846076965,0.19183480739593506,5.483725547790527,500,18.69580078125,0.08595505356788635,0.7832303643226624,0.18746890127658844,7.323951244354248,500,24.063138961791992,0.040227603167295456,0.7842744588851929,0.11548798531293869,3.7416088581085205,197,24.77498435974121,-0.10164796561002731,0.8261294364929199,0.19882343709468842,10.059757232666016,145,20.417898178100586,0.12687283754348755,0.8632937669754028,0.20207682251930237,9.024741172790527,500,25.293771743774414,-0.013352192007005215,0.8060333728790283,0.1428879052400589,3.982311248779297,500,21.808292388916016,0.05234861373901367,0.7285376191139221,0.14711451530456543,4.239038467407227,500,18.38561248779297,0.0016012159176170826,0.6329339742660522,0.19295348227024078,2.502042531967163,500,18.688907623291016,-0.04777387157082558,0.6325020790100098,0.22649399936199188,5.479030609130859,400,16.001020431518555,0.1790591925382614,0.7002289891242981,0.20206555724143982,6.487364292144775,500,22.093610763549805,-0.01092259306460619,0.7591899633407593,0.13419727981090546,2.800780773162842,142,26.88434410095215,0.008505810052156448,0.8564297556877136,0.1211276724934578,4.6540913581848145,304,16.507360458374023,-0.14780215919017792,0.6714658737182617,0.4597805440425873,10.8370361328125,500,19.03635597229004,-0.03866236284375191,0.8065902590751648,0.179178386926651,8.137178421020508,290,18.207599639892578,0.0390840582549572,0.6847269535064697,0.20280848443508148,5.730045318603516,500,18.824954986572266,0.007314752321690321,0.656466543674469,0.20462752878665924,6.068840980529785,500,25.785615921020508,-0.16119396686553955,0.8409033417701721,0.0872565507888794,6.494687557220459,500,0,0,0,0,0,0,22.554359436035156,0.17164531350135803,0.7343575954437256,0.15075889229774475,7.297224998474121,500,20.00868797302246,-0.20269323885440826,0.8015546202659607,0.2370498925447464,10.263086318969727,185,24.849260330200195,0.014324101619422436,0.8274909257888794,0.13913463056087494,3.273496627807617,307,24.143491744995117,-0.0864129289984703,0.9391130208969116,0.1349482536315918,3.8407604694366455,200,20.61045265197754,-0.044369861483573914,0.817896842956543,0.22000306844711304,5.338271617889404,3,23.173229217529297,0.06823521107435226,0.6345419883728027,0.11324688047170639,8.597124099731445,58,22.52924156188965,-0.005226884037256241,0.6790374517440796,0.17646169662475586,5.648611068725586,500,18.822338104248047,-0.034071650356054306,0.6334968209266663,0.23989541828632355,6.15177059173584,136,15.601283073425293,-0.00634151604026556,0.6596918702125549,0.2742881774902344,8.023792266845703,475,0,0,0,0,0,0,19.87816619873047,-0.21906106173992157,0.8562472462654114,0.12401121854782104,5.19991397857666,60,0,0,0,0,0,0,0,0,0,0,0,0,15.993595123291016,-0.15662315487861633,0.5254011154174805,0.27742305397987366,10.258377075195312,63,0
12
+ 19.110883712768555,0.03614996746182442,0.6461726427078247,0.1795150190591812,5.265549182891846,500,21.108360290527344,-0.14474032819271088,0.6736648678779602,0.16756995022296906,5.4420576095581055,500,21.234786987304688,-0.0323927141726017,0.6997060179710388,0.18202681839466095,6.203959941864014,198,19.807920455932617,0.17171825468540192,0.842271089553833,0.1452997773885727,6.870168685913086,268,20.6026668548584,-0.011249484494328499,0.6755874752998352,0.18391308188438416,8.184722900390625,215,14.93524169921875,-0.0469074510037899,0.5757077932357788,0.19150333106517792,5.594109058380127,44,18.379505157470703,-0.03754724934697151,0.6315571665763855,0.18700724840164185,5.797399520874023,26,17.05131721496582,-0.10010848939418793,0.7025959491729736,0.18903829157352448,5.659502983093262,500,19.137954711914062,0.14187031984329224,0.785306453704834,0.18198907375335693,7.440811634063721,500,23.754358291625977,0.0686846598982811,0.7810045480728149,0.1172272339463234,3.8019583225250244,197,24.170063018798828,-0.11422554403543472,0.8158525824546814,0.21402356028556824,10.47574234008789,145,20.21315574645996,-0.09578195214271545,0.859663188457489,0.20872971415519714,9.281356811523438,500,25.30156707763672,-0.018742039799690247,0.8054926991462708,0.14185506105422974,4.103004455566406,500,21.711626052856445,0.006813677493482828,0.7276286482810974,0.14892178773880005,4.297058582305908,500,18.365007400512695,-0.007851418107748032,0.6329214572906494,0.19421109557151794,2.7666542530059814,500,18.607521057128906,-0.061320219188928604,0.6336644291877747,0.2250998169183731,5.655640125274658,400,15.967201232910156,-0.06718071550130844,0.6980611681938171,0.20418159663677216,6.667073726654053,500,22.090274810791016,-0.0017378028715029359,0.7593238949775696,0.13377660512924194,2.6701622009277344,142,26.750892639160156,-0.07790957391262054,0.8558348417282104,0.12187864631414413,4.571890354156494,304,16.300031661987305,0.010518108494579792,0.6394075155258179,0.46379354596138,11.451972961425781,500,19.428918838500977,0.1081659346818924,0.8055146336555481,0.16705267131328583,8.28512191772461,290,18.22205924987793,-0.02810261771082878,0.6851233839988708,0.20087064802646637,5.899336814880371,500,18.846923828125,-0.021323775872588158,0.6569703221321106,0.20638112723827362,6.310214519500732,500,25.63957977294922,-0.21977517008781433,0.8407760858535767,0.08965358138084412,6.699525833129883,500,0,0,0,0,0,0,22.031572341918945,-0.16287170350551605,0.7234898805618286,0.15870793163776398,7.465205192565918,500,19.877155303955078,0.0820217877626419,0.7918612360954285,0.24016961455345154,10.721766471862793,185,24.740259170532227,0.05753420665860176,0.8264721035957336,0.14053989946842194,3.2130744457244873,307,23.240985870361328,-0.9365432858467102,0.9365918040275574,0.14060433208942413,3.1881415843963623,200,20.191814422607422,-0.015667753294110298,0.8101682662963867,0.22013281285762787,5.518276214599609,3,22.47603416442871,0.04551320895552635,0.6252691149711609,0.11578691005706787,9.172477722167969,58,22.722797393798828,0.1353231817483902,0.6838358640670776,0.17324240505695343,5.799938678741455,500,18.984954833984375,0.008981794118881226,0.6379337906837463,0.23654848337173462,6.328289031982422,136,15.746525764465332,0.024799227714538574,0.6614393591880798,0.27319034934043884,8.200506210327148,475,0,0,0,0,0,0,19.888586044311523,-0.006714705843478441,0.8541179299354553,0.12033425271511078,5.116232872009277,60,0,0,0,0,0,0,0,0,0,0,0,0,15.9192476272583,-0.10522013157606125,0.5274612903594971,0.29392895102500916,10.56518840789795,63,0
13
+ 20.46278190612793,-0.060763970017433167,0.6579670906066895,0.161322221159935,4.680773735046387,200,22.052663803100586,0.5723397135734558,0.6804402470588684,0.1476287543773651,4.742996692657471,200,21.605833053588867,0.23160338401794434,0.6969097852706909,0.171411395072937,5.295865535736084,198,20.8725643157959,0.6837916374206543,0.8545865416526794,0.12993884086608887,6.236001968383789,200,19.573144912719727,0.29120177030563354,0.6459593772888184,0.22614720463752747,9.27774429321289,200,14.133930206298828,0.22245623171329498,0.564282238483429,0.2065575122833252,5.674943447113037,0,18.949764251708984,0.17191997170448303,0.6465899348258972,0.19255967438220978,5.078993797302246,0,17.931673049926758,-0.11049146950244904,0.6982738971710205,0.18147899210453033,5.114457130432129,200,19.872821807861328,3.0028798580169678,0.7939698696136475,0.17656543850898743,6.240903377532959,200,24.46942138671875,0.02408456988632679,0.7884182333946228,0.10927800089120865,2.891601324081421,197,25.840389251708984,0.19366736710071564,0.8314366936683655,0.1776755452156067,9.571022033691406,145,16.929052352905273,-1.5327644348144531,0.7828646302223206,0.3492167592048645,10.210173606872559,200,25.710983276367188,-0.017311187461018562,0.8133223652839661,0.1370222568511963,3.3832216262817383,200,18.50092124938965,-2.022146224975586,0.6764024496078491,0.19759753346443176,4.851006031036377,200,17.02499008178711,0.16653648018836975,0.5973562002182007,0.22383597493171692,3.0246450901031494,200,19.098665237426758,0.004295928869396448,0.638796329498291,0.21742229163646698,5.272642612457275,200,16.54633140563965,0.17737846076488495,0.6975648403167725,0.19392725825309753,5.778793811798096,200,20.060346603393555,0.9734287261962891,0.7293187975883484,0.16429120302200317,3.052821397781372,0,27.030488967895508,-0.08356904238462448,0.8584595322608948,0.12005755305290222,3.672435998916626,0,16.670869827270508,0.08606883883476257,0.6815004348754883,0.4169979691505432,10.184615135192871,0,20.275964736938477,0.1098201647400856,0.8205265402793884,0.15371590852737427,7.1788201332092285,200,19.987890243530273,0.2744450271129608,0.7336406707763672,0.16191206872463226,5.04759407043457,200,17.112150192260742,-0.015052754431962967,0.5950587391853333,0.23803958296775818,6.651583671569824,200,26.6037654876709,-0.26065897941589355,0.8506165146827698,0.08368074893951416,6.122287750244141,200,0,0,0,0,0,0,22.674583435058594,-0.11895836144685745,0.749716579914093,0.14290745556354523,6.852635383605957,200,20.810087203979492,-0.07519318163394928,0.8198818564414978,0.23864984512329102,9.383285522460938,185,25.744356155395508,0.10595372319221497,0.8350962400436401,0.13477018475532532,2.0739593505859375,200,28.547643661499023,0.7507640719413757,0.9549750089645386,0.08899791538715363,1.1312161684036255,200,21.079015731811523,0.029884053394198418,0.8219752907752991,0.21175755560398102,4.344815731048584,3,23.93878173828125,0.05293554812669754,0.6428359746932983,0.08624686300754547,7.878372669219971,58,23.858797073364258,0.06702835857868195,0.6956433653831482,0.15657857060432434,5.089795112609863,200,19.62722396850586,0.05636562407016754,0.650608479976654,0.2117949277162552,5.699061870574951,136,15.717203140258789,0.02669934555888176,0.672042965888977,0.26545801758766174,7.867374420166016,200,0,0,0,0,0,0,21.55535316467285,1.0019360780715942,0.8686189651489258,0.10703979432582855,4.072519302368164,60,0,0,0,0,0,0,0,0,0,0,0,0,16.39645767211914,0.24261629581451416,0.538216233253479,0.2558179199695587,9.674732208251953,63,0
14
+ 20.47854995727539,1.2024067640304565,0.6582081317901611,0.15989229083061218,4.665087699890137,200,22.276348114013672,0.9851991534233093,0.6830069422721863,0.14797556400299072,4.454257488250732,200,21.658119201660156,0.8381527066230774,0.6987597942352295,0.16515423357486725,5.077767848968506,198,21.830154418945312,3.4031131267547607,0.8640546202659607,0.12628214061260223,5.637884140014648,200,19.192798614501953,0.3516661822795868,0.6184120774269104,0.2570144832134247,10.193868637084961,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15
+ 20.590303421020508,0.4962620437145233,0.6601793169975281,0.15753625333309174,4.5785813331604,200,22.237953186035156,0.4489673674106598,0.6821855902671814,0.14713634550571442,4.530069351196289,200,21.67139434814453,0.5253610610961914,0.6990557909011841,0.16455212235450745,5.083188533782959,198,21.511552810668945,2.3355321884155273,0.8608049750328064,0.12826672196388245,5.940978527069092,200,21.198034286499023,0.4582754373550415,0.696334183216095,0.17922863364219666,7.798947811126709,200,14.655498504638672,0.11240727454423904,0.571294903755188,0.19869443774223328,5.37660551071167,44,18.72746467590332,-0.0767064094543457,0.6407261490821838,0.19800549745559692,5.168641090393066,26,18.191055297851562,0.7278627157211304,0.6975538730621338,0.17866922914981842,5.047889232635498,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16
+ 20.47854995727539,1.2024067640304565,0.6582081317901611,0.15989229083061218,4.665087699890137,200,22.276348114013672,0.9851991534233093,0.6830069422721863,0.14797556400299072,4.454257488250732,200,21.658119201660156,0.8381527066230774,0.6987597942352295,0.16515423357486725,5.077767848968506,198,21.830154418945312,3.4031131267547607,0.8640546202659607,0.12628214061260223,5.637884140014648,200,19.192798614501953,0.3516661822795868,0.6184120774269104,0.2570144832134247,10.193868637084961,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17
+ 20.5020809173584,0.5488608479499817,0.6597924828529358,0.16010788083076477,4.721373081207275,200,22.316869735717773,1.2297122478485107,0.6829593181610107,0.1500682681798935,4.489542007446289,200,21.685548782348633,0.7039672136306763,0.6994300484657288,0.1671903133392334,5.135034561157227,198,21.7237548828125,2.7444534301757812,0.8628481030464172,0.12802647054195404,5.7939772605896,200,20.3883113861084,0.08283903449773788,0.6582912802696228,0.216960147023201,8.84621810913086,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18
+ 21.46029465948103,0.10821035103956703,0.7790344667110257,0.1146072332425551,4.8639076042175295,200,22.858641416969302,0.2752841716840785,0.8049575514436705,0.11705265556546775,4.970765066146851,200,22.984534566644847,0.28235093636870023,0.8254322399853716,0.11391306117594352,5.201311178881713,198,21.269003861197834,0.48774929318372584,0.8845344550020229,0.10633078400722959,5.946489062309265,200,21.91150318956801,-0.1983188362236588,0.8238611486475108,0.14899968794744575,6.809058917893304,144,15.47896710908777,-0.21478780557785693,0.766425387992816,0.16411230564964088,5.260037183761597,32,20.303700892418085,0.11696818304556747,0.7842201829523877,0.14423518538136373,5.439062690734863,20,17.87179956351499,-0.3067176983676024,0.8296448574724792,0.12977059845050629,5.341754336357116,200,19.857372164518466,0.05305542383233112,0.8275884621862206,0.1549614530158314,6.752370271682739,200,27.3605180375065,0.04183501128873296,0.8984342113196049,0.06288346686315807,3.3972074127197267,200,26.833560528931578,0.8061492074839005,0.8918025717551408,0.1506120693911968,9.562130060024604,167,15.137075498804533,-5.621109492690501,0.6937904475067103,0.3866491428085349,10.7611030960083,200,27.60886437063087,-0.5479598479597061,0.9037209833626163,0.08412209230221131,4.320116324424744,200,17.608785845649514,-6.357329803241343,0.6815140742522903,0.24886811768551442,5.626099944114685,200,24.33990679792855,-0.10166818044734177,0.8951419864328773,0.0715118810958864,2.0131524705886843,200,21.411267073256052,-0.021985724226232916,0.7967679150871174,0.13873621311716058,4.87985538482666,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
common/plot/plot_arch_ablation.py ADDED
@@ -0,0 +1,60 @@
1
+
2
+ import seaborn as sns
3
+ import matplotlib
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+
10
+
11
+
14
+
15
+ # Sample data based on the provided image structure
16
+ tasks = [ "Add", "Concat", "Cross Attention", "Modulation"]
17
+ values = np.array([
18
+ [6.35],
19
+ [5.68],
20
+ [5.26],
21
+ [5.02],
22
+
23
+ # [0.87, 0.55, 0.25, 0.03, 0.01, 0.0]
24
+ ])
25
+ values = np.exp(values)
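+ # np.exp converts the (log-scale) loss values above into the perplexity plotted on the y-axis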
26
+ # Bar colors matching the provided image
27
+ bar_colors = ['#1f78b4', '#ffffff', '#a6cee3', '#cab2d6', '#b3b3cc', '#33a02c']
28
+
29
+ # Plotting the data
30
+ fig, ax = plt.subplots(figsize=(5, 3))
31
+
32
+ # Set bar width and x positions for each group
33
+ bar_width = 0.4
34
+ x = np.arange(len(tasks))
35
+
36
+ # Plot each group's bars with the specified colors
37
+ for i in range(values.shape[1]):
38
+ bars = ax.bar(x + i * bar_width, values[:, i], width=bar_width, color=bar_colors[i], edgecolor='black')
39
+
40
+ for container in ax.containers:
41
+ ax.bar_label(container, label_type="edge", fontsize="x-large", fmt="%.1f")
42
+ bars[-1].set_color('#cab2d6')
43
+ bars[-1].set_edgecolor('black')
44
+
45
+ # Set titles, labels, and ticks
46
+ # ax.set_title("Zero-Shot Performance Comparison Across Tasks")
47
+ ax.set_xlabel("Model", fontsize=14)
48
+ ax.set_ylabel("Perplexity", fontsize=14)
49
+ ax.set_xticks(x )
50
+ ax.tick_params(axis='x', rotation=15)
51
+ ax.set_xticklabels(tasks, fontsize=12)
52
+ ax.set_ylim(values.min() - 10, values.max() + 50)
53
+
54
+ # Adding the legend outside the plot area
55
+ # ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=3)
56
+
57
+ # Display the plot
58
+ plt.tight_layout()
59
+ # plt.show()
60
+ plt.savefig("output/arch_ablation.png", dpi=300)
common/plot/plot_arch_ablation_deltapsnr.py ADDED
@@ -0,0 +1,49 @@
1
+ import seaborn as sns
2
+ import matplotlib
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ # Sample data based on the provided image structure
8
+ tasks = [ "Add", "Concat", "Cross Attention", "Modulation"]
9
+ values = np.array([
10
+ [0.46],
11
+ [0.18],
12
+ [0.02],
13
+ [1.87],
14
+ ])
15
+ # Bar colors matching the provided image
16
+ bar_colors = ['#1f78b4', '#a6cee3', '#1f78b4', '#ffffff', '#cab2d6', '#b3b3cc', '#33a02c']
17
+
18
+ # Plotting the data
19
+ fig, ax = plt.subplots(figsize=(5, 3))
20
+
21
+ # Set bar width and x positions for each group
22
+ bar_width = 0.4
23
+ x = np.arange(len(tasks))
24
+
25
+ # Plot each group's bars with the specified colors
26
+ for i in range(values.shape[1]):
27
+ bars = ax.bar(x + i * bar_width, values[:, i], width=bar_width, color=bar_colors[i], edgecolor='black')
28
+
29
+ for container in ax.containers:
30
+ ax.bar_label(container, label_type="edge", fontsize="x-large", fmt="%.2f")
31
+ bars[-1].set_color('#cab2d6')
32
+ bars[-1].set_edgecolor('black')
33
+
34
+ # Set titles, labels, and ticks
35
+ # ax.set_title("Zero-Shot Performance Comparison Across Tasks")
36
+ ax.set_xlabel("Model", fontsize=14)
37
+ ax.set_ylabel("Delta PSNR", fontsize=14)
38
+ ax.set_xticks(x )
39
+ ax.tick_params(axis='x', rotation=15)
40
+ ax.set_xticklabels(tasks, fontsize=12)
41
+ ax.set_ylim(0, values.max() + 0.2)
42
+
43
+ # Adding the legend outside the plot area
44
+ # ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=3)
45
+
46
+ # Display the plot
47
+ plt.tight_layout()
48
+ # plt.show()
49
+ plt.savefig("output/arch_ablation_controllability.png", dpi=300)
common/plot/plot_dataset_scale.py ADDED
@@ -0,0 +1,69 @@
1
+ import seaborn as sns
2
+ import matplotlib
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+ # Adjusting the line thickness to better match the provided example
7
+
8
+ x = [1, 5, 10]
9
+ tasks = ["Paper Towel Replacement\n(Bi-UR5e)", "Items in Drawer\n(Franka)",
10
+ "Stack Bowls\n(UR5e)", "Tupperware in Microwave\n(Bi-ARX)"]
11
+
12
+ # Define y-values for each line type
13
+ y_values = {
14
+ "π₀": [0.9, 0.85, 0.8],
15
+ "π₀ (scratch)": [0.7, 0.75, 0.72],
16
+ "DP": [0.2, 0.3, 0.4],
17
+ "Octo": [0.5, 0.6, 0.55],
18
+ "OpenVLA": [0.1, 0.15, 0.2],
19
+ "ACT": [0.4, 0.5, 0.6]
20
+ }
21
+
22
+ # Define markers, line styles, colors for each line type
23
+ markers = {"π₀": 'o', "π₀ (scratch)": 'o', "DP": 'o', "Octo": 'D', "OpenVLA": '*', "ACT": 'o'}
24
+ styles = {"π₀": '-', "π₀ (scratch)": '--', "DP": '-', "Octo": '-', "OpenVLA": '', "ACT": '-'}
25
+ colors = {"π₀": '#1f78b4', "π₀ (scratch)": '#1f78b4', "DP": '#e31a1c', "Octo": '#33a02c', "OpenVLA": '#6a3d9a', "ACT": '#ff7f00'}
26
+
27
+ # Set line width for enhanced visibility
28
+
29
+ # Create subplots
30
+
31
+
32
+ fig, ax = plt.subplots( figsize=(5, 4))
33
+
34
+ x_values = [5, 10, 20, 30, 40]
35
+ y_values = [5.94,5.72, 5.21,5.15,5.02]
36
+ y_values = np.exp(y_values)
37
+
38
+ # Set line width for each line plot
39
+ line_width = 1.5
40
+ x = []
41
+ # Iterate over each subplot (task) and plot the lines with specified styles, markers, and adjusted line width
42
+
43
+
44
+ fig, ax1 = plt.subplots(figsize=(5, 4))
45
+
46
+ # Plot Perplexity (left y-axis)
47
+ ax1.plot(x_values, y_values, marker='o', linestyle='-', color='#1f78b4', linewidth=line_width)
48
+ ax1.annotate(f"{y_values[-1]:.1f}", (x_values[-1], y_values[-1]), textcoords="offset points", xytext=(0, 10), ha='center')
49
+ ax1.set_xscale('log')
50
+ ax1.set_xlabel("# Dataset", fontsize=14)
51
+ ax1.set_ylabel("Perplexity", fontsize=14, color='#1f78b4')
52
+ ax1.tick_params(axis='y', labelcolor='#1f78b4')
53
+
54
+ # Create a twin y-axis for controllability (right y-axis)
55
+ ax2 = ax1.twinx()
56
+ controllability_values = [ 0.46, 0.55, 1.69, 1.5, 1.87] # Example values for controllability
57
+ ax2.plot(x_values, controllability_values, marker='s', linestyle='--', color='#006400', linewidth=line_width)
58
+ ax2.set_ylabel("Delta PSNR", fontsize=14, color='#006400')
59
+ ax2.set_ylim(0, 2.1)
60
+ ax2.tick_params(axis='y', labelcolor='#006400')
61
+ ax2.annotate(f"{controllability_values[-1]:.1f}", (x_values[-1], controllability_values[-1]), textcoords="offset points", xytext=(0, 10), ha='center')
62
+
63
+ # Save the figure in high resolution
64
+ plt.tight_layout()
65
+ # plt.show()
66
+
67
+
68
+ plt.savefig(f"output/dataset_sizes.png", dpi=300) # Save the figure in high resolution
69
+
common/plot/plot_dataset_traj_scale.py ADDED
@@ -0,0 +1,48 @@
1
+ import seaborn as sns
2
+ import matplotlib
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+ # Adjusting the line thickness to better match the provided example
7
+
8
+ fig, ax = plt.subplots( figsize=(5, 4))
9
+
10
+ x_values = [8287, 77664, 532150,1126876,2070965,3163485]
11
+ y_values = [9.46, 6.94, 5.81, 5.70, 5.09, 5.02]
12
+ y_values = np.exp(y_values)
13
+
14
+ # Set line width for each line plot
15
+ line_width = 1.5
16
+ x = []
17
+ # Iterate over each subplot (task) and plot the lines with specified styles, markers, and adjusted line width
18
+ # for i, task in enumerate(tasks):
19
+
20
+ # Adding a centralized legend that appears above the plot
21
+ # fig.legend(y_values, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=3, frameon=False, markerscale=1.5)
22
+
23
+ fig, ax1 = plt.subplots(figsize=(5, 4))
24
+
25
+ # Plot Perplexity (left y-axis)
26
+ ax1.plot(x_values, y_values, marker='o', linestyle='-', color='#1f78b4', linewidth=line_width)
27
+ ax1.annotate(f"{y_values[-1]:.1f}", (x_values[-1], y_values[-1]), textcoords="offset points", xytext=(0, 10), ha='center')
28
+ ax1.set_xscale('log')
29
+ ax1.set_xlabel("# Trajectory", fontsize=14)
30
+ ax1.set_ylabel("Perplexity", fontsize=14, color='#1f78b4')
31
+ ax1.tick_params(axis='y', labelcolor='#1f78b4')
32
+
33
+ # Create a twin y-axis for controllability (right y-axis)
34
+ ax2 = ax1.twinx()
35
+ controllability_values = [0.,0.10,1.20,1.41,1.56, 1.87] # Example values for controllability
36
+ ax2.plot(x_values, controllability_values, marker='s', linestyle='--', color='#006400', linewidth=line_width)
37
+ ax2.set_ylabel("Delta PSNR", fontsize=14, color='#006400')
38
+ ax2.annotate(f"{controllability_values[-1]:.1f}", (x_values[-1], controllability_values[-1]), textcoords="offset points", xytext=(0, 10), ha='center')
39
+
40
+ ax2.set_ylim(0, 2.1)
41
+ ax2.tick_params(axis='y', labelcolor='#006400')
42
+
43
+ # Save the figure in high resolution
44
+ plt.tight_layout()
45
+ #plt.show()
46
+
47
+ plt.savefig(f"output/traj_sizes.png", dpi=300)
48
+
common/plot/plot_dynamics_ablation.py ADDED
@@ -0,0 +1,56 @@
1
+ import seaborn as sns
2
+ import matplotlib
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+
8
+
9
+
10
+
11
+
14
+
15
+ # Sample data based on the provided image structure
16
+ tasks = ["Passive Dynamics", "Full Dynamics", "Forward Dynamics"]
17
+ bar_labels = ["Passive Dynamics", "Full Dynamics", "Forward Dynamics"]
18
+ values = np.array([
19
+ [6.29],
20
+ [5.21],
21
+ [5.02],
22
+ ])
23
+ values = np.exp(values)
24
+ # Bar colors matching the provided image
25
+ bar_colors = ['#a6cee3', '#ffffff', '#a6cee3', '#cab2d6', '#b3b3cc', '#33a02c']
26
+
27
+ # Plotting the data
28
+ fig, ax = plt.subplots(figsize=(5, 3))
29
+
30
+ # Set bar width and x positions for each group
31
+ bar_width = 0.4
32
+ x = np.arange(len(tasks))
33
+
34
+ # Plot each group's bars with the specified colors
35
+ for i in range(values.shape[1]):
36
+ bars = ax.bar(x + i * bar_width, values[:, i], width=bar_width, color=bar_colors[i], edgecolor='black')
37
+ bars[-1].set_color('#cab2d6')
38
+ bars[-1].set_edgecolor('black')
39
+ for container in ax.containers:
40
+ ax.bar_label(container, label_type="edge", fontsize="x-large", fmt="%.1f")
41
+
42
+ # Set titles, labels, and ticks
43
+ # ax.set_title("Zero-Shot Performance Comparison Across Tasks")
44
+ ax.set_xlabel("Model", fontsize=14)
45
+ ax.set_ylabel("Perplexity", fontsize=14)
46
+ ax.set_xticks(x )
47
+ ax.set_xticklabels(tasks, fontsize=12)
48
+ ax.set_ylim(values.min() - 10, values.max() + 50)
49
+ ax.tick_params(axis='x', rotation=15)
50
+ # Adding the legend outside the plot area
51
+ # ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=3)
52
+
53
+ # Display the plot
54
+ plt.tight_layout()
55
+ # plt.show()
56
+ plt.savefig("output/dynamics_ablation.png", dpi=300)
common/plot/plot_dynamics_ablation_deltapsnr.py ADDED
@@ -0,0 +1,51 @@
1
+ import seaborn as sns
2
+ import matplotlib
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
7
+
8
+ # Sample data based on the provided image structure
9
+ tasks = ["Passive Dynamics", "Full Dynamics", "Forward Dynamics"]
10
+ bar_labels = ["Passive Dynamics", "Full Dynamics", "Forward Dynamics"]
11
+ values = np.array([
12
+ [0.33],
13
+ [1.23],
14
+ [1.87],
15
+ # [0.87, 0.55, 0.25, 0.03, 0.01, 0.0]
16
+ ])
17
+ # Bar colors matching the provided image
18
+ bar_colors = ['#a6cee3', '#1f78b4', '#ffffff', '#cab2d6', '#b3b3cc', '#33a02c']
19
+
20
+ # Plotting the data
21
+ fig, ax = plt.subplots(figsize=(5, 3))
22
+
23
+ # Set bar width and x positions for each group
24
+ bar_width = 0.4
25
+ x = np.arange(len(tasks))
26
+
27
+ # Plot each group's bars with the specified colors
28
+ for i in range(values.shape[1]):
29
+ bars = ax.bar(x + i * bar_width, values[:, i], width=bar_width, color=bar_colors[i], edgecolor='black')
30
+
31
+ bars[-1].set_color('#cab2d6')
32
+ bars[-1].set_edgecolor('black')
33
+
34
+ for container in ax.containers:
35
+ ax.bar_label(container, label_type="edge", fontsize="x-large", fmt="%.2f")
36
+
37
+ # Set titles, labels, and ticks
38
+ # ax.set_title("Zero-Shot Performance Comparison Across Tasks")
39
+ ax.set_xlabel("Model", fontsize=14)
40
+ ax.set_ylabel("Delta PSNR", fontsize=14)
41
+ ax.set_xticks(x )
42
+ ax.set_xticklabels(tasks, fontsize=12)
43
+ ax.set_ylim(values.min() - 0.1, values.max() + 0.2)
44
+ ax.tick_params(axis='x', rotation=15)
45
+ # Adding the legend outside the plot area
46
+ # ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=3)
47
+
48
+ # Display the plot
49
+ plt.tight_layout()
50
+ # plt.show()
51
+ plt.savefig("output/dynamics_ablation_controllability.png", dpi=300)
common/plot/plot_from_wandb.py ADDED
@@ -0,0 +1,185 @@
1
+ import wandb
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import sys
5
+ import argparse
6
+ import os
7
+
8
+ """
9
+ Running plotting scripts over key metrics and key runs
10
+ export MODEL=final2_40dataset_waction_concat_gpu_8_nodes_1
11
+ python common/plot/plot_from_wandb.py --run_id $MODEL
12
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_noaction_gpu_8_nodes_1_step15k_v5
13
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_waction_modulate_gpu_8_nodes_1_step15k_v5
14
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_waction_attn_gpu_8_nodes_1_step15k_v5
15
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_waction_add_gpu_8_nodes_1_step15k_v5
16
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_waction_d64_gpu_8_nodes_1_step15k_v5
17
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_forward_dynamics_gpu_8_nodes_1_step15k_v5
18
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_full_dynamics_gpu_8_nodes_1_step15k_v5
19
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_waction_traj100000_gpu_8_nodes_1_68536steps_step15k_v5
20
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_waction_traj10000_gpu_8_nodes_1_68536steps_step15k_v5
21
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_waction_traj100_gpu_8_nodes_1_68536steps_step15k_v5
22
+ python common/plot/plot_from_wandb.py --run_id final2_40dataset_waction_traj1000_gpu_8_nodes_1_68536steps_step15k_v5
23
+ python common/plot/plot_from_wandb.py --run_id final2_5dataset_waction_gpu_8_nodes_1_step24k_v5
24
+ python common/plot/plot_from_wandb.py --run_id final2_30dataset_waction_gpu_8_nodes_1_step24k_v5
25
+ python common/plot/plot_from_wandb.py --run_id final2_5dataset_waction_gpu_8_nodes_1_step24k_v5
26
+ python common/plot/plot_from_wandb.py --run_id final2_10dataset_waction_gpu_8_nodes_1_step24k_v5
27
+
28
+ """
29
+ # Initialize the wandb API client
30
+ api = wandb.Api()
31
+ pwd = os.path.dirname(os.path.abspath(__file__))
32
+
33
+ # Replace with your specific project and entity
34
+ entity = "latent-mage"
35
+ project = "video_val"
36
+
37
+ # List of datasets to process
38
+ datasets = [
39
+ "bridge_data_v2",
40
+ "fractal20220817_data",
41
+ "language_table",
42
+ "ucsd_pick_and_place_dataset_converted_externally_to_rlds",
43
+ "kaist_nonprehensile_converted_externally_to_rlds",
44
+ "ucsd_kitchen_dataset_converted_externally_to_rlds",
45
+ "utokyo_xarm_bimanual_converted_externally_to_rlds",
46
+ "stanford_hydra_dataset_converted_externally_to_rlds",
47
+ "austin_sirius_dataset_converted_externally_to_rlds",
48
+ "berkeley_fanuc_manipulation",
49
+ "berkeley_mvp_converted_externally_to_rlds",
50
+ "berkeley_rpt_converted_externally_to_rlds",
51
+ "cmu_play_fusion",
52
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds",
53
+ "qut_dexterous_manpulation",
54
+ "robo_net",
55
+ "furniture_bench_dataset_converted_externally_to_rlds",
56
+ "dlr_sara_grid_clamp_converted_externally_to_rlds",
57
+ "cmu_stretch",
58
+ "spoc",
59
+ "columbia_cairlab_pusht_real",
60
+ "droid",
61
+ "toto",
62
+ "io_ai_tech",
63
+ "conq_hose_manipulation",
64
+ "dobbe",
65
+ "berkeley_gnm_cory_hall",
66
+ "plex_robosuite",
67
+ "usc_cloth_sim_converted_externally_to_rlds",
68
+ "berkeley_cable_routing",
69
+ "imperial_wrist_dataset",
70
+ "bc_z",
71
+ "kuka",
72
+ "roboturk",
73
+ "metaworld",
74
+ "robomimic",
75
+ "epic_kitchen",
76
+ "ego4d",
77
+ "nyu_door_opening_surprising_effectiveness"
78
+ ]
79
+
80
+ def normalize_dataset(metric, runs):
81
+ """
82
+ Figure out best and worst values for a metric across all runs
83
+ and use it for normalization
84
+ """
85
+ pass
86
+
87
+ # List to store dataframes of PSNR metrics for each dataset
88
+ metrics_data = []
89
+ # Get runs based on a path
90
+ # Set up argument parser
91
+ parser = argparse.ArgumentParser(description='Aggregate per-dataset validation metrics from a wandb run into a CSV.')
92
+ parser.add_argument('--run_id', type=str, default='40dataset_waction_add_gpu_8_nodes_1', help='The run ID to process')
93
+
94
+ # Parse arguments
95
+ args = parser.parse_args()
96
+
97
+ fields = ['num_examples', 'teacher_force_psnr', 'teacher_force_psnr_delta', 'teacher_force_ssim', 'teacher_force_pred_lpips', 'teacher_force_loss']
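+ # wandb logs each metric under a per-dataset key of the form "<dataset>/<field>"; those are the columns gathered below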
98
+ num_fields = len(fields)
99
+ run_id = args.run_id
100
+
101
+ runs_path = f"{entity}/{project}/runs"
102
+ run = api.run(f"{entity}/{project}/runs/{run_id}")
103
+
104
+ # Get the history dataframe of a run
105
+ history = run.history(pandas=True)
106
+ model_step = 0
107
+ summary_metrics = run.summary
108
+ num_datasets = 0
109
+
110
+ # output the field into csv
111
+ # csv_output = f"{pwd}/aggregated_output.csv"
112
+ csv_output = f"aggregated_output.csv"
113
+
114
+ # initialize the csv file
115
+ if not os.path.exists(csv_output):
116
+ with open(csv_output, 'w') as f:
117
+ field_str = f"name,"
118
+ for dataset in datasets:
119
+ for field in fields:
120
+ field_str += f"{dataset}/{field},"
121
+ f.write(field_str.rstrip(",") + "\n")
122
+
123
+ results = [run_id] + [None] * len(datasets) * num_fields
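+ # one CSV row per run: the run id followed by num_fields values for each dataset, filled in below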
124
+ for field_idx, field in enumerate(fields):
125
+ if not history.empty:
126
+ # Filter the history to only include PSNR metrics for the specified datasets
127
+ for dataset_idx, dataset in enumerate(datasets):
128
+ field_col = f"{dataset}/{field}"
129
+ col_idx = dataset_idx * num_fields + field_idx + 1
130
+ if field == "num_examples":
131
+ if f"{dataset}/num_examples" in summary_metrics:
132
+ results[col_idx] = summary_metrics[f"{dataset}/num_examples"]
133
+
134
+ continue
135
+ if field_col in history.columns:
136
+ # Calculate PSNR divided by the number of examples (uncomment if needed)
137
+ # history[field_col] = history[field_col] / history.shape[0]
138
+ valid_field = history[field_col].dropna()
139
+ if not valid_field.empty:
140
+ last_valid_value = valid_field.iloc[-1] # Get the last non-NaN value
141
+ num_datasets += 1
142
+ metrics = pd.DataFrame({field_col: [last_valid_value]})
143
+ metrics['dataset'] = dataset
144
+ results[col_idx] = last_valid_value
145
+ metrics_data.append(metrics)
146
+ else:
147
+ pass
148
+ # print("missing dataset:", dataset)
149
+
150
+ if f"{dataset}/model_step" in summary_metrics:
151
+ model_step = summary_metrics[f"{dataset}/model_step"]
152
+
153
+ # Combine all the metric dataframes into one
154
+ if metrics_data:
155
+ all_metrics_df = pd.concat(metrics_data, ignore_index=True)
156
+
157
+ # # Compute aggregated statistics (mean, median, std, etc.) for PSNR
158
+ # aggregated_stats = all_metrics_df.groupby('dataset').mean()
159
+ #
160
+ # # Plot the mean PSNR for each dataset
161
+ # plt.figure(figsize=(12, 8))
162
+ # aggregated_stats[f'{field}'] = aggregated_stats.mean(axis=1)
163
+ # aggregated_stats[f'{field}'].plot(kind='bar')
164
+ # # print number of steps in the wandb run
165
+ # print(f"run: {run_id} field: {field} steps: {model_step} num of dataset: {len(metrics_data)}")
166
+ # print(f"{field}: {aggregated_stats[field].mean():.2f}+-{aggregated_stats[field].std():.2f}", )
167
+ #
168
+ # plt.title(f"Mean {field} for Each Dataset")
169
+ # plt.xlabel("Dataset")
170
+ # plt.ylabel(f"Mean {field} ")
171
+ # plt.xticks(rotation=90)
172
+ # plt.tight_layout()
173
+ #
174
+ # # Save the plot
175
+ # plt.savefig(f"{pwd}/output/{run.id}_{field}_plot.png")
176
+
177
+ # write the results into csv
178
+ with open(csv_output, 'a+') as f:
179
+ f.write(",".join([str(x) for x in results]) + "\n")
180
+
181
+ # Display aggregated statistics
182
+ # print(aggregated_stats)
183
+
184
+ # Save the aggregated statistics as a CSV if needed
185
+ # aggregated_stats.to_csv(f"{run_id}_{field}_stat.csv", index=True)
common/plot/plot_from_wandb_singledataset.py ADDED
@@ -0,0 +1,144 @@
1
+ import wandb
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import sys
5
+ import argparse
6
+
7
+ """
8
+ Running plotting scripts over key metrics and key runs
9
+ export MODEL=40dataset_waction_add_gpu_8_nodes_1
10
+ python common/plot/plot_from_wandb.py --field teacher_force_psnr --run_id $MODEL
11
+ python common/plot/plot_from_wandb.py --field teacher_force_psnr_delta --run_id $MODEL
12
+ python common/plot/plot_from_wandb.py --field teacher_force_ssim --run_id $MODEL
13
+ python common/plot/plot_from_wandb.py --field teacher_force_pred_lpips --run_id $MODEL
14
+ python common/plot/plot_from_wandb.py --field teacher_force_loss --run_id $MODEL
15
+
16
+ """
17
+ # Initialize the wandb API client
18
+ api = wandb.Api()
19
+
20
+ # Replace with your specific project and entity
21
+ entity = "latent-mage"
22
+ project = "video_val"
23
+
24
+ # List of datasets to process
25
+ datasets = [
26
+ "bridge_data_v2",
27
+ "fractal20220817_data",
28
+ "language_table",
29
+ "ucsd_pick_and_place_dataset_converted_externally_to_rlds",
30
+ "kaist_nonprehensile_converted_externally_to_rlds",
31
+ "ucsd_kitchen_dataset_converted_externally_to_rlds",
32
+ "utokyo_xarm_bimanual_converted_externally_to_rlds",
33
+ "stanford_hydra_dataset_converted_externally_to_rlds",
34
+ "austin_sirius_dataset_converted_externally_to_rlds",
35
+ "berkeley_fanuc_manipulation",
36
+ "berkeley_mvp_converted_externally_to_rlds",
37
+ "berkeley_rpt_converted_externally_to_rlds",
38
+ "cmu_play_fusion",
39
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds",
40
+ "qut_dexterous_manpulation",
41
+ "robo_net",
42
+ "furniture_bench_dataset_converted_externally_to_rlds",
43
+ "dlr_sara_grid_clamp_converted_externally_to_rlds",
44
+ "cmu_stretch",
45
+ "spoc",
46
+ "columbia_cairlab_pusht_real",
47
+ "droid",
48
+ "toto",
49
+ "io_ai_tech",
50
+ "conq_hose_manipulation",
51
+ "dobbe",
52
+ "berkeley_gnm_cory_hall",
53
+ "plex_robosuite",
54
+ "usc_cloth_sim_converted_externally_to_rlds",
55
+ "berkeley_cable_routing",
56
+ "imperial_wrist_dataset",
57
+ "bc_z",
58
+ "kuka",
59
+ "roboturk",
60
+ "metaworld",
61
+ "robomimic",
62
+ "epic_kitchen",
63
+ "ego4d",
64
+ "nyu_door_opening_surprising_effectiveness"
65
+ ]
66
+
67
+ # List to store dataframes of PSNR metrics for each dataset
68
+
69
+ # Get runs based on a path
70
+ # Set up argument parser
71
+ parser = argparse.ArgumentParser(description='Summarize per-dataset validation metrics for a single wandb run.')
72
+ parser.add_argument('--field', type=str, default='teacher_force_psnr', help='The field to process')
73
+ parser.add_argument('--run_id', type=str, default='40dataset_waction_add_gpu_8_nodes_1', help='The run ID to process')
74
+
75
+ # Parse arguments
76
+ args = parser.parse_args()
77
+
78
+ field = args.field
79
+ run_id = args.run_id
80
+
81
+ runs_path = f"{entity}/{project}/runs"
82
+ run = api.run(f"{entity}/{project}/runs/{run_id}")
83
+
84
+ # Get the history dataframe of a run
85
+ history = run.history(pandas=True)
86
+ model_step = 0
87
+ summary_metrics = run.summary
88
+ num_datasets = 0
89
+ fields = ['num_examples', 'teacher_force_psnr', 'teacher_force_psnr_delta', 'teacher_force_ssim', 'teacher_force_pred_lpips', 'teacher_force_loss']
90
+
91
+ for field in fields:
92
+ metrics_data = []
93
+ if not history.empty:
94
+ # Filter the history to only include PSNR metrics for the specified datasets
95
+ for dataset in datasets:
96
+ field_col = f"{dataset}/{field}"
97
+ step_col = f"{dataset}/model_step"
98
+ if field_col in history.columns:
99
+ # Calculate PSNR divided by the number of examples (uncomment if needed)
100
+ # history[field_col] = history[field_col] / history.shape[0]
101
+ valid_field = history[field_col].dropna()
102
+ if not valid_field.empty:
103
+ last_valid_value = valid_field.iloc[-1] # Get the last non-NaN value
104
+ num_datasets += 1
105
+ metrics = pd.DataFrame({field_col: [last_valid_value]})
106
+ metrics['dataset'] = dataset
107
+ metrics_data.append(metrics)
108
+
109
+ if step_col in summary_metrics:
110
+ model_step = summary_metrics[step_col]
111
+
112
+ # Combine all the metric dataframes into one
113
+ if metrics_data:
114
+ all_metrics_df = pd.concat(metrics_data, ignore_index=True)
115
+
116
+ # Print columns for debugging
117
+
118
+ # Compute aggregated statistics (mean, median, std, etc.) for PSNR
119
+ aggregated_stats = all_metrics_df.groupby('dataset').mean()
120
+
121
+ # Plot the mean PSNR for each dataset
122
+ plt.figure(figsize=(12, 8))
123
+ aggregated_stats[f'{field}'] = aggregated_stats.mean(axis=1)
124
+ aggregated_stats[f'{field}'].plot(kind='bar')
125
+ # print number of steps in the wandb run
126
+ print(f"run: {run_id} field: {field} steps: {model_step} num of dataset: {len(metrics_data)}")
127
+ print(f"{field}: {aggregated_stats[field].mean():.2f}+-{aggregated_stats[field].std():.2f}", )
128
+
129
+ # plt.title(f"Mean {field} for Each Dataset")
130
+ # plt.xlabel("Dataset")
131
+ # plt.ylabel(f"Mean {field} ")
132
+ # plt.xticks(rotation=90)
133
+ # plt.tight_layout()
134
+
135
+ # # Save the plot
136
+ # import os
137
+ # pwd = os.path.dirname(os.path.abspath(__file__))
138
+ # plt.savefig(f"{pwd}/output/{run.id}_{field}_plot.png")
139
+
140
+ # Display aggregated statistics
141
+ # print(aggregated_stats)
142
+
143
+ # Save the aggregated statistics as a CSV if needed
144
+ # aggregated_stats.to_csv(f"{run_id}_{field}_stat.csv", index=True)
common/plot/plot_model_scale.py ADDED
@@ -0,0 +1,64 @@
1
+ import seaborn as sns
2
+ import matplotlib
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+ # Adjusting the line thickness to better match the provided example
7
+
8
+
9
+
10
+ fig, ax = plt.subplots( figsize=(5, 4))
11
+ # 64, 64sqrt2, 128, 128sqrt2, 256, 256sqrt2, 512
12
+ x_values = [2.8, 10.2, 36.9, 174.22, 366.9, 755.9] #
13
+ y_values = [6.33, 5.52, 5.25, 5.19, 5.02, 5.02] # , 5.09
14
+ y_values = np.exp(y_values)
15
+ # 256sqrt2->700m
16
+ # 512->1.3billion
17
+
18
+ # Set line width for each line plot
19
+ line_width = 1.5
20
+ x = []
21
+ # Iterate over each subplot (task) and plot the lines with specified styles, markers, and adjusted line width
22
+ # for i, task in enumerate(tasks):
23
+
24
+ # ax.plot(x_values, y_values, marker='o', linestyle='--', color='#1f78b4', linewidth=line_width)
25
+ # # for i, txt in enumerate(y_values):
26
+ # # ax.annotate(f"{txt:.1f}", (x_values[i], y_values[i]), textcoords="offset points", xytext=(0,10), ha='center')
27
+ # ax.annotate(f"{y_values[-1]:.1f}", (x_values[-1], y_values[-1]), textcoords="offset points", xytext=(0,10), ha='center')
28
+
29
+
30
+ # # Set individual titles and axis labels for each subplot
31
+
32
+ # ax.set_xlabel("Model Parameters(M)", fontsize=14)
33
+ # ax.set_ylabel("Perplexity", fontsize=14)
34
+ # ax.set_ylim(0, 1)
35
+ fig, ax1 = plt.subplots(figsize=(5, 4))
36
+
37
+ INDEX = -2
38
+ # Plot Perplexity (left y-axis)
39
+ ax1.plot(x_values, y_values, marker='o', linestyle='-', color='#1f78b4', linewidth=line_width)
40
+ ax1.annotate(f"{y_values[INDEX]:.1f}", (x_values[INDEX], y_values[INDEX]), textcoords="offset points", xytext=(0, 10), ha='center')
41
+ ax1.set_xscale('log')
42
+ ax1.set_xlabel("Model Parameters(M)", fontsize=14)
43
+ ax1.set_ylabel("Perplexity", fontsize=14, color='#1f78b4')
44
+ ax1.tick_params(axis='y', labelcolor='#1f78b4')
45
+ # , 1.18
46
+
47
+ # Create a twin y-axis for controllability (right y-axis)
48
+ ax2 = ax1.twinx()
49
+ controllability_values = [0.11, 1.02, 1.07, 1.12, 1.87, 1.34] # Example values for controllability
50
+ ax2.plot(x_values, controllability_values, marker='s', linestyle='--', color='#006400', linewidth=line_width)
51
+ ax2.set_ylabel("Delta PSNR", fontsize=14, color='#006400')
52
+ ax2.set_ylim(0, np.max(controllability_values) + 0.2)
53
+ ax2.tick_params(axis='y', labelcolor='#006400')
54
+ ax2.annotate(f"{controllability_values[INDEX]:.1f}", (x_values[INDEX], controllability_values[INDEX]), textcoords="offset points", xytext=(0, 10), ha='center')
55
+
56
+ # Save the figure in high resolution
57
+ plt.tight_layout()
58
+ # plt.show()
59
+
60
+ plt.savefig(f"output/model_sizes.png", dpi=300)
61
+
62
+ # Adding a centralized legend that appears above the plot
63
+ # fig.legend(y_values, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=3, frameon=False, markerscale=1.5)
64
+
common/plot/plot_pretrain_ablation.py ADDED
@@ -0,0 +1,44 @@
1
+ import seaborn as sns
2
+ import matplotlib
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ font = {
8
+ "family": "normal",
9
+ "size": 22,
10
+ }
11
+
12
+ matplotlib.rc("font", **font)
13
+ sns.set(rc={"font.family": "Times New Roman"})
14
+ sns.set(style="whitegrid")
15
+ sns.set(font_scale=3, style="whitegrid")
16
+
17
+ # Sample data for plotting
18
+ categories = ["Scratch", "Passive Pre-Train", "Pre-Train", "Pre-Train (Large)"]
19
+ values = [1.0, 1.0, 1.0, 1.0]
20
+
21
+ # Define custom colors for the bars
22
+ colors = ["#4c72b0", "#55a868", "#c44e52", "#8172b2"] # Adjust as needed
23
+
24
+ plt.figure(figsize=(14, 12))
25
+ ax = sns.barplot(
26
+ x=categories, y=values, alpha=0.9, palette=colors, edgecolor="black"
27
+ )
28
+ for container in ax.containers:
29
+ ax.bar_label(container, label_type="edge", fontsize="x-large", fmt="%.2f")
30
+
31
+ # Adding title and labels
32
+ plt.xlabel("Setting", fontsize=40)
33
+ plt.ylabel("Validation Perplexity", fontsize=40)
34
+ plt.xticks(fontsize=30)
35
+ plt.yticks(fontsize=30)
36
+ plt.legend(fontsize="small", title_fontsize="small", loc="lower left")
37
+
38
+ # Remove the borders
39
+ sns.despine(left=True, bottom=True)
40
+
41
+ # Display the plot
42
+ plt.tight_layout()
43
+ plt.savefig(f"output/model_ablation.png", dpi=300) # Save the figure in high resolution
44
+ plt.show()
common/plot/plot_pretrain_ablation_mar.py ADDED
@@ -0,0 +1,45 @@
1
+ import seaborn as sns
2
+ import matplotlib
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ font = {
8
+ "family": "normal",
9
+ "size": 22,
10
+ }
11
+
12
+ matplotlib.rc("font", **font)
13
+ sns.set(rc={"font.family": "Times New Roman"})
14
+ sns.set(style="whitegrid")
15
+ sns.set(font_scale=3, style="whitegrid")
16
+
17
+ # Sample data for plotting
18
+ categories = ["Scratch", "Passive Pre-Train", "Pre-Train", "Pre-Train (Large)"]
19
+ values = [1.0, 1.0, 1.0, 1.0]
20
+
21
+ # Define custom colors for the bars
22
+ colors = ["#4c72b0", "#55a868", "#c44e52", "#8172b2"] # Adjust as needed
23
+
24
+ plt.figure(figsize=(14, 12))
25
+ ax = sns.barplot(
26
+ x=categories, y=values, alpha=0.9, palette=colors, edgecolor="black"
27
+ )
28
+ for container in ax.containers:
29
+ ax.bar_label(container, label_type="edge", fontsize="x-large", fmt="%.2f")
30
+
31
+ # Adding title and labels
32
+ plt.xlabel("Setting", fontsize=40)
33
+ plt.ylabel("Validation Perplexity", fontsize=40)
34
+ plt.xticks(fontsize=30)
35
+ ax.tick_params(axis='x', rotation=15)
36
+ plt.yticks(fontsize=30)
37
+ plt.legend(fontsize="small", title_fontsize="small", loc="lower left")
38
+
39
+ # Remove the borders
40
+ sns.despine(left=True, bottom=True)
41
+
42
+ # Display the plot
43
+ plt.tight_layout()
44
+ plt.savefig(f"output/model_ablation.png", dpi=300) # Save the figure in high resolution
45
+ plt.show()
cont_data.py ADDED
@@ -0,0 +1,245 @@
1
+ import json
2
+ import math
3
+ import os
4
+ import random
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import torch
9
+ from einops import rearrange
10
+ from torch.utils.data import Dataset as TorchDataset
11
+
12
+ from datasets.encode_openx_dataset import DATA_FREQ_TABLE
13
+ from genie.config import GenieConfig
14
+ from genie.st_mask_git import cosine_schedule
15
+
16
+ SVD_SCALE = 0.18215
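+ # scaling applied to the continuous VAE latents in __getitem__; 0.18215 matches the latent scale factor commonly used with Stable Diffusion VAEs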
17
+
18
+ def normalize_actions(actions):
19
+ """
20
+ Compute the per-dimension mean and std of the actions; the normalization itself is applied inside the network.
21
+ """
22
+ mean = np.mean(actions, axis=0).tolist()
23
+ std = np.std(actions, axis=0).tolist()
24
+ return actions, [mean, std]
25
+
26
+
27
+ class RawFeatureDataset(TorchDataset):
28
+ """ Loads raw float32 tokens as memmap-backed array """
29
+ def __init__(
30
+ self,
31
+ data_dir,
32
+ window_size,
33
+ stride=1,
34
+ filter_interrupts=True,
35
+ filter_overlaps=False,
36
+ use_actions=False,
37
+ max_traj_num=1000000,
38
+ compute_stride_from_freq_table=True,
39
+ natural_hz=2,
40
+ datio_noise_ratio=0.0,
41
+ use_raw_image_as_latent=False,
42
+ domain=None,
43
+ ):
44
+ """
45
+ Args:
46
+ data_dir: directory with the same format as `data/train_v0` and `data/val_v0`.
47
+ Notably, has `video.bin` and `metadata.json`
48
+ window_size: number of frames per "video" sequence
49
+ stride: frame skip
50
+ filter_interrupts: Under 3% of training frame sequences are the concatenation of two different clips.
51
+ If filter_interrupts is True, will filter out these sequences using the segment ids.
52
+ filter_overlaps: If False (default), one frame will appear in multiple examples;
53
+ e.g. frame 0 might appear as the first frame in example 0 and also the second frame in example 15.
54
+ If True, will filter out examples so that each frame appears at most once in the dataset.
55
+ use_actions: If True, will load the actions from the `actions` folder for the models
56
+ """
57
+ data_dir = Path(data_dir)
58
+ with open(data_dir / "metadata.json") as f:
59
+ self.metadata = json.load(f)
60
+
61
+ # TODO: assert not quantized in metadata
62
+ shape = (self.metadata["num_images"], self.metadata.get("latent_channels", 4), self.metadata["h"], self.metadata["w"]) #
63
+ print("token shape:", shape)
64
+ self.use_raw_image_as_latent = use_raw_image_as_latent
65
+ if use_raw_image_as_latent:
66
+ shape = (shape[0], 3, shape[2], shape[3])
67
+ # resize to 32x32
68
+
69
+ video_tokens_path, segment_ids_path, action_tokens_path = [data_dir / f"{name}.bin"
70
+ for name in ["video", "segment_ids", "actions"]]
71
+
72
+ token_dtype = np.dtype(self.metadata.get("token_dtype", "float16"))
73
+ self.data = np.memmap(video_tokens_path, mode="r", shape=shape, dtype=token_dtype)
74
+ print("data nan:", torch.isnan(torch.from_numpy(self.data[:100].copy())).sum())
75
+ # import IPython; IPython.embed()
76
+ if use_raw_image_as_latent:
77
+ # debug for robomimic dataset
78
+ # downsample raw images (e.g. 256x256) to the 32x32 resolution set below
79
+ self.metadata["h"] = 32
80
+ self.metadata["w"] = 32
81
+ self.metadata["latent_channels"] = 3
82
+
83
+ self.window_size, self.stride = window_size, stride
84
+ self.datio_noise_ratio = datio_noise_ratio
85
+
86
+ if domain is not None: # TODO: remove
87
+ self.name = domain
88
+ else:
89
+ self.name = self.metadata["name"]
90
+
91
+ self.name = self.name.replace("_noquant", "")
92
+ self.stride = stride
93
+ if compute_stride_from_freq_table:
94
+ self.stride = max(DATA_FREQ_TABLE.get(self.name, 1) // natural_hz, 1)
95
+ self.n_action = self.metadata.get("action_dim", 1) * (self.stride)
96
+
97
+ if use_actions:
98
+ actions = []
99
+
100
+ # hack here for the separations in the 1x datasets
101
+ for action_file in sorted((data_dir / "actions").iterdir()):
102
+ actions.append(np.memmap(action_file, dtype=np.float32, mode="r").reshape(len(self.data), -1))
103
+
104
+ self.actions = np.concatenate(actions, axis=-1)
105
+ self.actions, self.action_stat = normalize_actions(self.actions)
106
+
107
+ if os.path.isfile(segment_ids_path):
108
+ self.segment_ids = np.memmap(
109
+ segment_ids_path,
110
+ dtype=np.int32,
111
+ mode="r",
112
+ shape=(self.metadata["num_images"],)
113
+ )
114
+ else:
115
+ self.segment_ids = None
116
+ if filter_interrupts:
117
+ raise NotImplementedError("Cannot filter interrupted sequences without segment ids.")
118
+
119
+ # Number of frames between the first and last frames of a video sequence (excluding one endpoint frame)
120
+ self.video_len = (self.window_size - 1) * self.stride
121
+ self.valid_start_inds = []
122
+
123
+ for start_ind in range(len(self.data) - self.video_len - self.stride):
124
+ # Assuming `segment_ids` is monotonically increasing, a sequence is interrupted (or too short)
125
+ # if the first and last frames have different segment ids.
126
+ if not (filter_interrupts and self.segment_ids[start_ind] != self.segment_ids[start_ind + self.video_len]):
127
+ self.valid_start_inds.append(start_ind)
128
+
129
+ if len(self.valid_start_inds) >= max_traj_num:
130
+ break
131
+
132
+ if filter_overlaps:
133
+ # Instead of using a sliding window, use each frame at most once
134
+ filtered_start_inds = []
135
+ for start_ind in self.valid_start_inds:
136
+ overlapping_start_inds = {start_ind - i * self.stride for i in range(1, self.window_size)}
137
+ # all sequences from `overlapping_start_inds` will also contain `start_ind`,
138
+ # so exclude sequence starting from `start_ind` if any of `overlapping_start_inds` is already being used
139
+ for existing_start_ind in filtered_start_inds[-self.window_size * self.stride:]:
140
+ # Bound could be improved
141
+ if existing_start_ind in overlapping_start_inds:
142
+ break
143
+ else:
144
+ filtered_start_inds.append(start_ind)
145
+
146
+ self.valid_start_inds = filtered_start_inds
147
+
148
+ num_videos = len(np.unique(self.segment_ids))
149
+ print(f"Loaded {len(self)} sequences from {data_dir} {self.stride=} {self.window_size=} {self.n_action=} {num_videos=}")
150
+
151
+ def __len__(self):
152
+ return len(self.valid_start_inds)
153
+
154
+ def __getitem__(self, idx):
155
+ """
156
+ Returns a flattened sequence of tokens representing `self.window_size` frames,
157
+ spaced `self.stride` apart.
158
+ """
159
+ start_ind = self.valid_start_inds[idx]
160
+ x = self.data[start_ind : start_ind + self.video_len + 1 : self.stride].copy()
161
+ x = torch.FloatTensor(x).float()
162
+ if self.use_raw_image_as_latent:
163
+ x = torch.nn.functional.interpolate(x, size=(self.metadata["h"], self.metadata["w"]))
164
+ # normalize
165
+ x = x / 255 - 0.5
166
+ else:
167
+ x = x * SVD_SCALE
168
+
169
+ x = rearrange(x, "t c h w -> (t h w) c")
170
+ # divide it when decoding
171
+ # reconstructions since the input ids and the labels are the same
172
+ attention_mask = torch.ones_like(x)
173
+ data_dict = {
174
+ "input_ids": x,
175
+ "labels": x,
176
+ "attention_mask": attention_mask,
177
+ "h": self.metadata["h"],
178
+ "w": self.metadata["w"],
179
+ "c": self.metadata["latent_channels"],
180
+ }
181
+ if hasattr(self, "actions"):
182
+ # we want to have all actions within the stride to predict the next frame at the end of the stride
183
+ # we will concatenate the actions from [window_size, d_action] to [window_size, d_action * stride]
184
+ data_dict['action_ids'] = self.actions[start_ind:start_ind + self.video_len + self.stride].reshape(self.window_size, -1)
185
+ data_dict['action_ids'] = torch.from_numpy(data_dict['action_ids'].astype(np.float32))
186
+
187
+ data_dict["domain"] = self.name.replace("_noquant", "")
188
+ return data_dict
189
+
190
+
191
+ def get_maskgit_collator_feature(config: GenieConfig):
192
+ # mask_token_id = config.image_vocab_size
193
+
194
+ def collate_fn(features) -> dict[str, torch.Tensor]:
195
+ # during training, map (z_0, z_1', z_2') -> (null, z_1, z_2)
196
+ # (z_0, z_1') -> (null, z_1) is the diffusion operator on z_1' -> z_1
197
+
198
+ h = features[0]["h"]
199
+ w = features[0]["w"]
200
+ input_ids = torch.stack([ex["input_ids"] for ex in features])
201
+ device = input_ids.device
202
+ x_THWC = rearrange(input_ids, "b (t h w) c -> b t h w c", b=len(features), t=config.T, h=h, w=w)
203
+ labels = x_THWC.clone()
204
+ first_masked_frame = config.T
205
+
206
+ mask = torch.zeros(1).long()
207
+ mask_token_indicator = torch.zeros((len(features), config.T, h, w)).long()
208
+
209
+ if config.dataloader_apply_mask:
210
+ if random.random() < config.non_mlm_ratio: # Closer to autoregressive inference
211
+ # Leave frames [0, first_masked_frame) unmasked.
212
+ first_masked_frame = random.randint(config.num_prompt_frames, config.T - 1)
213
+ else: # Typical MLM masking
214
+ first_masked_frame = 1
215
+
216
+ c = 0
217
+ while mask.max() == 0: # We could get unlucky and mask no tokens?
218
+ # per-minibatch, per-frame masking probability (could try variable masking rate from MUSE)
219
+ rand = torch.rand(len(features), config.T - first_masked_frame, 1, 1)
220
+ # add a minimum mask ratio
221
+ rand_mask = rand * (1 - config.dataloader_mask_ratio_min) + config.dataloader_mask_ratio_min
222
+ mask_prob_T = cosine_schedule(rand_mask)
223
+ r = torch.rand_like(x_THWC[:, first_masked_frame:, ..., 0], dtype=torch.float)
224
+ mask = r < mask_prob_T
225
+ c += 1
226
+
227
+ if c > 1:
228
+ print(f"Generated mask {c} > 1 times.")
229
+
230
+ mask_token_indicator = torch.cat([
231
+ torch.zeros((len(features), first_masked_frame, h, w), dtype=mask.dtype), mask], dim=1)
232
+
233
+ data_dict = {
234
+ "input_ids": rearrange(x_THWC, "b t h w c -> b (t h w) c"),
235
+ "labels": rearrange(labels, "b t h w c-> b (t h w) c"),
236
+ "masked_tokens_indicator": mask_token_indicator,
237
+ }
238
+
239
+ if "action_ids" in features[0]:
240
+ data_dict['action_ids'] = torch.stack([ex["action_ids"] for ex in features])
241
+ data_dict['domain'] = [ex["domain"] for ex in features]
242
+ data_dict['h'] = [ex["h"] for ex in features]
243
+ data_dict['w'] = [ex["w"] for ex in features]
244
+ return data_dict
245
+ return collate_fn
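
For orientation, a minimal usage sketch of `RawFeatureDataset` from the file above. The data path and `window_size` are placeholders rather than values taken from this commit; any directory with the layout written by the encoding scripts should work.

```python
from cont_data import RawFeatureDataset

# Placeholder path: any directory containing video.bin, metadata.json, segment_ids.bin
# and an actions/ folder in the format written by the encoding scripts should work.
dataset = RawFeatureDataset(data_dir="data/robomimic_temporalvae_val",
                            window_size=12, use_actions=True)
example = dataset[0]
print(example["input_ids"].shape)   # (window_size * h * w, latent_channels)
print(example["action_ids"].shape)  # (window_size, action_dim * stride)

# Batches are built with get_maskgit_collator_feature(config), where `config` is a
# GenieConfig defining T, num_prompt_frames and the masking ratios used above.
```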
data.py ADDED
@@ -0,0 +1,240 @@
1
+ import json
2
+ import math
3
+ import os
4
+ import random
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import torch
9
+ from einops import rearrange
10
+ from torch.utils.data import Dataset as TorchDataset
11
+
12
+ from datasets.encode_openx_dataset import DATA_FREQ_TABLE
13
+ from genie.factorization_utils import factorize_token_ids, unfactorize_token_ids
14
+ from genie.config import GenieConfig
15
+ from genie.st_mask_git import cosine_schedule
16
+
17
+
18
+ def normalize_actions(actions: np.ndarray) -> tuple[np.ndarray, list[list[float]]]:
19
+ """
20
+ Compute the mean and std of the actions. The normalization itself is applied inside the network.
21
+ """
22
+ mean = np.mean(actions, axis=0).tolist()
23
+ std = np.std(actions, axis=0).tolist()
24
+ return actions, [mean, std]
25
+
26
+
27
+ class RawTokenDataset(TorchDataset):
28
+ """ Loads raw uint32 tokens as memmap-backed array """
29
+ def __init__(
30
+ self,
31
+ data_dir,
32
+ window_size,
33
+ stride=1,
34
+ filter_interrupts=True,
35
+ filter_overlaps=False,
36
+ use_actions=False,
37
+ name='',
38
+ max_traj_num=1000000,
39
+ compute_stride_from_freq_table=True,
40
+ natural_hz=2,
41
+ drop_action_ratio=0.0
42
+ ):
43
+ """
44
+ Args:
45
+ data_dir: directory with the same format as `data/train_v0` and `data/val_v0`.
46
+ Notably, has `video.bin` and `metadata.json`
47
+ window_size: number of frames per "video" sequence
48
+ stride: frame skip
49
+ filter_interrupts: Under 3% of training frame sequences are the concatenation of two different clips.
50
+ If filter_interrupts is True, will filter out these sequences using the segment ids.
51
+ filter_overlaps: If False (default), one frame will appear in multiple examples;
52
+ e.g. frame 0 might appear as the first frame in example 0 and also the second frame in example 15.
53
+ If True, will filter out examples so that each frame appears at most once in the dataset.
54
+ use_actions: If True, will load the actions from the `actions` folder for the models
55
+ name: the name of the dataset
56
+
57
+ """
58
+ data_dir = Path(data_dir)
59
+ with open(data_dir / "metadata.json") as f:
60
+ self.metadata = json.load(f)
61
+
62
+ shape = (self.metadata["num_images"], self.metadata["h"], self.metadata["w"]) # self.metadata["s"], self.metadata["s"]
63
+ video_tokens_path, segment_ids_path, action_tokens_path = [data_dir / f"{name}.bin"
64
+ for name in ["video", "segment_ids", "actions"]]
65
+ token_dtype = np.dtype(self.metadata.get("token_dtype", "uint32"))
66
+ self.data = np.memmap(video_tokens_path, dtype=token_dtype, mode="r", shape=shape)
67
+ self.window_size, self.stride = window_size, stride
68
+
69
+ if len(name) == 0:
70
+ self.name = self.metadata["name"]
71
+ else: # remove later
72
+ self.name = name
73
+
74
+ if compute_stride_from_freq_table:
75
+ self.stride = max(DATA_FREQ_TABLE.get(self.name, 1) // natural_hz, 1)
76
+ print(f"RawTokenDataset: {self.name=} {self.stride=}")
77
+
78
+ self.n_action = self.metadata.get("action_dim", 1) * (self.stride)
79
+ self.drop_action_ratio = drop_action_ratio
80
+
81
+ if use_actions:
82
+ actions = []
83
+
84
+ # hack here for the separations in the 1x datasets
85
+ for action_file in sorted((data_dir / "actions").iterdir()):
86
+ actions.append(np.memmap(action_file, dtype=np.float32, mode="r").reshape(len(self.data), -1))
87
+
88
+ self.actions = np.concatenate(actions, axis=-1)
89
+ self.actions, self.action_stat = normalize_actions(self.actions)
90
+
91
+ if os.path.isfile(segment_ids_path):
92
+ self.segment_ids = np.memmap(
93
+ segment_ids_path,
94
+ dtype=np.int32,
95
+ mode="r",
96
+ shape=(self.metadata["num_images"],)
97
+ )
98
+ else:
99
+ self.segment_ids = None
100
+ if filter_interrupts:
101
+ raise NotImplementedError("Cannot filter interrupted sequences without segment ids.")
102
+
103
+ # Number of frames between the first and last frames of a video sequence (excluding one endpoint frame)
104
+ self.video_len = (self.window_size - 1) * self.stride
105
+
106
+ self.valid_start_inds = []
107
+ for start_ind in range(len(self.data) - self.video_len - self.stride):
108
+ # Assuming `segment_ids` is monotonically increasing, a sequence is interrupted (or too short)
109
+ # if the first and last frames have different segment ids.
110
+ if not (filter_interrupts and self.segment_ids[start_ind] != self.segment_ids[start_ind + self.video_len]):
111
+ self.valid_start_inds.append(start_ind)
112
+
113
+ if self.segment_ids is not None and self.segment_ids[start_ind] >= max_traj_num: # because we will filter based on window size later
114
+ # len(self.valid_start_inds) >= max_traj_num
115
+ break
116
+
117
+ if filter_overlaps:
118
+ # Instead of using a sliding window, use each frame at most once
119
+ filtered_start_inds = []
120
+ for start_ind in self.valid_start_inds:
121
+ overlapping_start_inds = {start_ind - i * self.stride for i in range(1, self.window_size)}
122
+ # all sequences from `overlapping_start_inds` will also contain `start_ind`,
123
+ # so exclude sequence starting from `start_ind` if any of `overlapping_start_inds` is already being used
124
+ for existing_start_ind in filtered_start_inds[-self.window_size * self.stride:]:
125
+ # Bound could be improved
126
+ if existing_start_ind in overlapping_start_inds:
127
+ break
128
+ else:
129
+ filtered_start_inds.append(start_ind)
130
+
131
+ self.valid_start_inds = filtered_start_inds
132
+
133
+ self.num_videos = len(np.unique(self.segment_ids)) if self.segment_ids is not None else len(self.valid_start_inds)  # distinct trajectories when segment ids exist
134
+ print(f"Loaded {len(self)} sequences from {data_dir} {self.stride=} {self.window_size=} {self.n_action=} {self.num_videos=}")
135
+
136
+ def __len__(self):
137
+ return len(self.valid_start_inds)
138
+
139
+ def __getitem__(self, idx):
140
+ """
141
+ Returns a flattened sequence of tokens representing `self.window_size` frames,
142
+ spaced `self.stride` apart.
143
+ """
144
+ start_ind = self.valid_start_inds[idx]
145
+ x = torch.from_numpy((self.data[start_ind : start_ind + self.video_len + 1 : self.stride]).astype(np.int64))
146
+ x = x.flatten() # 16 x 16 x 16
147
+
148
+ # reconstructions since the input ids and the labels are the same
149
+ attention_mask = torch.ones_like(x)
150
+ data_dict = {
151
+ "input_ids": x,
152
+ "labels": x,
153
+ "attention_mask": attention_mask,
154
+ "h": self.metadata["h"],
155
+ "w": self.metadata["w"],
156
+ }
157
+ if hasattr(self, "actions") and np.random.uniform() > self.drop_action_ratio:
158
+ # we want to have all actions within the stride to predict the next frame at the end of the stride
159
+ # we will concatenate the actions from [window_size, d_action] to [window_size, d_action * stride]
160
+ # S x T x d_action
161
+ data_dict['action_ids'] = self.actions[start_ind:start_ind + self.video_len + self.stride].reshape(self.window_size, -1)
162
+ data_dict['action_ids'] = torch.from_numpy(data_dict['action_ids'].astype(np.float32))
163
+
164
+ data_dict["domain"] = self.name
165
+ return data_dict
166
+
167
+
168
+ def get_maskgit_collator(config: GenieConfig):
169
+ mask_token_id = config.image_vocab_size
170
+ # h = w = math.isqrt(config.S)
171
+
172
+ def collate_fn(features) -> dict[str, torch.Tensor]:
173
+ # during training, map (z_0, z_1', z_2') -> (null, z_1, z_2)
174
+ # (z_0, z_1') -> (null, z_1) is the diffusion operator on z_1' -> z_1
175
+ h = features[0]["h"]
176
+ w = features[0]["w"]
177
+ input_ids = torch.stack([ex["input_ids"] for ex in features])
178
+ device = input_ids.device
179
+ x_THW = rearrange(input_ids, "b (t h w) -> b t h w", b=len(features), t=config.T,
180
+ h=h, w=w)
181
+ x_THWC = factorize_token_ids(x_THW, config.num_factored_vocabs, config.factored_vocab_size)
182
+ labels = x_THW.clone()
183
+
184
+ if config.dataloader_apply_corruption:
185
+ # As done in Copilot-4D paper, add random noise sampled with a random rate between 0% and `config.max_corrupt_rate`
186
+ r = torch.rand(x_THWC.size(), device=device)
187
+ u01 = torch.rand((), device=device)
188
+ random_patches_mask = r < config.max_corrupt_rate * u01
189
+ random_values = torch.randint(low=0, high=config.factored_vocab_size, size=x_THWC.size(),
190
+ dtype=torch.long, device=device)
191
+ x_THWC[random_patches_mask] = random_values[random_patches_mask]
192
+
193
+ if random.random() < config.non_mlm_ratio: # Closer to autoregressive inference
194
+ # Leave frames [0, first_masked_frame) unmasked.
195
+ # first_masked_frame = random.randint(config.num_prompt_frames, config.T - 1)
196
+ first_masked_frame = random.randint(config.num_prompt_frames, config.T - 1)
197
+ x_THWC_view = x_THWC[:, first_masked_frame:]
198
+
199
+ # Arbitrary numbers here, but corrupting later frames more
200
+ # since we likely have compounding errors.
201
+ correct_rate = random.uniform(config.dataloader_mask_ratio_min, 1.0)
202
+ for i in range(x_THWC_view.size(1)):
203
+ correct_rate *= random.uniform(0.9, 1.0)
204
+ r = torch.rand((len(features), h, w, config.num_factored_vocabs), device=device)
205
+ random_patches_mask = r > correct_rate
206
+ x_THWC_view[:, i][random_patches_mask] = random_values[:, first_masked_frame + i][random_patches_mask]
207
+ else: # Typical MLM masking
208
+ first_masked_frame = 1
209
+
210
+ mask = torch.zeros(1)
211
+ if config.dataloader_apply_mask:
212
+ c = 0
213
+
214
+ while mask.max() == 0: # We could get unlucky and mask no tokens?
215
+ # per-minibatch, per-frame masking probability (could try variable masking rate from MUSE)
216
+ mask_prob_T = cosine_schedule(torch.rand(len(features), config.T - first_masked_frame, 1, 1))
217
+ r = torch.rand_like(x_THW[:, first_masked_frame:], dtype=torch.float)
218
+ mask = r < mask_prob_T
219
+ c += 1
220
+
221
+ if c > 1:
222
+ print(f"Generated mask {c} > 1 times.")
223
+
224
+ x_THW = unfactorize_token_ids(x_THWC, config.num_factored_vocabs, config.factored_vocab_size)
225
+ x_THW[:, first_masked_frame:][mask] = mask_token_id
226
+
227
+ data_dict = {
228
+ "input_ids": rearrange(x_THW, "b t h w -> b (t h w)"),
229
+ "labels": rearrange(labels, "b t h w -> b (t h w)"),
230
+ }
231
+
232
+ if "action_ids" in features[0]:
233
+ data_dict['action_ids'] = torch.stack([ex["action_ids"] for ex in features])
234
+ data_dict['domain'] = [ex["domain"] for ex in features]
235
+ data_dict['h'] = [ex["h"] for ex in features]
236
+ data_dict['w'] = [ex["w"] for ex in features]
237
+ return data_dict
238
+
239
+
240
+ return collate_fn
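
The collator above relies on `factorize_token_ids` / `unfactorize_token_ids` from `genie.factorization_utils`. As a self-contained illustration of the idea, splitting each token id into `num_factored_vocabs` digits of a base-`factored_vocab_size` number, here is a NumPy sketch; the repo's own helpers operate on torch tensors and may differ in digit ordering.

```python
import numpy as np

def factorize(ids, num_factored_vocabs, factored_vocab_size):
    """Split each token id into base-`factored_vocab_size` digits (least significant first)."""
    digits = []
    for _ in range(num_factored_vocabs):
        digits.append(ids % factored_vocab_size)
        ids = ids // factored_vocab_size
    return np.stack(digits, axis=-1)

def unfactorize(digits, num_factored_vocabs, factored_vocab_size):
    """Inverse of factorize: recombine the digits into a single token id."""
    ids = np.zeros(digits.shape[:-1], dtype=np.int64)
    for k in reversed(range(num_factored_vocabs)):
        ids = ids * factored_vocab_size + digits[..., k]
    return ids

ids = np.array([0, 5, 2**18 - 1])   # 2**18 is the magvit vocab size written by the encoders
digits = factorize(ids, 2, 512)     # e.g. two factored vocabularies of size 512 (512**2 == 2**18)
assert (unfactorize(digits, 2, 512) == ids).all()
```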
datasets/.DS_Store ADDED
Binary file (6.15 kB).
 
datasets/__init__.py ADDED
File without changes
datasets/encode_extern_dataset.py ADDED
@@ -0,0 +1,291 @@
1
+ # --------------------------------------------------------
2
+ # Licensed under The MIT License [see LICENSE for details]
3
+ # --------------------------------------------------------
4
+ import argparse
5
+ import json
6
+ import os
7
+ import time
8
+ import traceback
9
+ from typing import Optional
10
+
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
+ from datasets.encode_openx_dataset import MIN_VAL_EXAMPLES, MAX_VAL_EXAMPLES, get_shard_inds, VAL_RATIO, \
15
+ process_dataset_step, DATA_FREQ_TABLE
16
+ from datasets.extern.ego4d import ego4d_dataset_size, ego4d_dataset_generator
17
+ from datasets.extern.egoexo4d import egoexo4d_dataset_size, egoexo4d_dataset_generator
18
+ from datasets.extern.robomimic import robomimic_dataset_generator, robomimic_dataset_size
19
+ from . import utils
20
+
21
+
22
+ SCRIPT_DESCRIPTION="""
23
+ Similar to encode_openx_dataset.py except for non-OpenX datasets.
24
+ Again, each split can be partitioned into multiple shards,
25
+ which is useful for parallelized encoding across GPUs.
26
+
27
+ Example usage:
28
+ CUDA_VISIBLE_DEVICES=0 python -m datasets.encode_extern_dataset --dataset_name egoexo4d --data_split train --num_shards 1000 --curr_shard_rank 400
29
+
30
+ Untested usage (SVD tokenizer):
31
+ CUDA_VISIBLE_DEVICES=0 python -m datasets.encode_extern_dataset --dataset_name robomimic --data_split val --no_quantization --encoder_type temporalvae --encoder_name_or_path 'stabilityai/stable-video-diffusion-img2vid'
32
+ """.strip()
33
+
34
+ DATASET_TO_GEN_AND_SIZE = {
35
+ "ego4d": (ego4d_dataset_generator, ego4d_dataset_size),
36
+ "egoexo4d": (egoexo4d_dataset_generator, egoexo4d_dataset_size),
37
+ "robomimic": (robomimic_dataset_generator, robomimic_dataset_size),
38
+ }
39
+
40
+
41
+ def encode_dataset_split(
42
+ extern_dataset_name: str,
43
+ split: str,
44
+ max_episodes: Optional[int],
45
+ original_res: bool,
46
+ no_quantization: bool,
47
+ curr_shard_rank: int,
48
+ num_shards: int,
49
+ root_dir: str,
50
+ encoder_type: str,
51
+ encoder_name_or_path: str,
52
+ dataset_postfix: str = "",
53
+ no_encoding: bool = False,
54
+ ):
55
+ """
56
+ Encodes (e.g. tokenizes) dataset.
57
+ The data written to disk can be used to load a `RawTokenDataset` (or the continuous version.)
58
+
59
+ Args:
60
+ extern_dataset_name: the name of the external dataset; must be a key of DATASET_TO_GEN_AND_SIZE.
61
+ split: expected to be either "train" or "val". TODO: decide how to split
62
+ max_episodes: the maximum number of trajectories to include in the dataset.
63
+ dataset_postfix: will be a suffix of the output dirname.
64
+ encoder_type: string specifying the type of image encoder/tokenizer to use.
65
+ original_res: if True, will maintain original resolution of the video rather than resizing it to 256x256.
66
+ no_quantization: if True, will not perform quantization step in image encoder.
67
+ """
68
+ extern_dataset_name = extern_dataset_name.strip() # never modified
69
+ suffixed_dataset_name = extern_dataset_name # will modify later
70
+
71
+ if original_res:
72
+ suffixed_dataset_name = f"{suffixed_dataset_name}_originalres"
73
+ if no_quantization:
74
+ suffixed_dataset_name = f"{suffixed_dataset_name}_noquant"
75
+ if no_encoding:
76
+ suffixed_dataset_name = f"{suffixed_dataset_name}_noencoding"
77
+ save_dirname = "_".join([suffixed_dataset_name, encoder_type, dataset_postfix, split])
78
+ dataset_path = os.path.join(root_dir, save_dirname)
79
+ print("=" * 25)
80
+ print(f"{dataset_path=}")
81
+ utils.mkdir_if_missing(dataset_path)
82
+
83
+ # Load data
84
+ generator, size_func = DATASET_TO_GEN_AND_SIZE[extern_dataset_name]
85
+ num_examples = size_func()
86
+ if max_episodes is not None:
87
+ num_examples = min(num_examples, max_episodes) # clip num_examples
88
+
89
+ # We will only operate on a subset of the training examples, depending on:
90
+ # 1) The split (train/val). Some examples are reserved for the other split.
91
+ # 2) Sharding
92
+ assert num_examples > MIN_VAL_EXAMPLES # non-positive number of train examples otherwise
93
+ num_val_examples = np.clip(int(VAL_RATIO * num_examples), MIN_VAL_EXAMPLES, MAX_VAL_EXAMPLES)
94
+
95
+ if split == "train": # first_ind inclusive, last_ind exclusive
96
+ first_split_ind, last_split_ind = num_val_examples, num_examples
97
+ elif split == "val":
98
+ first_split_ind, last_split_ind = 0, num_val_examples
99
+ else:
100
+ raise NotImplementedError(f"{split=}")
101
+
102
+ first_shard_ind, last_shard_ind = get_shard_inds(first_split_ind, last_split_ind, curr_shard_rank, num_shards)
103
+ print(f"Total number of examples in {suffixed_dataset_name}: {num_examples}")
104
+ print(f"Number of examples for {split=}, shard {curr_shard_rank} of {num_shards}: "
105
+ f"{last_shard_ind - first_shard_ind}. {first_shard_ind=} {last_shard_ind=}")
106
+
107
+ ##### Encode data #####
108
+ traj_lens = [] # only used to print statistics
109
+ videos = [] # NOTE: videos/actions for the entire shard are stored in RAM until the end
110
+ actions = []
111
+ segment_ids = []
112
+
113
+ # split based on some fixed batch sizes to reset RAM.
114
+ max_batch_per_loading = 10
115
+ pbar = tqdm(range(first_shard_ind, last_shard_ind, max_batch_per_loading), position=0, leave=True)
116
+ start_time = time.time()
117
+
118
+ for start_idx in pbar:
119
+ end_idx = min(start_idx + max_batch_per_loading, last_shard_ind)
120
+ pbar.set_description(f"{suffixed_dataset_name} caching episodes: {start_idx}:{end_idx}")
121
+ ds = generator(range(start_idx, end_idx))
122
+
123
+ for chunk_idx, episode in enumerate(tqdm(ds, position=1, leave=False)):
124
+ segment_id = start_idx + chunk_idx
125
+ try:
126
+ # batchify the data and then process
127
+ for step_ind, step_data in enumerate(episode["steps"]):
128
+ dataset_step = process_dataset_step(
129
+ step_data,
130
+ encoder_type=encoder_type,
131
+ encoder_name_or_path=encoder_name_or_path,
132
+ keep_res=original_res,
133
+ quantize=not no_quantization,
134
+ no_encoding=no_encoding
135
+ )
136
+
137
+ segment_ids.append(segment_id)
138
+ videos.append(dataset_step["image"])
139
+ actions.append(dataset_step["action"])
140
+
141
+ traj_lens.append(step_ind + 1) # number of steps in this trajectory
142
+ except Exception:
143
+ print("-" * 25)
144
+ print(f"Add episode failed: {segment_id=}", traceback.format_exc(), suffixed_dataset_name)
145
+
146
+ # 2 day timeout
147
+ if time.time() - start_time > 86400 * 2:
148
+ print(f"Writing dataset {suffixed_dataset_name} timed out")
149
+ break
150
+
151
+ if len(videos) == 0:
152
+ print("Empty shard!")
153
+ with open(f"{dataset_path}/error.json", "w") as f:
154
+ json.dump({"status": "empty_shard"}, f)
155
+
156
+ return
157
+
158
+ if no_quantization:
159
+ num_channels, height, width = videos[-1].shape[:3] # channels-first latents; saved as "latent_channels" in metadata below
160
+ else:
161
+ height, width = videos[-1].shape[:2]
162
+ num_channels = None
163
+
164
+ ##### Write videos, actions, segment_ids, and metadata #####
165
+ # align format to save segment_ids.bin, video.bin, actions/action.bin, metadata.json
166
+ # save videos
167
+ videos = np.stack(videos, axis=0)
168
+ # fp = np.memmap(f'{dataset_path}/video.bin', dtype=video_dtype, mode='w+', shape=videos.shape)
169
+ # fp[:] = videos[:]
170
+ videos.tofile(f'{dataset_path}/video.bin')
171
+
172
+ # save action
173
+ utils.mkdir_if_missing(f'{dataset_path}/actions')
174
+ actions = np.stack(actions, axis=0)
175
+ # fp = np.memmap(f'{dataset_path}/actions/actions.bin', dtype=np.float32, mode='w+', shape=actions.shape)
176
+ # fp[:] = actions[:]
177
+ actions = actions.astype(np.float32)
178
+ actions.tofile(f'{dataset_path}/actions/actions.bin')
179
+
180
+ # save segment_ids
181
+ segment_ids = np.array(segment_ids)
182
+ # fp = np.memmap(f'{dataset_path}/segment_ids.bin', dtype=np.int32, mode='w+', shape=segment_ids.shape)
183
+ # fp[:] = segment_ids[:] # map to trajectory index
184
+ segment_ids = segment_ids.astype(np.int32)
185
+ segment_ids.tofile(f'{dataset_path}/segment_ids.bin')
186
+
187
+ # feature_mean = np.mean(videos)
188
+ # feature_std = np.std((videos - feature_mean) / 1e9) * 1e9
189
+
190
+ # save metadata
191
+ if encoder_type == "magvit":
192
+ vocab_size = int(2 ** 18)
193
+ elif encoder_type == "temporalvae":
194
+ vocab_size = None
195
+ else:
196
+ raise NotImplementedError(f"{encoder_type=}")
197
+
198
+ with open(f'{dataset_path}/metadata.json', 'w') as f: # Technically only need to save most of this data for shard 0
199
+ json.dump({
200
+ "token_dtype": str(np.dtype(videos.dtype)),
201
+ "action_dim": actions[0].shape[-1],
202
+ "s": 16,
203
+ "h": height,
204
+ "w": width,
205
+ "vocab_size": vocab_size,
206
+ "hz": DATA_FREQ_TABLE.get(extern_dataset_name, 1), # to be loaded from the data code
207
+ "encoder_name_or_path": encoder_name_or_path,
208
+ "encoder_type": encoder_type,
209
+ "num_images": len(videos),
210
+ "latent_channels": num_channels,
211
+ "name": extern_dataset_name,
212
+ # "feature_mean": feature_mean,
213
+ # "feature_std": feature_std,
214
+ }, f)
215
+
216
+ print(f"{len(traj_lens)=} {np.mean(traj_lens)=} {np.sum(traj_lens)=}")
217
+ print(f"Dataset creation time: {time.time() - start_time:.3f}")
218
+
219
+
220
+ def parse_args():
221
+ parser = argparse.ArgumentParser(description=SCRIPT_DESCRIPTION)
222
+
223
+ parser.add_argument(
224
+ "--dataset_name", type=str, required=True, choices=DATASET_TO_GEN_AND_SIZE.keys(),
225
+ help="The name of the external (non-OpenX) dataset to encode; must be a key of DATASET_TO_GEN_AND_SIZE."
226
+ )
227
+ parser.add_argument(
228
+ "--data_split", type=str, choices=["train", "val"], required=True,
229
+ help="The split of the dataset to create."
230
+ )
231
+ parser.add_argument(
232
+ "--episode_cnt", type=int,
233
+ help="If specified, will limit the maximum number of trajectories to encode."
234
+ )
235
+ parser.add_argument(
236
+ "--original_res", action='store_true',
237
+ help="Maintain original resolution of the video rather than resizing it to 256x256."
238
+ )
239
+ parser.add_argument(
240
+ "--no_quantization", action='store_true',
241
+ help="Skip quantization step in visual encoder."
242
+ )
243
+ parser.add_argument(
244
+ "--num_shards", type=int, default=1,
245
+ help="The number of shards to partition the train/val dataset into."
246
+ )
247
+ parser.add_argument(
248
+ "--curr_shard_rank", type=int, default=0,
249
+ help="The (0-indexed) shard number to encode."
250
+ )
251
+ parser.add_argument(
252
+ "--root_dir", type=str, default="data",
253
+ help="The root directory to write all datasets to."
254
+ )
255
+ parser.add_argument(
256
+ "--encoder_type", type=str, default="magvit", choices=["magvit", "temporalvae"],
257
+ help="Type of the image tokenizer."
258
+ )
259
+ parser.add_argument(
260
+ "--encoder_name_or_path", type=str, default="data/magvit2.ckpt",
261
+ help="The path or name of the image encoder."
262
+ )
263
+ parser.add_argument(
264
+ "--no_encoding", action='store_true',
265
+ help="Preserve the groundtruth raw images to compute metrics in validation."
266
+ )
267
+ return parser.parse_args()
268
+
269
+
270
+ if __name__ == "__main__":
271
+ args = parse_args()
272
+ utils.set_seed(233)
273
+
274
+ dataset_postfix = f"shard{args.curr_shard_rank}_of_{args.num_shards}"
275
+ if args.episode_cnt is not None:
276
+ dataset_postfix = f"max{args.episode_cnt}_{dataset_postfix}"
277
+
278
+ encode_dataset_split(
279
+ extern_dataset_name=args.dataset_name,
280
+ split=args.data_split,
281
+ max_episodes=args.episode_cnt,
282
+ dataset_postfix=dataset_postfix,
283
+ original_res=args.original_res,
284
+ no_quantization=args.no_quantization,
285
+ num_shards=args.num_shards,
286
+ curr_shard_rank=args.curr_shard_rank,
287
+ root_dir=args.root_dir,
288
+ encoder_type=args.encoder_type,
289
+ encoder_name_or_path=args.encoder_name_or_path,
290
+ no_encoding=args.no_encoding,
291
+ )
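
To add another external dataset to `DATASET_TO_GEN_AND_SIZE`, the script above only needs a size function and a generator yielding episodes shaped like the OpenX ones. The sketch below shows the assumed interface with made-up `my_dataset_*` names and dummy arrays; it is not part of this commit.

```python
import numpy as np

def my_dataset_size() -> int:
    # total number of trajectories in the dataset
    return 100

def my_dataset_generator(example_inds=None):
    if example_inds is None:
        example_inds = range(my_dataset_size())
    for _ in example_inds:
        steps = []
        for _ in range(16):  # dummy 16-step trajectory
            steps.append({
                # select_image() picks observation keys containing "image" or "rgb"
                "observation": {"image": np.zeros((256, 256, 3), dtype=np.uint8)},
                "action": np.zeros(7, dtype=np.float32),
            })
        yield {"steps": steps}

# Registration would mirror the existing entries:
# DATASET_TO_GEN_AND_SIZE["my_dataset"] = (my_dataset_generator, my_dataset_size)
```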
datasets/encode_openx_dataset.py ADDED
@@ -0,0 +1,459 @@
1
+ # --------------------------------------------------------
2
+ # Licensed under The MIT License [see LICENSE for details]
3
+ # --------------------------------------------------------
4
+ import argparse
5
+ import json
6
+ import os
7
+ import time
8
+ import traceback
9
+ from typing import Optional
10
+
11
+ import math
12
+ import numpy as np
13
+ import tensorflow_datasets as tfds
14
+ from tensorflow_datasets.core import DatasetBuilder
15
+ from tqdm import tqdm
16
+
17
+ from . import utils
18
+
19
+
20
+ SCRIPT_DESCRIPTION="""
21
+ Converts an Open X-Embodiment dataset from GS to encoded/tokenized data on disk.
22
+ This script only encodes one split (specified by `--data_split`)
23
+ of a one OpenX dataset (specified by `--dataset_name`) at a time.
24
+
25
+ Optionally, each split can be partitioned into multiple shards,
26
+ which is useful for parallelized encoding across GPUs.
27
+
28
+ Example usage:
29
+ CUDA_VISIBLE_DEVICES=0 python -m datasets.encode_openx_dataset --dataset_name bc_z --data_split train --episode_cnt 500 --num_shards 16 --curr_shard_rank 0
30
+ CUDA_VISIBLE_DEVICES=1 python -m datasets.encode_openx_dataset --dataset_name bc_z --data_split train --episode_cnt 500 --num_shards 16 --curr_shard_rank 1
31
+
32
+ set -e
33
+ for ((i = 0; i < 64; i += 2)); do
34
+ CUDA_VISIBLE_DEVICES=0 python -m datasets.encode_openx_dataset --dataset_name bridge --data_split train --num_shards 64 --curr_shard_rank $i --root_dir sharded_data
35
+ done
36
+
37
+ set -e
38
+ for ((i = 1; i < 64; i += 2)); do
39
+ CUDA_VISIBLE_DEVICES=1 python -m datasets.encode_openx_dataset --dataset_name bridge --data_split train --num_shards 64 --curr_shard_rank $i --root_dir sharded_data
40
+ done
41
+
42
+ Example usage (SVD tokenizer):
43
+ CUDA_VISIBLE_DEVICES=0 python -m datasets.encode_openx_dataset --dataset_name language_table --data_split val --no_quantization --encoder_type temporalvae --encoder_name_or_path 'stabilityai/stable-video-diffusion-img2vid'
44
+ """.strip()
45
+
46
+ # The validation set is the first VAL_RATIO examples in the dataset, and clipped to [MIN_VAL_EXAMPLES, MAX_VAL_EXAMPLES]
47
+ VAL_RATIO = 0.05
48
+ MIN_VAL_EXAMPLES, MAX_VAL_EXAMPLES = 20, 200
49
+
50
+
51
+ DATA_FREQ_TABLE = {
52
+ "austin_sailor_dataset_converted_externally_to_rlds": 20,
53
+ "stanford_hydra_dataset_converted_externally_to_rlds": 10,
54
+ "austin_buds_dataset_converted_externally_to_rlds": 20,
55
+ "austin_sirius_dataset_converted_externally_to_rlds": 20,
56
+ "berkeley_mvp_converted_externally_to_rlds": 5,
57
+ "berkeley_rpt_converted_externally_to_rlds": 30,
58
+ "ucsd_kitchen_dataset_converted_externally_to_rlds": 2,
59
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
60
+ "utaustin_mutex": 20,
61
+ "imperialcollege_sawyer_wrist_cam": 10,
62
+ "language_table": 2, # changed to match frequency
63
+ "kuka": 2, # changed to match frequency
64
+ "bc_z": 10,
65
+ "robo_net": 1,
66
+ "dlr_sara_pour_converted_externally_to_rlds": 10,
67
+ "stanford_robocook_converted_externally_to_rlds": 5,
68
+ "cmu_play_fusion": 5,
69
+ "bridge": 5,
70
+ "furniture_bench_dataset_converted_externally_to_rlds": 10,
71
+ "ucsd_pick_and_place_dataset_converted_externally_to_rlds": 3,
72
+ "usc_cloth_sim_converted_externally_to_rlds": 10,
73
+ "stanford_kuka_multimodal_dataset_converted_externally_to_rlds": 20,
74
+ "roboturk": 10,
75
+ "kaist_nonprehensile_converted_externally_to_rlds": 10,
76
+ "asu_table_top_converted_externally_to_rlds": 12,
77
+ "utokyo_xarm_pick_and_place_converted_externally_to_rlds": 10,
78
+ "berkeley_cable_routing": 10,
79
+ "droid": 15,
80
+ "uiuc_d3field": 1,
81
+ "robo_set": 5,
82
+ "toto": 30,
83
+ "nyu_door_opening_surprising_effectiveness": 3,
84
+ "nyu_franka_play_dataset_converted_externally_to_rlds": 3,
85
+ "mimic_play": 15,
86
+ "maniskill_dataset_converted_externally_to_rlds": 20,
87
+ "columbia_cairlab_pusht_real": 10,
88
+ "conq_hose_manipulation": 30,
89
+ "dlr_edan_shared_control_converted_externally_to_rlds": 5,
90
+ "berkeley_gnm_sac_son": 10,
91
+ "berkeley_autolab_ur5": 5,
92
+ "aloha_mobile": 30,
93
+ "1x_humanoid": 30,
94
+ "epic_kitchen_originalres": 30,
95
+ "epic_kitchen": 30,
96
+ "egoexo4d": 30,
97
+ "ego4d": 1, # less than this.
98
+ "robomimic": 6, # average length around 50
99
+ "metaworld": 6,
100
+ "frodobot": 30,
101
+ "fractal20220817_data": 3,
102
+ # robomimic variants
104
+ "robomimic_new": 6, # average length around 50
105
+ "robomimic_multitask_new": 6, # average length around 50
106
+ "robomimic_new_perturb": 6, # average length around 50
107
+ "robomimic_multitask_new_perturb": 6, # average length around 50
108
+ }
109
+
110
+
111
+
112
+ def select_image(observation, verbose=False):
113
+ """
114
+ Select a canonical frame as image observation.
115
+ """
116
+ imgs = []
117
+ # does not need to prefer wrist camera
118
+ for key in ["rgb", "image"]:
119
+ for obs_key in observation:
120
+ if key in obs_key and "depth" not in obs_key:
121
+ image = observation[obs_key]
122
+ if type(observation[obs_key]) is not np.ndarray:
123
+ image = image.numpy()
124
+ if verbose:
125
+ print("selected image key:", obs_key)
126
+ imgs.append(image)
127
+
128
+ return imgs
129
+
130
+
131
+ def process_dataset_step(step, encoder_type: str, encoder_name_or_path: str,
132
+ keep_res=False, quantize=True, no_encoding=False):
133
+ """
134
+ Map dataset-specific keys and values to a unified format.
135
+
136
+ Args:
137
+ step (dict): The step dictionary containing the dataset-specific information.
138
+ encoder_type (str, optional): The image encoder to use.
139
+ Returns:
140
+ dict: The processed step dictionary with the mapped keys and values.
141
+ """
142
+ step_dict = {}
143
+ try:
144
+ if "action" in step:
145
+ step_dict["action"] = np.array(step["action"])
146
+
147
+ # handle action
148
+ if type(step["action"]) is dict:
149
+ step_dict["action"] = step_dict["action"].item()
150
+
151
+ # outlier cases
152
+ action = []
153
+ for k, v in sorted(step_dict["action"].items()):
154
+ action.append(v.numpy().reshape(-1))
155
+ step_dict["action"] = np.concatenate(action)
156
+
157
+ # handle image
158
+ images = select_image(step["observation"])
159
+
160
+ # compute the embeddings.
161
+ if no_encoding:
162
+ step_dict["image"] = utils.resize_image(images[0])
163
+ elif quantize:
164
+ step_dict["image"] = utils.get_quantized_image_embeddings(
165
+ images[0],
166
+ encoder_type=encoder_type,
167
+ encoder_name_or_path=encoder_name_or_path,
168
+ keep_res=keep_res,
169
+ )
170
+ else:
171
+ step_dict["image"] = utils.get_vae_image_embeddings(
172
+ images[0],
173
+ encoder_type=encoder_type,
174
+ encoder_name_or_path=encoder_name_or_path,
175
+ keep_res=keep_res,
176
+ )
177
+ except Exception as e:
178
+ print("--------------------------")
179
+ print("process_dataset_step exception:", traceback.format_exc())
180
+
181
+ return step_dict
182
+
183
+
184
+ def get_dataset_builder(gs_dataset_name) -> tuple[DatasetBuilder, int]:
185
+ """
186
+ Returns the dataset builder and the total number of examples (for the train split).
187
+ """
188
+ try:
189
+ builder = tfds.builder_from_directory(builder_dir=f"gs://gresearch/robotics/{gs_dataset_name}/0.1.0/")
190
+ except:
191
+ try:
192
+ builder = tfds.builder_from_directory(builder_dir=f"gs://gresearch/robotics/{gs_dataset_name}/1.0.0/")
193
+ except:
194
+ builder = tfds.builder_from_directory(builder_dir=f"gs://gresearch/robotics/{gs_dataset_name}/0.0.1/")
195
+
196
+ info = builder.info
197
+ num_examples = info.splits["train"].num_examples
198
+
199
+ return builder, num_examples
200
+
201
+
202
+ def get_shard_inds(first_split_ind: int, last_split_ind: int, curr_shard_rank: int, num_shards: int) -> tuple[int, int]:
203
+ """
204
+ Given the indices of the first (inclusive) and last (exclusive) examples in the data split (i.e. entire train dataset or val dataset),
205
+ returns the indices of the first (inclusive) and last (exclusive) examples for the current shard in this data split.
206
+ """
207
+ split_num_examples = last_split_ind - first_split_ind
208
+ shard_size_float = split_num_examples / num_shards # average number of examples per shard
209
+ return (
210
+ first_split_ind + math.ceil(curr_shard_rank * shard_size_float),
211
+ min(first_split_ind + math.ceil((curr_shard_rank + 1) * shard_size_float), last_split_ind)
212
+ )
213
+
214
+
215
+ def encode_dataset_split(
216
+ gs_dataset_name: str,
217
+ split: str,
218
+ max_episodes: Optional[int],
219
+ original_res: bool,
220
+ no_quantization: bool,
221
+ curr_shard_rank: int,
222
+ num_shards: int,
223
+ root_dir: str,
224
+ encoder_type: str,
225
+ encoder_name_or_path: str,
226
+ dataset_postfix: str = "",
227
+ no_encoding: bool = False,
228
+ ):
229
+ """
230
+ Converts an Open X-Embodiment dataset from GS to encoded/tokenized data on disk.
231
+ The data written to disk can be used to load a `RawTokenDataset` (or the continuous version.)
232
+
233
+ Args:
234
+ gs_dataset_name: the name of the dataset in Google Storage.
235
+ Can be checked with gsutil ls -d gs://gresearch/robotics/*/
236
+ split: expected to be either "train" or "val". TODO: decide how to split
237
+ max_episodes: the maximum number of trajectories to include in the dataset.
238
+ dataset_postfix: will be a suffix of the output dirname.
239
+ encoder_type: string specifying the type of image encoder/tokenizer to use.
240
+ original_res: if True, will maintain original resolution of the video rather than resizing it to 256x256.
241
+ no_quantization: if True, will not perform quantization step in image encoder.
242
+ """
243
+ gs_dataset_name = gs_dataset_name.strip() # never modified
244
+ suffixed_dataset_name = gs_dataset_name # will modify later
245
+ if no_quantization:
246
+ video_dtype = np.float16
247
+ elif no_encoding:
248
+ video_dtype = np.uint8
249
+ else:
250
+ video_dtype = np.uint32
251
+ if original_res:
252
+ suffixed_dataset_name = f"{suffixed_dataset_name}_originalres"
253
+ if no_quantization:
254
+ suffixed_dataset_name = f"{suffixed_dataset_name}_noquant"
255
+ if no_encoding:
256
+ suffixed_dataset_name = f"{suffixed_dataset_name}_noencoding"
257
+ save_dirname = "_".join([suffixed_dataset_name, encoder_type, dataset_postfix, split])
258
+ dataset_path = os.path.join(root_dir, save_dirname)
259
+ print("=" * 25)
260
+ print(f"{dataset_path=}")
261
+ utils.mkdir_if_missing(dataset_path)
262
+
263
+ # Load data
264
+ builder, num_examples = get_dataset_builder(gs_dataset_name)
265
+ if max_episodes is not None:
266
+ num_examples = min(num_examples, max_episodes) # clip num_examples
267
+
268
+ # We will only operate on a subset of the training examples, depending on:
269
+ # 1) The split (train/val). Some examples are reserved for the other split.
270
+ # 2) Sharding
271
+ assert num_examples > MIN_VAL_EXAMPLES, f"{num_examples=} {MIN_VAL_EXAMPLES=}" # non-positive number of train examples otherwise
272
+ num_val_examples = np.clip(int(VAL_RATIO * num_examples), MIN_VAL_EXAMPLES, MAX_VAL_EXAMPLES)
273
+
274
+ if split == "train": # first_ind inclusive, last_ind exclusive
275
+ first_split_ind, last_split_ind = num_val_examples, num_examples
276
+ elif split == "val":
277
+ first_split_ind, last_split_ind = 0, num_val_examples
278
+ else:
279
+ raise NotImplementedError(f"{split=}")
280
+
281
+ first_shard_ind, last_shard_ind = get_shard_inds(first_split_ind, last_split_ind, curr_shard_rank, num_shards)
282
+ print(f"Total number of examples in {suffixed_dataset_name}: {num_examples}")
283
+ print(f"Number of examples for {split=}, shard {curr_shard_rank} of {num_shards}: "
284
+ f"{last_shard_ind - first_shard_ind}. {first_shard_ind=} {last_shard_ind=}")
285
+
286
+ ##### Encode data #####
287
+ traj_lens = [] # only used to print statistics
288
+ videos = [] # NOTE: videos/actions for the entire shard are stored in RAM until the end
289
+ actions = []
290
+ segment_ids = []
291
+
292
+ # split based on some fixed batch sizes to reset RAM.
293
+ max_batch_per_loading = 10
294
+ pbar = tqdm(range(first_shard_ind, last_shard_ind, max_batch_per_loading), position=0, leave=True)
295
+ start_time = time.time()
296
+
297
+ for start_idx in pbar:
298
+ end_idx = min(start_idx + max_batch_per_loading, last_shard_ind)
299
+ pbar.set_description(f"{suffixed_dataset_name} caching episodes: {start_idx}:{end_idx}")
300
+ ds = builder.as_dataset(split=f"train[{start_idx}:{end_idx}]")
301
+
302
+ for chunk_idx, episode in enumerate(tqdm(ds, position=1, leave=False)):
303
+ segment_id = start_idx + chunk_idx
304
+ try:
305
+ # batchify the data and then process
306
+ for step_ind, step_data in enumerate(episode["steps"]):
307
+ dataset_step = process_dataset_step(
308
+ step_data,
309
+ encoder_type=encoder_type,
310
+ encoder_name_or_path=encoder_name_or_path,
311
+ keep_res=original_res,
312
+ quantize=not no_quantization,
313
+ no_encoding=no_encoding
314
+ )
315
+
316
+ segment_ids.append(segment_id)
317
+ videos.append(dataset_step["image"])
318
+ actions.append(dataset_step["action"])
319
+
320
+ traj_lens.append(step_ind + 1) # number of steps in this trajectory
321
+ except Exception:
322
+ print("-" * 25)
323
+ print(f"Add episode failed: {segment_id=}", traceback.format_exc(), suffixed_dataset_name)
324
+
325
+ # 2 day timeout
326
+ if time.time() - start_time > 86400 * 2:
327
+ print(f"Writing dataset {suffixed_dataset_name} timed out")
328
+ break
329
+
330
+ if no_quantization:
331
+ num_channels, height, width = videos[-1].shape[:3]
332
+ else:
333
+ height, width = videos[-1].shape[:2]
334
+ num_channels = None
335
+
336
+ ##### Write videos, actions, segment_ids, and metadata #####
337
+ # align format to save segment_ids.bin, video.bin, actions/action.bin, metadata.json
338
+ # save videos
339
+ videos = np.stack(videos, axis=0)
340
+ fp = np.memmap(f'{dataset_path}/video.bin', dtype=video_dtype, mode='w+', shape=videos.shape)
341
+ fp[:] = videos[:]
342
+
343
+ # save action
344
+ utils.mkdir_if_missing(f'{dataset_path}/actions')
345
+ actions = np.stack(actions, axis=0)
346
+ fp = np.memmap(f'{dataset_path}/actions/actions.bin', dtype=np.float32, mode='w+', shape=actions.shape)
347
+ fp[:] = actions[:]
348
+
349
+ # save segment_ids
350
+ segment_ids = np.array(segment_ids)
351
+ fp = np.memmap(f'{dataset_path}/segment_ids.bin', dtype=np.int32, mode='w+', shape=segment_ids.shape)
352
+ fp[:] = segment_ids[:] # map to trajectory index
353
+
354
+ # feature_mean = float(np.mean(videos))
355
+ # feature_std = float(np.std((videos - feature_mean) / 1e9)) * 1e9
356
+ # save metadata
357
+ if encoder_type == "magvit":
358
+ vocab_size = int(2 ** 18)
359
+ elif encoder_type == "temporalvae":
360
+ vocab_size = None
361
+ else:
362
+ raise NotImplementedError(f"{encoder_type=}")
363
+
364
+ with open(f'{dataset_path}/metadata.json', 'w') as f: # Technically only need to save most of this data for shard 0
365
+ json.dump({
366
+ "token_dtype": str(np.dtype(videos.dtype)),
367
+ "action_dim": actions[0].shape[-1],
368
+ "s": 16,
369
+ "h": height,
370
+ "w": width,
371
+ "vocab_size": vocab_size,
372
+ "hz": DATA_FREQ_TABLE.get(gs_dataset_name, 1), # to be loaded from the data code TODO: remove default?
373
+ "encoder_name_or_path": encoder_name_or_path,
374
+ "encoder_type": encoder_type,
375
+ "num_images": len(videos),
376
+ "name": gs_dataset_name,
377
+ "latent_channels": num_channels,
378
+ "quantized": not no_quantization,
379
+ # "feature_mean": feature_mean,
380
+ # "feature_std": feature_std,
381
+ }, f)
382
+
383
+ print(f"{len(traj_lens)=} {np.mean(traj_lens)=} {np.sum(traj_lens)=}")
384
+ print(f"Dataset creation time: {time.time() - start_time:.3f}")
385
+
386
+
387
+ def parse_args():
388
+ parser = argparse.ArgumentParser(description=SCRIPT_DESCRIPTION)
389
+
390
+ parser.add_argument(
391
+ "--dataset_name", type=str, required=True,
392
+ help="The name of the Open X-Embodiment dataset on Google Storage. "
393
+ "Can be checked with gsutil ls -d gs://gresearch/robotics/*/. "
394
+ )
395
+ parser.add_argument(
396
+ "--data_split", type=str, choices=["train", "val"], required=True,
397
+ help="The split of the dataset to create."
398
+ )
399
+ parser.add_argument(
400
+ "--episode_cnt", type=int,
401
+ help="If specified, will limit the maximum number of trajectories to encode."
402
+ )
403
+ parser.add_argument(
404
+ "--original_res", action='store_true',
405
+ help="Maintain original resolution of the video rather than resizing it to 256x256."
406
+ )
407
+ parser.add_argument(
408
+ "--no_quantization", action='store_true',
409
+ help="Skip quantization step in visual encoder."
410
+ )
411
+ parser.add_argument(
412
+ "--num_shards", type=int, default=1,
413
+ help="The number of shards to partition the train/val dataset into."
414
+ )
415
+ parser.add_argument(
416
+ "--curr_shard_rank", type=int, default=0,
417
+ help="The (0-indexed) shard number to encode."
418
+ )
419
+ parser.add_argument(
420
+ "--root_dir", type=str, default="data",
421
+ help="The root directory to write all datasets to."
422
+ )
423
+ parser.add_argument(
424
+ "--encoder_type", type=str, default="magvit", choices=["magvit", "temporalvae"],
425
+ help="Type of the image tokenizer."
426
+ )
427
+ parser.add_argument(
428
+ "--encoder_name_or_path", type=str, default="data/magvit2.ckpt",
429
+ help="The path or name of the image encoder."
430
+ )
431
+ parser.add_argument(
432
+ "--no_encoding", action='store_true',
433
+ help="Preserve the groundtruth raw images to compute metrics in validation."
434
+ )
435
+ return parser.parse_args()
436
+
437
+
438
+ if __name__ == "__main__":
439
+ args = parse_args()
440
+ utils.set_seed(233)
441
+
442
+ dataset_postfix = f"shard{args.curr_shard_rank}_of_{args.num_shards}" if args.num_shards > 1 else ""
443
+ if args.episode_cnt is not None:
444
+ dataset_postfix = f"max{args.episode_cnt}_{dataset_postfix}" if dataset_postfix else f"max{args.episode_cnt}"
445
+
446
+ encode_dataset_split(
447
+ gs_dataset_name=args.dataset_name,
448
+ split=args.data_split,
449
+ max_episodes=args.episode_cnt,
450
+ dataset_postfix=dataset_postfix,
451
+ original_res=args.original_res,
452
+ no_quantization=args.no_quantization,
453
+ num_shards=args.num_shards,
454
+ curr_shard_rank=args.curr_shard_rank,
455
+ root_dir=args.root_dir,
456
+ encoder_type=args.encoder_type,
457
+ encoder_name_or_path=args.encoder_name_or_path,
458
+ no_encoding=args.no_encoding,
459
+ )
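
A quick sanity check of the sharding arithmetic in `get_shard_inds` above: with 1000 training examples left after reserving the first 50 for validation (5% of the data, within the [20, 200] clip), three shards cover contiguous, non-overlapping index ranges. The numbers are illustrative, and the import assumes the repo's TFDS dependencies are installed.

```python
from datasets.encode_openx_dataset import get_shard_inds

first_split_ind, last_split_ind = 50, 1050   # train split after reserving 50 val examples
for shard in range(3):
    print(shard, get_shard_inds(first_split_ind, last_split_ind, shard, num_shards=3))
# 0 (50, 384)
# 1 (384, 717)
# 2 (717, 1050)
```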
datasets/extern/__init__.py ADDED
File without changes
datasets/extern/ego4d.py ADDED
@@ -0,0 +1,193 @@
1
+ # --------------------------------------------------------
2
+ # Licensed under The MIT License [see LICENSE for details]
3
+ # --------------------------------------------------------
4
+ import os
5
+ from typing import Iterable
6
+
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+ from collections import OrderedDict
12
+ from pathlib import Path
13
+
14
+ CURRENT_DIR = os.path.dirname(__file__)
15
+ import cv2
16
+ from os.path import expanduser
17
+ import json
18
+ import matplotlib.pyplot as plt
19
+
20
+ RESOLUTION = (480, 480)
21
+ home = expanduser("~")
22
+
23
+ # Adjust these to the where-ever your detections and frames are stored.
24
+ ROOT = "/datasets01/ego4d_track2/"
25
+ LABEL_ROOT = ROOT + "v2_1/annotations/fho_main.json"
26
+ VIDEO_PATH = ROOT + "v2_1/full_scale/"
27
+ # from epic_kitchens.hoa import load_detections
28
+
29
+ # labels = json.load(open("/datasets01/ego4d_track2/v2_1/annotations/fho_main.json"))
30
+ # videos = /datasets01/ego4d_track2/v2_1/clips
31
+ def parse_video_frame(video_path, frame_id):
32
+ cap = cv2.VideoCapture(video_path)
33
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id-1)
34
+ ret, frame = cap.read()
35
+ return frame
36
+
37
+ def parse_raw_video(video_path):
38
+ cap = cv2.VideoCapture(video_path)
39
+ frames = []
40
+ while cap.isOpened():
41
+ ret, frame = cap.read()
42
+ if not ret:
43
+ break
44
+ frames.append(frame)
45
+ return frames
46
+
47
+ def compute_state_and_actions(image, curr_frame, next_frame, frame_idx, save=False):
48
+ # curr_frame is a list of bounding box labels
49
+ img_width, img_height = image.shape[1], image.shape[0]
50
+ for box in curr_frame:
51
+ if box['object_type'] == 'left_hand':
52
+ curr_hand1_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
53
+
54
+ if box['object_type'] == 'right_hand':
55
+ curr_hand2_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
56
+
57
+ for box in next_frame:
58
+ if box['object_type'] == 'left_hand':
59
+ next_hand1_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
60
+
61
+ if box['object_type'] == 'right_hand':
62
+ next_hand2_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
63
+
64
+ # normalized them
65
+ curr_hand1_center = np.array([curr_hand1_center[0] / img_width, curr_hand1_center[1] / img_height])
66
+ curr_hand2_center = np.array([curr_hand2_center[0] / img_width, curr_hand2_center[1] / img_height])
67
+
68
+ # normalize them
69
+ next_hand1_center = np.array([next_hand1_center[0] / img_width, next_hand1_center[1] / img_height])
70
+ next_hand2_center = np.array([next_hand2_center[0] / img_width, next_hand2_center[1] / img_height])
71
+
72
+ state = np.concatenate((curr_hand1_center, curr_hand2_center)) # - np.array(curr_hand1_center) - np.array(curr_hand2_center)
73
+ action = np.concatenate(
74
+ (
75
+ np.array(next_hand1_center),
76
+ np.array(next_hand2_center),
77
+ )
78
+ )
79
+ if save:
80
+ # draw the bounding boxes
81
+ cv2.circle(image, (int(curr_hand1_center[0] * img_width), int(curr_hand1_center[1] * img_height)), 10, (0, 255, 0), -1)
82
+ cv2.circle(image, (int(curr_hand2_center[0] * img_width), int(curr_hand2_center[1] * img_height)), 10, (0, 255, 0), -1)
83
+ cv2.circle(image, (int(next_hand1_center[0] * img_width), int(next_hand1_center[1] * img_height)), 10, (0, 0, 255), -1)
84
+ cv2.circle(image, (int(next_hand2_center[0] * img_width), int(next_hand2_center[1] * img_height)), 10, (0, 0, 255), -1)
85
+ # save the image
86
+ cv2.imwrite(f"/private/home/xinleic/LR/hpt_video/data/ego4d_video_label_check/img_{frame_idx}.png", image)
87
+ return state, action
88
+
100
+
101
+ def chunk_actions_and_concatenate(actions):
102
+ chunk_size = 4
103
+ chunked_actions = [actions[i:i + chunk_size] for i in range(0, len(actions), chunk_size)][:-1]
104
+ concatenated_frames = []
105
+
106
+ for chunk in chunked_actions:
107
+ frames_to_concat = []
108
+ for action in chunk:
109
+ frames = action['frames'] # Assuming 'frames' is a list or iterable
110
+ if frames is not None:
111
+ frames_to_concat.extend(frames) # Collect frames from each action
112
+ concatenated_frames.append(frames_to_concat) # Store the concatenated frames for this chunk
113
+
114
+ return concatenated_frames
115
+
116
+
117
+ def ego4d_dataset_size() -> int:
118
+ """ Returns the number of trajectories in the dataset. ~1725 for Ego4D. """
119
+ labels = json.load(open(LABEL_ROOT))
120
+ return len(labels['videos'])
121
+
122
+
123
+ # define your own dataset conversion
124
+ def ego4d_dataset_generator(example_inds: Iterable[int] = None):
125
+ """
126
+ Generator yielding data from Ego4D.
127
+ Args:
128
+ example_inds: if specified, will only yield data from these indices.
129
+ Otherwise, will default to yielding the entire dataset.
130
+ """
131
+ # convert to a list of episodes that can be added to replay buffer
132
+ labels = json.load(open(LABEL_ROOT))
133
+
134
+ if example_inds is None:
135
+ example_inds = range(len(labels['videos']))
136
+
137
+ for example_ind in example_inds:
138
+ label = labels['videos'][example_ind]
139
+ # ['annotated_intervals'][2]['narrated_actions']
140
+ video_path = VIDEO_PATH + label['video_uid'] + ".mp4"
141
+ if not os.path.exists(video_path):
142
+ print("skip", video_path)
143
+ continue
144
+
145
+ label_detections = labels
146
+ print("video_path:", video_path)
147
+ print("len label detections", len(label_detections))
148
+
149
+ # action extractions over bounding boxes subtractions of both hands.
150
+ for interval in label['annotated_intervals']:
151
+ # print(video_detections[frame_idx].hands)
152
+
153
+ lang = "use human hands to do some tasks" # dummies
154
+ # import IPython; IPython.embed()
155
+ print(f"Interval [{interval['start_sec']} - {interval['end_sec']}]")
156
+ actions = list(filter(lambda x: not (x['is_invalid_annotation'] or x['is_rejected']) and x['stage'] is not None, interval['narrated_actions']))
157
+ print(f"Actions: {len(actions)}")
158
+
159
+ # because we need to concatenate
160
+ if len(actions) < 3:
161
+ continue
162
+
163
+ # the number of frames is usually 7 and it also does not follow strict 2hz
164
+ chunk_actions = chunk_actions_and_concatenate(actions)
165
+ for frame_idx, frames in enumerate(chunk_actions):
166
+ # lang = frame['narration_text']
167
+ steps = []
168
+ # need to use dummy actions to expand from 6 frames to 16 frames
169
+ for idx, frame in enumerate(frames[:-1]):
170
+ frame_id = frame['frame_number']
171
+ next_frame = frames[idx + 1]
172
+ image = parse_video_frame(video_path, frame_id)
173
+
174
+ if len(frame['boxes']) > 2 and len(next_frame['boxes']) > 2:
175
+ try:
176
+ s, a = compute_state_and_actions(image, frame['boxes'], next_frame['boxes'], idx, save=False)
177
+ except Exception:
178
+ print(f'compute action failed idx {idx} frame idx {frame_idx}')
179
+ continue
180
+ # break into step dict
181
+ step = {
182
+ "observation": {"image": image, "state": s},
183
+ "action": a,
184
+ "language_instruction": lang,
185
+ }
186
+ steps.append(OrderedDict(step))
187
+
188
+ if len(steps) < 16:
189
+ print("skip this traj because frame window length < 16")
190
+ continue
191
+ data_dict = {"steps": steps}
192
+ yield data_dict
193
+
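The extern converters in this commit share one contract: a `*_dataset_size()` helper plus a generator that yields episodes as `{"steps": [...]}`, where each step carries an image observation, a low-dimensional state, an action, and a language string. A minimal consumer sketch under that assumption (the loop and printing below are illustrative, not code from this commit, and require the Ego4D paths configured above to exist):

```python
# Illustrative consumer of the episode-generator contract used by the extern converters.
from datasets.extern.ego4d import ego4d_dataset_size, ego4d_dataset_generator

def summarize(generator, max_episodes=2):
    """Iterate a few episodes and report their lengths and array shapes."""
    for ep_idx, episode in enumerate(generator):
        steps = episode["steps"]
        first = steps[0]
        print(
            f"episode {ep_idx}: {len(steps)} steps, "
            f"image {first['observation']['image'].shape}, "
            f"state {first['observation']['state'].shape}, "
            f"action {first['action'].shape}"
        )
        if ep_idx + 1 >= max_episodes:
            break

if __name__ == "__main__":
    print("total trajectories:", ego4d_dataset_size())
    summarize(ego4d_dataset_generator(example_inds=range(2)))
```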
datasets/extern/egoexo4d.py ADDED
@@ -0,0 +1,186 @@
1
+ # --------------------------------------------------------
2
+ # Licensed under The MIT License [see LICENSE for details]
3
+ # --------------------------------------------------------
4
+ # https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes/blob/master/notebooks/demo.ipynb
5
+ import os
6
+ from typing import Iterable
7
+
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+ from collections import OrderedDict
11
+ import os
12
+ import numpy as np
13
+ from pathlib import Path
14
+
15
+
16
+ CURRENT_DIR = os.path.dirname(__file__)
17
+ import cv2
18
+ from os.path import expanduser
19
+ import json
20
+
21
+
22
+ # Adjust these to the where-ever your detections and frames are stored.
23
+ CAM = "cam01" # cam01
24
+ ROOT = "/datasets01/egoexo4d/v2/"
25
+ LABEL_ROOT = ROOT + "annotations/ego_pose/train/hand/automatic/{}.json"
26
+ VIDEO_PATH = ROOT + "takes/{}/frame_aligned_videos/{}.mp4"
27
+ # from epic_kitchens.hoa import load_detections
28
+ TAKE_ROOT = ROOT + "takes.json"
29
+
30
+
31
+
32
+ def compute_state_and_actions(image, curr_frame, next_frame, idx, save=False):
33
+ img_width, img_height = image.shape[1], image.shape[0]
34
+
35
+ # raw 2D wrist keypoints (pixel coordinates) from the annotation
36
+ curr_hand1_center = curr_frame[0]['annotation2D'][CAM]['left_wrist']
37
+ curr_hand2_center = curr_frame[0]['annotation2D'][CAM]['right_wrist']
38
+
39
+ # normalize them to [0, 1]
40
+ curr_hand1_center = np.array([curr_hand1_center['x'] / img_width, curr_hand1_center['y'] / img_height])
41
+ curr_hand2_center = np.array([curr_hand2_center['x'] / img_width, curr_hand2_center['y'] / img_height])
42
+
43
+ next_hand1_center = next_frame[0]['annotation2D'][CAM]['left_wrist']
44
+ next_hand2_center = next_frame[0]['annotation2D'][CAM]['right_wrist']
45
+
46
+ # normalize them
47
+ next_hand1_center = np.array([next_hand1_center['x'] / img_width, next_hand1_center['y'] / img_height])
48
+ next_hand2_center = np.array([next_hand2_center['x'] / img_width, next_hand2_center['y'] / img_height])
49
+
50
+
51
+ state = np.concatenate((curr_hand1_center, curr_hand2_center)) # - np.array(curr_hand1_center) - np.array(curr_hand2_center)
52
+ action = np.concatenate(
53
+ (
54
+ np.array(next_hand1_center),
55
+ np.array(next_hand2_center),
56
+ )
57
+ )
58
+ if save:
59
+ # draw the bounding boxes
60
+ cv2.circle(image, (int(curr_hand1_center[0] * img_width), int(curr_hand1_center[1] * img_height)), 10, (0, 255, 0), -1)
61
+ cv2.circle(image, (int(curr_hand2_center[0] * img_width), int(curr_hand2_center[1] * img_height)), 10, (0, 255, 0), -1)
62
+ cv2.circle(image, (int(next_hand1_center[0] * img_width), int(next_hand1_center[1] * img_height)), 10, (0, 0, 255), -1)
63
+ cv2.circle(image, (int(next_hand2_center[0] * img_width), int(next_hand2_center[1] * img_height)), 10, (0, 0, 255), -1)
64
+ # save the image
65
+ cv2.imwrite(f"output/inspect/test_{idx}.png", image)
66
+ return state, action
67
+
68
+
69
+ def parse_raw_video(video_path):
70
+ import cv2
71
+ cap = cv2.VideoCapture(video_path)
72
+ frames = []
73
+ while cap.isOpened():
74
+ ret, frame = cap.read()
75
+ if not ret:
76
+ break
77
+ frames.append(frame)
78
+ return frames
79
+
80
+ def egoexo4d_dataset_size() -> int:
81
+ """ Returns the number of takes in the dataset. ~5k for v2. """
82
+ takes = json.load(open(TAKE_ROOT))
83
+ return len(takes)
84
+
85
+
86
+ # define your own dataset conversion
87
+ def egoexo4d_dataset_generator(example_inds: Iterable[int] = None):
88
+ """
89
+ Generator yielding data from Ego-Exo4D.
90
+ Args:
91
+ example_inds: if specified, will only yield data from these indices.
92
+ Otherwise, will default to yielding the entire dataset.
93
+ """
94
+ # convert to a list of episodes that can be added to replay buffer
95
+ MAX_EPISODE_LENGTH = 5000
96
+ TAKE_FILE = json.load(open(TAKE_ROOT))
97
+ print("total takes", len(TAKE_FILE))
98
+ # find the first camera with aria
99
+ global CAM
100
+
101
+ def find_aria_name(take):
102
+ for cam in take['cameras']:
103
+ if 'aria' in cam['name']:
104
+ return cam['name']
105
+ return None
106
+
107
+ if example_inds is None:
108
+ example_inds = range(len(TAKE_FILE))
109
+
110
+ for example_ind in example_inds:
111
+ take = TAKE_FILE[example_ind]
112
+ take_name = take['take_name']
113
+ take_uid = take['take_uid']
114
+ # CAM = find_aria_name(take)
115
+ # if CAM is None:
116
+ # continue
117
+
118
+ video_path = VIDEO_PATH.format(take_name, CAM)
119
+ label_path = LABEL_ROOT.format(take_uid)
120
+
121
+ if not os.path.exists(video_path) or not os.path.exists(label_path):
122
+ continue
123
+
124
+ video_frames = parse_raw_video(video_path)
125
+ label_detections = json.load(open(label_path))
126
+ print("video_path:", video_path)
127
+ print("len video frames", len(video_frames))
128
+ print("len label detections", len(label_detections))
129
+
130
+ # actions are derived from the 2D wrist keypoints of both hands across consecutive frames.
131
+ max_frame_idx = len(video_frames) - 1
132
+ DS_FACTOR = 1
133
+ frame_idx = 0
134
+ start_frame_idx = 0
135
+ MIN_CLIP_LENGTH = 300
136
+
137
+ def get_continuous_chunk(start_idx, label_detections):
138
+ end_idx = start_idx + 1
139
+ while str(start_idx) in label_detections and len(label_detections[str(start_idx)]) > 0 and str(end_idx) in label_detections and len(label_detections[str(end_idx)]) > 0:
140
+ end_idx += 1
141
+ return end_idx
142
+
143
+ print("TAKE", take_name)
144
+
145
+ # some frames might not have label. if there is a gap, skip
146
+ while start_frame_idx < max_frame_idx - DS_FACTOR:
147
+ # print(video_detections[frame_idx].hands)
148
+ lang = "use human hands to do some tasks" # dummies
149
+ if str(start_frame_idx) not in label_detections or str(start_frame_idx + DS_FACTOR) not in label_detections:
150
+ start_frame_idx += DS_FACTOR
151
+ continue
152
+
153
+ end_frame_idx = get_continuous_chunk(start_frame_idx, label_detections)
154
+ # print("start_frame_idx", start_frame_idx, end_frame_idx)
155
+
156
+ if end_frame_idx - start_frame_idx < MIN_CLIP_LENGTH:
157
+ start_frame_idx = end_frame_idx
158
+ continue
159
+
160
+ print("start clipping from", start_frame_idx, "to", end_frame_idx)
161
+ steps = []
162
+ for frame_idx in range(start_frame_idx, end_frame_idx - DS_FACTOR, DS_FACTOR):
163
+ image = video_frames[frame_idx][...,[2,1,0]] # RGB
164
+ try:
165
+ s, a = compute_state_and_actions(
166
+ image,
167
+ label_detections[str(frame_idx)], label_detections[str(frame_idx + DS_FACTOR)],
168
+ frame_idx, save=False
169
+ )
170
+ except Exception:
171
+ break
172
+ # break into step dict
173
+ step = {
174
+ "observation": {"image": image, "state": s},
175
+ "action": a,
176
+ "language_instruction": lang,
177
+ }
178
+ steps.append(OrderedDict(step))
179
+ if len(steps) > MAX_EPISODE_LENGTH:
180
+ break
181
+
182
+ start_frame_idx = end_frame_idx
183
+ if len(steps) >= MIN_CLIP_LENGTH:  # only yield clips that reached the minimum length
184
+ data_dict = {"steps": steps}
185
+ print(f"max_frame_idx: {max_frame_idx} ds factor: {DS_FACTOR} {len(steps)}")
186
+ yield data_dict
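The gap handling above (`get_continuous_chunk`) only keeps runs of frame indices whose every frame has a non-empty hand annotation. A standalone illustration of that rule with a hypothetical helper name (`continuous_chunks` is not part of this file):

```python
# Toy illustration of the continuous-chunk rule used by the Ego-Exo4D converter.
def continuous_chunks(label_detections, num_frames, min_len=3):
    """Yield (start, end) index ranges whose every frame has annotations."""
    def has_label(i):
        return str(i) in label_detections and len(label_detections[str(i)]) > 0

    start = 0
    while start < num_frames:
        if not has_label(start):
            start += 1
            continue
        end = start
        while end < num_frames and has_label(end):
            end += 1
        if end - start >= min_len:
            yield start, end
        start = end

labels = {"0": ["hand"], "1": ["hand"], "2": [], "3": ["hand"], "4": ["hand"], "5": ["hand"]}
print(list(continuous_chunks(labels, num_frames=6)))  # [(3, 6)]: the 0-1 run is too short, index 2 has no labels
```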
datasets/extern/epic_kitchen.py ADDED
@@ -0,0 +1,115 @@
1
+ # --------------------------------------------------------
2
+ # Licensed under The MIT License [see LICENSE for details]
3
+ # --------------------------------------------------------
4
+ # https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes/blob/master/notebooks/demo.ipynb
5
+ import os
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ from collections import OrderedDict
9
+ import os
10
+ import numpy as np
11
+ from pathlib import Path
12
+
13
+
14
+ CURRENT_DIR = os.path.dirname(__file__)
15
+ import cv2
16
+ from os.path import expanduser
17
+ from epic_kitchens.hoa.types import BBox, FloatVector, HandSide
18
+ from epic_kitchens.hoa import load_detections
19
+
20
+ RESOLUTION = (480, 480)
21
+ home = expanduser("~")
22
+
23
+ # Adjust these to the where-ever your detections and frames are stored.
24
+ DETECTION_ROOT = f"/checkpoint/xinleic/LR/epic-kitchens-100-hand-object-bboxes/labels/hand-objects"
25
+ FRAMES_ROOT = f"/datasets01/EPIC-KITCHENS-100"
26
+
27
+ # DETECTION_ROOT = f'{home}/Projects/epic_kitchen_labels/hand-objects'
28
+ # FRAMES_ROOT = f'{home}/EPIC-KITCHENS'
29
+ detections_root = Path(DETECTION_ROOT)
30
+ frames_root = Path(FRAMES_ROOT)
31
+
32
+
33
+ def compute_state_and_actions(curr_frame, next_frame):
34
+ curr_hand1, curr_hand2 = curr_frame.hands[0], curr_frame.hands[1]
35
+ if curr_hand1.side != HandSide.LEFT: # flip
36
+ curr_hand1, curr_hand2 = curr_hand2, curr_hand1
37
+
38
+ # already normalized
39
+ curr_hand1_center = curr_hand1.bbox.center
40
+ curr_hand2_center = curr_hand2.bbox.center
41
+
42
+ next_hand1, next_hand2 = next_frame.hands[0], next_frame.hands[1]
43
+ if next_hand1.side != HandSide.LEFT: # flip
44
+ next_hand1, next_hand2 = next_hand2, next_hand1
45
+
46
+ # already normalized even
47
+ next_hand1_center = next_hand1.bbox.center
48
+ next_hand2_center = next_hand2.bbox.center
49
+ state = np.concatenate((curr_hand1_center, curr_hand2_center))
50
+ action = np.concatenate(
51
+ (
52
+ np.array(next_hand1_center) - np.array(curr_hand1_center),
53
+ np.array(next_hand2_center) - np.array(curr_hand2_center),
54
+ )
55
+ )
56
+ return state, action
57
+
58
+
59
+ # define your own dataset conversion
60
+ def convert_dataset_image():
61
+ # convert to a list of episodes that can be added to replay buffer
62
+ ALL_EPISODES = os.listdir(FRAMES_ROOT)
63
+ MAX_EPISODE_LENGTH = 5000
64
+
65
+ for EPS in ALL_EPISODES:
66
+ rgb_path = os.path.join(FRAMES_ROOT, EPS, "rgb_frames")
67
+ if not os.path.exists(rgb_path):
68
+ continue
69
+ for video_id in os.listdir(rgb_path):
70
+ full_path = os.path.join(rgb_path, video_id)
71
+ if (
72
+ not full_path.endswith(".tar") and not full_path.endswith(".jpg") and not full_path.endswith("home")
73
+ ): # folder
74
+
75
+ # actions are the frame-to-frame differences of both hands' bounding-box centers.
76
+ participant_id = video_id[:3]
77
+ video_detections = load_detections(detections_root / participant_id / (video_id + ".pkl"))
78
+ max_frame_idx = len(video_detections) - 1
79
+ DS_FACTOR = 1
80
+ print(full_path)
81
+ steps = []
82
+
83
+ for frame_idx in range(0, max_frame_idx - DS_FACTOR, DS_FACTOR):
84
+ # print(video_detections[frame_idx].hands)
85
+ if (
86
+ len(video_detections[frame_idx].hands) != 2
87
+ or len(video_detections[frame_idx + DS_FACTOR].hands) != 2
88
+ ):
89
+ continue
90
+
91
+ s, a = compute_state_and_actions(
92
+ video_detections[frame_idx], video_detections[frame_idx + DS_FACTOR]
93
+ )
94
+ lang = "use human hands to do some tasks" # dummies
95
+ # print("state actions:", s, a)
96
+ image_path = frames_root / participant_id / "rgb_frames" / video_id / f"frame_{frame_idx:010d}.jpg"
97
+ # print(image_path)
98
+ image = cv2.imread(str(image_path))
99
+ if image is None:
100
+ continue
101
+ image = image[..., [2, 1, 0]] # RGB
102
+
103
+ # break into step dict
104
+ step = {
105
+ "observation": {"image": image, "state": s},
106
+ "action": a,
107
+ "language_instruction": lang,
108
+ }
109
+ steps.append(OrderedDict(step))
110
+ if len(steps) > MAX_EPISODE_LENGTH:
111
+ break
112
+ data_dict = {"steps": steps}
113
+ print(f"max_frame_idx: {max_frame_idx} ds factor: {DS_FACTOR} {len(steps)}")
114
+ yield data_dict
115
+
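In this file the state stacks the (already normalized) left/right hand-box centers at time t, and the action is the per-hand displacement to time t+1, so `state_t + action_t` approximately equals the next state. A small numeric sketch of that convention (the values are made up):

```python
import numpy as np

# state = [left_x, left_y, right_x, right_y] at time t; action = per-hand displacement to t+1
state_t = np.array([0.30, 0.55, 0.70, 0.52])
action_t = np.array([0.02, -0.01, -0.03, 0.00])
state_t1 = state_t + action_t
print(state_t1)  # hand centers implied at t+1 by the delta action
```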
datasets/extern/frodobot.py ADDED
@@ -0,0 +1,128 @@
1
+ # --------------------------------------------------------
2
+ # Licensed under The MIT License [see LICENSE for details]
3
+ # --------------------------------------------------------
4
+ import random
5
+ import os
6
+ import time
7
+ import sys
8
+ import numpy as np
9
+ import IPython
10
+ import torch
11
+ from tqdm import tqdm
12
+ from collections import OrderedDict
13
+ import os
14
+ import PIL.Image
15
+ import numpy as np
16
+ from typing import Union, List
17
+ from pathlib import Path
18
+ import re
19
+
20
+ CURRENT_DIR = os.path.dirname(__file__)
21
+ import cv2
22
+ from os.path import expanduser
23
+ import pickle
24
+ import cv2
25
+ from matplotlib import pyplot as plt
26
+
27
+ import pandas as pd
28
+ import json
29
+
30
+ RESOLUTION = (480, 480)
31
+ DATA = "/home/liruiw/Projects/frodobot/"
32
+ # https://colab.research.google.com/#scrollTo=50ce529a-a20a-4852-9a5a-114b52b98f2e&fileId=https%3A//huggingface.co/datasets/frodobots/FrodoBots-2K/blob/main/helpercode.ipynb
33
+
34
+
35
+ # #### control data
36
+ import pandas as pd
37
+
38
+ # print(f"{dataset_dir}/control_data_{ride_id}.csv")
39
+
40
+
41
+ def convert_img_dataset(
42
+ dataset_dir="/home/liruiw/Projects/frodobot/output_rides_22",
43
+ env_names=None,
44
+ gui=False,
45
+ episode_num_pertask=2000,
46
+ **kwargs,
47
+ ):
48
+ # convert to a list of episodes that can be added to replay buffer
49
+ for eps_file in os.listdir(dataset_dir)[:50]: # 50 trajectories
50
+ dataset_dir_ = os.path.join(dataset_dir, eps_file)
51
+ if os.path.isdir(dataset_dir_):
52
+ ride_id = dataset_dir_.split("_")[-2]
53
+ print(dataset_dir_)
54
+
55
+ ##### control data
56
+ control = pd.read_csv(f"{dataset_dir_}/control_data_{ride_id}.csv")
57
+ control_data_dict = control.set_index("timestamp").T.to_dict("list")
58
+ control_sorted_keys = sorted(list(control_data_dict.keys()))
59
+
60
+ ##### IMU data
61
+ gyro_data = pd.read_csv(f"{dataset_dir_}/imu_data_{ride_id}.csv")[["timestamp", "gyroscope"]]
62
+ gyro_data_dict = gyro_data.set_index("timestamp").T.to_dict("list")
63
+ gyro_sorted_keys = sorted(list(gyro_data_dict.keys()))
64
+
65
+ compass_data = pd.read_csv(f"{dataset_dir_}/imu_data_{ride_id}.csv")[["timestamp", "compass"]]
66
+ compass_data_dict = compass_data.set_index("timestamp").T.to_dict("list")
67
+ compass_sorted_keys = sorted(list(compass_data_dict.keys()))
68
+
69
+ accel_data = pd.read_csv(f"{dataset_dir_}/imu_data_{ride_id}.csv")[["timestamp", "accelerometer"]]
70
+ accel_data_dict = accel_data.set_index("timestamp").T.to_dict("list")
71
+ accel_sorted_keys = sorted(list(accel_data_dict.keys()))
72
+
73
+ ##### Camera data
74
+ camera_data = pd.read_csv(f"{dataset_dir_}/front_camera_timestamps_{ride_id}.csv")
75
+ camera_data_dict = camera_data.set_index("timestamp").T.to_dict("list")
76
+ camera_sorted_keys = sorted(list(camera_data_dict.keys()))
77
+
78
+ images = sorted(os.listdir(f"{dataset_dir_}/front_camera/"))
79
+
80
+ # #### front camera video
81
+ # front_camera = f"{dataset_dir}/recordings/0f0e8539d249f38e3ae7b18660f5af8c_ride_39572__uid_s_1000__uid_e_video_20240502221408754.ts"
82
+ languages = "drive around to play" # dummy
83
+ steps = []
84
+ SUBSAMPLE_IDX = 5
85
+
86
+ for idx, control_t in enumerate(control_sorted_keys):
87
+
88
+ # enumerate along actions and only pick matched timesteps
89
+ action = control_data_dict[control_t]
90
+ camera_t = camera_sorted_keys[np.argmin(np.abs(np.array(camera_sorted_keys) - control_t))]
91
+ camera_path = images[camera_data_dict[camera_t][0]]
92
+ img = cv2.resize(cv2.imread(f"{dataset_dir_}/front_camera/{camera_path}"), None, fx=0.5, fy=0.5)
93
+ gyro = gyro_data_dict[gyro_sorted_keys[np.argmin(np.abs(np.array(gyro_sorted_keys) - control_t))]]
94
+ first_three_strings = eval(gyro[0])[0][:3]
95
+ gyro_array = np.array(first_three_strings, dtype=float)
96
+
97
+ compass = compass_data_dict[compass_sorted_keys[np.argmin(np.abs(np.array(compass_sorted_keys) - control_t))]]
98
+ first_three_strings = eval(compass[0])[0][:3]
99
+ compass_array = np.array(first_three_strings, dtype=float)
100
+
101
+ accel = accel_data_dict[accel_sorted_keys[np.argmin(np.abs(np.array(accel_sorted_keys) - control_t))]]
102
+ first_three_strings = eval(accel[0])[0][:3]
103
+ accel_array = np.array(first_three_strings, dtype=float)
104
+
105
+ prop = np.concatenate((gyro_array, compass_array, accel_array))
106
+ step = {
107
+ "observation": {"state": prop, "image": img},
108
+ "action": action,
109
+ "language_instruction": languages,
110
+ }
111
+ steps.append(OrderedDict(step))
112
+ data_dict = {"steps": steps}
113
+ yield data_dict
114
+
115
+
116
+ class RolloutRunner:
117
+ """evaluate policy rollouts"""
118
+
119
+ def __init__(self, env_names, episode_num, save_video=False):
120
+ self.env_names = env_names
121
+ self.episode_num = episode_num
122
+ self.envs = []
123
+ self.scene_files = []
124
+ self.save_video = save_video
125
+
126
+ @torch.no_grad()
127
+ def run(self, policy, save_video=False, gui=False, video_postfix="", seed=233, env_name=None, **kwargs):
128
+ pass
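The FrodoBots streams (control, camera, IMU) are logged at different rates, so each control timestamp is paired with the closest timestamp from the other streams by absolute gap. A minimal sketch of that matching with a hypothetical helper name (`nearest_key` is not in this file):

```python
import numpy as np

def nearest_key(sorted_keys, t):
    """Return the timestamp in sorted_keys closest to t by absolute difference."""
    keys = np.asarray(sorted_keys, dtype=np.float64)
    return sorted_keys[int(np.argmin(np.abs(keys - t)))]

camera_ts = [0.00, 0.12, 0.21, 0.33]
print(nearest_key(camera_ts, 0.19))  # 0.21
```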
datasets/extern/robomimic.py ADDED
@@ -0,0 +1,108 @@
1
+ # --------------------------------------------------------
2
+ # Licensed under The MIT License [see LICENSE for details]
3
+ # --------------------------------------------------------
4
+
5
+ """
6
+ TODO: explain
7
+ """
8
+ import h5py
9
+ import numpy as np
10
+ import cv2
11
+ import time
12
+ from collections import OrderedDict
13
+ import robomimic.utils.file_utils as FileUtils
14
+
15
+ from sim.robomimic.robomimic_runner import (
16
+ create_env, OBS_KEYS, RESOLUTION
17
+ )
18
+ from sim.robomimic.robomimic_wrapper import RobomimicLowdimWrapper
19
+
20
+ from typing import Optional, Iterable
21
+
22
+ DATASET_DIR = 'data/robomimic/datasets'
23
+ SUPPORTED_ENVS = ['lift', 'square', 'can']
24
+ NUM_EPISODES_PER_TASK = 200
25
+
26
+
27
+ def render_step(env, state):
28
+ env.env.env.sim.set_state_from_flattened(state)
29
+ env.env.env.sim.forward()
30
+ img = env.render()
31
+ img = cv2.resize(img, RESOLUTION)
32
+ return img
33
+
34
+
35
+ def robomimic_dataset_size() -> int:
36
+ return len(SUPPORTED_ENVS) * NUM_EPISODES_PER_TASK
37
+
38
+
39
+ def robomimic_dataset_generator(example_inds: Optional[Iterable[int]] = None):
40
+ if example_inds is None:
41
+ example_inds = range(robomimic_dataset_size())
42
+
43
+ curr_env_name = None
44
+ for idx in example_inds:
45
+ # get env_name corresponding to idx
46
+ env_name = SUPPORTED_ENVS[idx // NUM_EPISODES_PER_TASK]
47
+ if curr_env_name is None or curr_env_name != env_name:
48
+ # need to load new env
49
+ dataset = f"{DATASET_DIR}/{env_name}/ph/image.hdf5"
50
+ env_meta = FileUtils.get_env_metadata_from_dataset(dataset)
51
+ env_meta["use_image_obs"] = True
52
+ env = create_env(env_meta=env_meta, obs_keys=OBS_KEYS)
53
+ env = RobomimicLowdimWrapper(env=env)
54
+ env.reset() # NOTE: this is necessary to remove green laser bug
55
+ curr_env_name = env_name
56
+
57
+ with h5py.File(dataset) as file:
58
+ demos = file["data"]
59
+ local_episode_idx = idx % NUM_EPISODES_PER_TASK
60
+ if f"demo_{local_episode_idx}" not in demos:
61
+ continue
62
+
63
+ demo = demos[f"demo_{local_episode_idx}"]
64
+ obs = demo["obs"]
65
+ states = demo["states"]
66
+ action = demo["actions"][:].astype(np.float32)
67
+ step_obs = np.concatenate([obs[key] for key in OBS_KEYS], axis=-1).astype(np.float32)
68
+ steps = []
69
+ for a, o, s in zip(action, step_obs, states):
70
+ # break into step dict
71
+ image = render_step(env, s)
72
+ step = {
73
+ "observation": {"state": o, "image": image},
74
+ "action": a,
75
+ "language_instruction": f"{env_name}",
76
+ }
77
+ steps.append(OrderedDict(step))
78
+ data_dict = {"steps": steps}
79
+ yield data_dict
80
+
81
+ # # import imageio
82
+ # for _ in range(3):
83
+ # steps = []
84
+ # perturbed_action = action + np.random.normal(0, 0.2, action.shape)
85
+ # current_state = states[0]
86
+ # _ = render_step(env, current_state)
87
+ # for someindex in range(len(action)):
88
+ # image = env.render()
89
+ # step = {
90
+ # "observation": {"image": image},
91
+ # "action": action[someindex],
92
+ # "language_instruction": f"{env_name}",
93
+ # }
94
+ # steps.append(OrderedDict(step))
95
+
96
+ # # simulate action
97
+ # env.step(perturbed_action[someindex])
98
+
99
+ # # # save video
100
+ # # frames = [step["observation"]["image"] for step in steps]
101
+ # # imageio.mimsave(f"test.mp4", frames, fps=10)
102
+ # # while not (user_input := input("Continue? (y/n)")) in ["y", "n"]:
103
+ # # print("Invalid input")
104
+
105
+ # data_dict = {"steps": steps}
106
+ # yield data_dict
107
+
108
+ env.close()
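The `example_inds` argument lets the conversion be sharded across workers; the robomimic generator indexes the 600 demos (3 tasks x 200 episodes) by a flat integer. A sketch of one possible contiguous split (the shard arithmetic here is illustrative; the actual splitting lives in the `encode_*` scripts):

```python
def shard_indices(total, shard_ind, num_shards):
    """Contiguous split of range(total) into num_shards near-equal pieces."""
    per_shard = (total + num_shards - 1) // num_shards
    start = shard_ind * per_shard
    return range(start, min(start + per_shard, total))

# e.g. shard 1 of 4 over the 600 robomimic episodes
inds = shard_indices(600, shard_ind=1, num_shards=4)  # range(150, 300)
# generator = robomimic_dataset_generator(example_inds=inds)
```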
datasets/merge_shards.py ADDED
@@ -0,0 +1,113 @@
1
+ """
2
+ Merge data shards generated from `encode_{extern,openx}_dataset.py`
3
+ In addition to CLI args, `SHARD_DATA_FORMAT` must be changed depending on the dataset.
4
+ """
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+
10
+ import numpy as np
11
+ from tqdm.auto import tqdm
12
+
13
+ SHARD_DATA_FORMAT = "/private/home/xinleic/LR/HPT-Video-KZ/sharded_data/droid_magvit_shard{}_of_{}_train"
14
+
15
+
16
+ if __name__ == "__main__":
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument("--out_data_dir", type=str, required=True,
19
+ help="Directory to save merged data, must not exist.")
20
+ parser.add_argument("--num_shards", type=int, required=True, help="Number of shards the dataset was split into.")
21
+
22
+ args = parser.parse_args()
23
+ assert not os.path.exists(args.out_data_dir), "Will not overwrite existing directory."
24
+ os.makedirs(os.path.join(args.out_data_dir, "actions"), exist_ok=True)
25
+
26
+ num_frames = 0
27
+ valid_inds = []
28
+
29
+ for shard_ind in range(args.num_shards):
30
+ shard_path = SHARD_DATA_FORMAT.format(shard_ind, args.num_shards)
31
+ if os.path.isfile(os.path.join(shard_path, "metadata.json")):
32
+ valid_inds.append(shard_ind)
33
+ with open(os.path.join(shard_path, "metadata.json"), "r") as f:
34
+ shard_metadata = json.load(f)
35
+
36
+ num_frames += shard_metadata["num_images"]
37
+ else:
38
+ print(f"{shard_ind=} is invalid.")
39
+
40
+ if num_frames == 0:
41
+ print("No valid shards")
42
+ exit(0)
43
+
44
+ token_dtype = np.dtype(shard_metadata["token_dtype"])
45
+ if shard_metadata["quantized"]:
46
+ frame_dims = (shard_metadata["h"], shard_metadata["w"])
47
+ else:
48
+ frame_dims = (shard_metadata["latent_channels"], shard_metadata["h"], shard_metadata["w"])
49
+
50
+ action_dim = shard_metadata["action_dim"]
51
+ videos = np.memmap(
52
+ os.path.join(args.out_data_dir, "video.bin"),
53
+ dtype=token_dtype,
54
+ mode="write",
55
+ shape=(num_frames, *frame_dims)
56
+ )
57
+
58
+ actions = np.memmap(
59
+ os.path.join(args.out_data_dir, "actions", "actions.bin"),
60
+ dtype=np.float32,
61
+ mode="write",
62
+ shape=(num_frames, action_dim)
63
+ )
64
+
65
+ segment_ids = np.memmap(
66
+ os.path.join(args.out_data_dir, "segment_ids.bin"),
67
+ dtype=np.int32,
68
+ mode="write",
69
+ shape=(num_frames,)
70
+ )
71
+
72
+ prev_frame_ind = 0
73
+ prev_segment_id = 0
74
+
75
+ for shard_ind in tqdm(valid_inds):
76
+ shard_path = SHARD_DATA_FORMAT.format(shard_ind, args.num_shards)
77
+ with open(os.path.join(shard_path, "metadata.json"), "r") as f:
78
+ shard_metadata = json.load(f)
79
+
80
+ shard_num_frames = shard_metadata["num_images"]
81
+ videos[prev_frame_ind: prev_frame_ind + shard_num_frames] = np.memmap(
82
+ os.path.join(shard_path, "video.bin"),
83
+ dtype=np.dtype(shard_metadata["token_dtype"]),
84
+ mode="r",
85
+ shape=(shard_num_frames, *frame_dims),
86
+ )
87
+
88
+ actions[prev_frame_ind: prev_frame_ind + shard_num_frames] = np.memmap(
89
+ os.path.join(shard_path, "actions", "actions.bin"),
90
+ dtype=np.float32,
91
+ mode="r",
92
+ shape=(shard_num_frames, action_dim),
93
+ )
94
+
95
+ segment_ids[prev_frame_ind: prev_frame_ind + shard_num_frames] = np.memmap(
96
+ os.path.join(shard_path, "segment_ids.bin"),
97
+ dtype=np.int32,
98
+ mode="r",
99
+ shape=(shard_num_frames,),
100
+ ) + prev_segment_id
101
+
102
+ prev_segment_id = segment_ids[prev_frame_ind + shard_num_frames - 1] + 1
103
+ prev_frame_ind += shard_num_frames
104
+
105
+ assert prev_frame_ind == num_frames
106
+ print("Finished")
107
+
108
+ with open(os.path.join(args.out_data_dir, "metadata.json"), "w") as f:
109
+ merged_metadata = shard_metadata \
110
+ | vars(args) \
111
+ | {"num_images": num_frames, "input_path": SHARD_DATA_FORMAT.format(0, args.num_shards)}
112
+
113
+ json.dump(merged_metadata, f)
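The merged output can be memory-mapped back using the metadata written above (keys: `num_images`, `token_dtype`, `quantized`, `h`, `w`, `latent_channels`, `action_dim`). A read-back sketch, assuming a hypothetical output directory:

```python
import json, os
import numpy as np

out_dir = "data/merged_dataset"  # hypothetical path passed as --out_data_dir
with open(os.path.join(out_dir, "metadata.json")) as f:
    meta = json.load(f)

frame_dims = (meta["h"], meta["w"]) if meta["quantized"] else (
    meta["latent_channels"], meta["h"], meta["w"])

videos = np.memmap(os.path.join(out_dir, "video.bin"),
                   dtype=np.dtype(meta["token_dtype"]), mode="r",
                   shape=(meta["num_images"], *frame_dims))
actions = np.memmap(os.path.join(out_dir, "actions", "actions.bin"),
                    dtype=np.float32, mode="r",
                    shape=(meta["num_images"], meta["action_dim"]))
segment_ids = np.memmap(os.path.join(out_dir, "segment_ids.bin"),
                        dtype=np.int32, mode="r", shape=(meta["num_images"],))
print(videos.shape, actions.shape, int(segment_ids[-1]) + 1, "episodes")
```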
datasets/utils.py ADDED
@@ -0,0 +1,244 @@
1
+ import os
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import torch
6
+ import torchvision.transforms.v2.functional as transforms_f
7
+ from diffusers import AutoencoderKLTemporalDecoder
8
+ from einops import rearrange
9
+ from transformers import T5Tokenizer, T5Model
10
+
11
+ from magvit2.config import VQConfig
12
+ from magvit2.models.lfqgan import VQModel
13
+
14
+ vision_model = None
+ global_language_model = None
+ t5_tok = None
15
+
16
+
17
+ def get_image_encoder(encoder_type: str, encoder_name_or_path: str):
18
+ encoder_type = encoder_type.lower()
19
+ if encoder_type == "magvit":
20
+ return VQModel(VQConfig(), ckpt_path=encoder_name_or_path)
21
+ elif encoder_type == "temporalvae":
22
+ return AutoencoderKLTemporalDecoder.from_pretrained(encoder_name_or_path, subfolder="vae")
23
+ else:
24
+ raise NotImplementedError(f"{encoder_type=}")
25
+
26
+
27
+ def set_seed(seed):
28
+ # set seed for reproducibility
29
+ torch.manual_seed(seed)
30
+ np.random.seed(seed)
31
+
32
+
33
+ def mkdir_if_missing(dst_dir):
34
+ """make destination folder if it's missing"""
35
+ if not os.path.exists(dst_dir):
36
+ os.makedirs(dst_dir)
37
+
38
+ def resize_image(image, resize=True):
39
+ MAX_RES = 1024
40
+
41
+ # convert to array
42
+ image = np.asarray(image)
43
+ h, w = image.shape[:2]
44
+ if h > MAX_RES or w > MAX_RES:
45
+ if h < w:
46
+ new_h, new_w = int(MAX_RES * w / h), MAX_RES
47
+ else:
48
+ new_h, new_w = MAX_RES, int(MAX_RES * h / w)
49
+ image = cv2.resize(image, (new_w, new_h))
50
+
51
+ if resize:
52
+ # resize the shorter side to 256 and then do a center crop
53
+ h, w = image.shape[:2]
54
+ if h < w:
55
+ new_h, new_w = 256, int(256 * w / h)
56
+ else:
57
+ new_h, new_w = int(256 * h / w), 256
58
+ image = cv2.resize(image, (new_w, new_h))
59
+
60
+ h, w = image.shape[:2]
61
+ crop_h, crop_w = 256, 256
62
+ start_h = (h - crop_h) // 2
63
+ start_w = (w - crop_w) // 2
64
+ image = image[start_h:start_h + crop_h, start_w:start_w + crop_w]
65
+ return image
66
+
67
+ def normalize_image(image, resize=True):
68
+ """
69
+ H x W x 3(uint8) -> imagenet normalized 3 x H x W
70
+
71
+ Normalizes image to [-1, 1].
72
+ Resizes the image if resize=True or if the image resolution > MAX_RES
73
+ """
74
+ image = resize_image(image, resize=resize)
75
+ # normalize between -1 and 1
76
+ image = image / 255.0
77
+ image = (image * 2 - 1.)
78
+ return torch.from_numpy(image.transpose(2, 0, 1))
79
+
80
+
81
+ def unnormalize_image(magvit_output):
82
+ """
83
+ [-1, 1] -> [0, 255]
84
+
85
+ Important: clip to [0, 255]
86
+ """
87
+ rescaled_output = ((magvit_output.detach().cpu() + 1) * 127.5)
88
+ clipped_output = torch.clamp(rescaled_output, 0, 255).to(dtype=torch.uint8)
89
+ return clipped_output
90
+
91
+ @torch.inference_mode()
92
+ @torch.no_grad()
93
+ def get_quantized_image_embeddings(
94
+ image,
95
+ encoder_type,
96
+ encoder_name_or_path,
97
+ keep_res=False,
98
+ device="cuda",
99
+ ):
100
+ """
101
+ image: (h, w, 3)
102
+ """
103
+ global vision_model
104
+ DEBUG = False
105
+ dtype = torch.bfloat16
106
+
107
+ if vision_model is None:
108
+ vision_model = get_image_encoder(encoder_type=encoder_type, encoder_name_or_path=encoder_name_or_path)
109
+ vision_model = vision_model.to(device=device, dtype=dtype)
110
+ vision_model.eval()
111
+
112
+ batch = normalize_image(image, resize=not keep_res)[None]
113
+ if not keep_res:
114
+ img_h, img_w = 256, 256
115
+ else:
116
+ img_h, img_w = batch.shape[2:]
117
+
118
+ h, w = img_h // 16, img_w // 16
119
+
120
+ with vision_model.ema_scope():
121
+ quant_, _, indices, _ = vision_model.encode(batch.to(device=device, dtype=dtype), flip=True)
122
+ indices = rearrange(indices, "(h w) -> h w", h=h, w=w)
123
+
124
+ # alternative way to get indices
125
+ # indices_ = vision_model.quantize.bits_to_indices(quant_.permute(0, 2, 3, 1) > 0).cpu().numpy()
126
+ # indices_ = rearrange(indices_, "(h w) -> h w", h=h, w=w)
127
+
128
+ if DEBUG:
129
+ # sanity check: decode and then visualize
130
+ with vision_model.ema_scope():
131
+ indices = indices[None]
132
+ # bit representations
133
+ quant = vision_model.quantize.get_codebook_entry(rearrange(indices, "b h w -> b (h w)"),
134
+ bhwc=indices.shape + (vision_model.quantize.codebook_dim,)).flip(1)
135
+ ## why is there a flip(1) needed for the codebook bits?
136
+ decoded_img = unnormalize_image(vision_model.decode(quant.to(device=device, dtype=dtype)))
137
+ transforms_f.to_pil_image(decoded_img[0]).save("decoded.png")
138
+ transforms_f.to_pil_image(image).save("original.png") # show()
139
+
140
+ # 18 x 16 x 16 of [-1., 1.] - > 16 x 16 uint32
141
+ indices = indices.type(torch.int32)
142
+ indices = indices.detach().cpu().numpy().astype(np.uint32)
143
+ return indices
144
+
145
+
146
+ @torch.inference_mode()
147
+ @torch.no_grad()
148
+ def get_vae_image_embeddings(
149
+ image,
150
+ encoder_type,
151
+ encoder_name_or_path,
152
+ keep_res: bool = False,
153
+ device="cuda",
154
+ ):
155
+ """
156
+ image: (h, w, 3), in [-1, 1]
157
+ use SD VAE to encode and decode the images.
158
+ """
159
+ global vision_model
160
+ DEBUG = False
161
+ dtype = torch.bfloat16
162
+
163
+ if vision_model is None:
164
+ vision_model = get_image_encoder(encoder_type, encoder_name_or_path)
165
+ vision_model = vision_model.to(device=device, dtype=dtype)
166
+ vision_model.eval()
167
+
168
+ # https://github.com/bytedance/IRASim/blob/main/sample/sample_autoregressive.py#L151
169
+ # if args.use_temporal_decoder:
170
+ # vae = AutoencoderKLTemporalDecoder.from_pretrained(args.vae_model_path, subfolder="t2v_required_models/vae_temporal_decoder").to(device)
171
+ # else:
172
+ # vae = AutoencoderKL.from_pretrained(args.vae_model_path, subfolder="vae").to(device)
173
+ # x = vae.encode(x).latent_dist.sample().mul_(vae.config.scaling_factor) ?
174
+
175
+ batch = normalize_image(image, resize=not keep_res)[None]
176
+
177
+ if isinstance(vision_model, AutoencoderKLTemporalDecoder):
178
+ # Think SVD expects images in [-1, 1] so we don't have to change anything?
179
+ # https://github.com/Stability-AI/generative-models/blob/1659a1c09b0953ad9cc0d480f42e4526c5575b37/scripts/demo/video_sampling.py#L182
180
+ # https://github.com/Stability-AI/generative-models/blob/1659a1c09b0953ad9cc0d480f42e4526c5575b37/scripts/demo/streamlit_helpers.py#L894
181
+ z = vision_model.encode(batch.to(device=device, dtype=dtype)).latent_dist.mean
182
+ elif isinstance(vision_model, VQModel): # vision_model should be VQModel
183
+ # with vision_model.ema_scope(): # doesn't matter due to bugged VQModel ckpt_path arg
184
+ z = vision_model.encode_without_quantize(batch.to(device=device, dtype=dtype))
185
+ else:
186
+ raise NotImplementedError(f"{vision_model=}")
187
+
188
+ if DEBUG:
189
+ decoded_img = unnormalize_image(vision_model.decode(z.to(device=device, dtype=dtype)))
190
+ transforms_f.to_pil_image(decoded_img[0]).save("decoded_unquant.png")
191
+ transforms_f.to_pil_image(image).save("original.png")
192
+
193
+ return z[0].detach().cpu().float().numpy().astype(np.float16)
194
+
195
+ # switch to VAE in SD
196
+ # https://huggingface.co/stabilityai/stable-diffusion-3.5-large/tree/main/vae
197
+ # https://github.com/bytedance/IRASim/blob/main/sample/sample_autoregressive.py#L151
198
+ # from diffusers.models import AutoencoderKL,AutoencoderKLTemporalDecoder
199
+ # vae_model_path = 'pretrained_models/stabilityai/stable-diffusion-xl-base-1.0'
200
+ # if args.use_temporal_decoder:
201
+ # vae = AutoencoderKLTemporalDecoder.from_pretrained(vae_model_path, subfolder="t2v_required_models/vae_temporal_decoder").to(device)
202
+ # else:
203
+ # vae = AutoencoderKL.from_pretrained(vae_model_path, subfolder="vae").to(device)
204
+ # z = vae.encode(x).latent_dist.sample().mul_(vae.config.scaling_factor)
205
+ # if DEBUG:
206
+ # decoded_img = unnormalize_image(vae.decode(z.to(device=device, dtype=dtype) / vae.config.scaling_factor))
207
+ # transforms_f.to_pil_image(decoded_img[0]).save("decoded_unquant.png")
208
+ # transforms_f.to_pil_image(image).save("original.png")
209
+
210
+
211
+ @torch.no_grad()
212
+ def get_t5_embeddings(language, per_token=True, max_length=16, device="cpu"):
213
+ """Get T5 embedding"""
214
+ global global_language_model, t5_tok
215
+ if global_language_model is None:
216
+ try:
217
+ t5_model = T5Model.from_pretrained("t5-base")
218
+ t5_tok = T5Tokenizer.from_pretrained("t5-base")
219
+ except Exception:  # offline fallback
220
+ t5_model = T5Model.from_pretrained("t5-base", local_files_only=True)
221
+ t5_tok = T5Tokenizer.from_pretrained("t5-base", local_files_only=True)
222
+ t5_model = t5_model.to(device)
223
+ global_language_model = t5_model
224
+ global_language_model.eval()
225
+
226
+ # forward pass through encoder only
227
+ enc = t5_tok(
228
+ [language],
229
+ return_tensors="pt",
230
+ padding="max_length",
231
+ truncation=True,
232
+ max_length=max_length,
233
+ ).to(device)
234
+
235
+ output = global_language_model.encoder(
236
+ input_ids=enc["input_ids"], attention_mask=enc["attention_mask"], return_dict=True
237
+ )
238
+ torch.cuda.empty_cache()
239
+ if per_token:
240
+ return output.last_hidden_state[0].detach().cpu().numpy()
241
+ else:
242
+ # get the final hidden states. average across tokens.
243
+ emb = output.last_hidden_state[0].mean(dim=0).detach().cpu().numpy()
244
+ return emb
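A quick round trip through the image helpers above: `normalize_image` maps an HxWx3 uint8 array to a [-1, 1] tensor of shape 3x256x256 (when `resize=True`), and `unnormalize_image` maps model outputs back to clipped uint8. A usage sketch, assuming the repo's dependencies are installed so `datasets.utils` imports cleanly:

```python
import numpy as np
from datasets.utils import normalize_image, unnormalize_image

dummy = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
x = normalize_image(dummy)      # torch.Tensor, 3 x 256 x 256, values in [-1, 1]
y = unnormalize_image(x[None])  # back to uint8, clipped to [0, 255]
print(x.shape, float(x.min()), float(x.max()), y.dtype)
```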
experiments/.DS_Store ADDED
Binary file (6.15 kB).
 
experiments/datasplit/.DS_Store ADDED
Binary file (6.15 kB).
 
experiments/datasplit/dataset1.yaml ADDED
@@ -0,0 +1,2 @@
1
+ domains: >
2
+ language_table
experiments/datasplit/dataset10.yaml ADDED
@@ -0,0 +1,11 @@
1
+ domains: >
2
+ bridge_data_v2,
3
+ fractal20220817_data,
4
+ language_table,
5
+ ucsd_pick_and_place_dataset_converted_externally_to_rlds,
6
+ kaist_nonprehensile_converted_externally_to_rlds,
7
+ ucsd_kitchen_dataset_converted_externally_to_rlds,
8
+ berkeley_fanuc_manipulation,
9
+ bc_z,
10
+ cmu_play_fusion,
11
+ kuka
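Each datasplit YAML stores the domain list as a single folded string. A parsing sketch, assuming PyYAML and that the training code splits the `domains` field on commas (the exact loader is not shown in this commit):

```python
import yaml

with open("experiments/datasplit/dataset10.yaml") as f:
    cfg = yaml.safe_load(f)

# folded scalar (">") joins the lines into one comma-separated string
domains = [d.strip() for d in cfg["domains"].split(",") if d.strip()]
print(len(domains), domains[:3])
```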
experiments/datasplit/dataset15.yaml ADDED
@@ -0,0 +1,16 @@
1
+ domains: >
2
+ bridge_data_v2,
3
+ fractal20220817_data,
4
+ language_table,
5
+ ucsd_pick_and_place_dataset_converted_externally_to_rlds,
6
+ kaist_nonprehensile_converted_externally_to_rlds,
7
+ ucsd_kitchen_dataset_converted_externally_to_rlds,
8
+ utokyo_xarm_bimanual_converted_externally_to_rlds,
9
+ stanford_hydra_dataset_converted_externally_to_rlds,
10
+ austin_sirius_dataset_converted_externally_to_rlds,
11
+ berkeley_fanuc_manipulation,
12
+ berkeley_mvp_converted_externally_to_rlds,
13
+ berkeley_rpt_converted_externally_to_rlds,
14
+ cmu_play_fusion,
15
+ iamlab_cmu_pickup_insert_converted_externally_to_rlds,
16
+ qut_dexterous_manpulation
experiments/datasplit/dataset15_vae.yaml ADDED
@@ -0,0 +1,16 @@
1
+ domains: >
2
+ bridge_data_v2,
3
+ fractal20220817_data,
4
+ language_table,
5
+ ucsd_pick_and_place_dataset_converted_externally_to_rlds,
6
+ kaist_nonprehensile_converted_externally_to_rlds,
7
+ ucsd_kitchen_dataset_converted_externally_to_rlds,
8
+ utokyo_xarm_bimanual_converted_externally_to_rlds,
9
+ stanford_hydra_dataset_converted_externally_to_rlds,
10
+ austin_sirius_dataset_converted_externally_to_rlds,
11
+ berkeley_fanuc_manipulation,
12
+ berkeley_mvp_converted_externally_to_rlds,
13
+ berkeley_rpt_converted_externally_to_rlds,
14
+ cmu_play_fusion,
15
+ iamlab_cmu_pickup_insert_converted_externally_to_rlds,
16
+ qut_dexterous_manpulation
experiments/datasplit/dataset20.yaml ADDED
@@ -0,0 +1,21 @@
1
+ domains: >
2
+ bridge_data_v2,
3
+ fractal20220817_data,
4
+ language_table,
5
+ ucsd_pick_and_place_dataset_converted_externally_to_rlds,
6
+ kaist_nonprehensile_converted_externally_to_rlds,
7
+ ucsd_kitchen_dataset_converted_externally_to_rlds,
8
+ utokyo_xarm_bimanual_converted_externally_to_rlds,
9
+ stanford_hydra_dataset_converted_externally_to_rlds,
10
+ austin_sirius_dataset_converted_externally_to_rlds,
11
+ berkeley_fanuc_manipulation,
12
+ berkeley_mvp_converted_externally_to_rlds,
13
+ cmu_play_fusion,
14
+ robo_net,
15
+ furniture_bench_dataset_converted_externally_to_rlds,
16
+ dlr_sara_grid_clamp_converted_externally_to_rlds,
17
+ cmu_stretch,
18
+ droid,
19
+ toto,
20
+ bc_z,
21
+ kuka
experiments/datasplit/dataset20_vae.yaml ADDED
@@ -0,0 +1,21 @@
1
+ domains: >
2
+ language_table,
3
+ ucsd_pick_and_place_dataset_converted_externally_to_rlds,
4
+ kaist_nonprehensile_converted_externally_to_rlds,
5
+ ucsd_kitchen_dataset_converted_externally_to_rlds,
6
+ utokyo_xarm_bimanual_converted_externally_to_rlds,
7
+ stanford_hydra_dataset_converted_externally_to_rlds,
8
+ austin_sirius_dataset_converted_externally_to_rlds,
9
+ berkeley_fanuc_manipulation,
10
+ berkeley_mvp_converted_externally_to_rlds,
11
+ berkeley_rpt_converted_externally_to_rlds,
12
+ cmu_play_fusion,
13
+ iamlab_cmu_pickup_insert_converted_externally_to_rlds,
14
+ qut_dexterous_manpulation,
15
+ robo_net,
16
+ dlr_sara_grid_clamp_converted_externally_to_rlds,
17
+ cmu_stretch,
18
+ columbia_cairlab_pusht_real,
19
+ droid,
20
+ toto,
21
+ io_ai_tech
experiments/datasplit/dataset25.yaml ADDED
@@ -0,0 +1,26 @@
1
+ domains: >
2
+ bridge_data_v2,
3
+ fractal20220817_data,
4
+ language_table,
5
+ ucsd_pick_and_place_dataset_converted_externally_to_rlds,
6
+ kaist_nonprehensile_converted_externally_to_rlds,
7
+ ucsd_kitchen_dataset_converted_externally_to_rlds,
8
+ utokyo_xarm_bimanual_converted_externally_to_rlds,
9
+ stanford_hydra_dataset_converted_externally_to_rlds,
10
+ austin_sirius_dataset_converted_externally_to_rlds,
11
+ berkeley_fanuc_manipulation,
12
+ berkeley_mvp_converted_externally_to_rlds,
13
+ cmu_play_fusion,
14
+ iamlab_cmu_pickup_insert_converted_externally_to_rlds,
15
+ robo_net,
16
+ furniture_bench_dataset_converted_externally_to_rlds,
17
+ dlr_sara_grid_clamp_converted_externally_to_rlds,
18
+ cmu_stretch,
19
+ droid,
20
+ toto,
21
+ io_ai_tech,
22
+ bc_z,
23
+ roboturk,
24
+ cmu_franka_exploration_dataset_converted_externally_to_rlds,
25
+ nyu_door_opening_surprising_effectiveness,
26
+ kuka