{
  "model_type": "text-to-video",
  "model_name": "KalaVids",
  "num_layers": 12,
  "hidden_size": 768,
  "vocab_size": 50257,
  "video_resolution": "1080p",
  "frame_rate": 30,
  "num_frames": 300,
  "num_attention_heads": 12,
  "intermediate_size": 3072,
  "hidden_act": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "dropout": 0.1,
  "attention_dropout": 0.1,
  "num_labels": 2,
  "use_cache": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "pad_token_id": 0,
  "unk_token_id": 50257,
  "special_tokens_map": {
    "bos_token": "",
    "eos_token": "",
    "pad_token": "",
    "unk_token": ""
  },
  "tokenizer_class": "BertTokenizer",
  "additional_special_tokens": [
    "",
    ""
  ],
  "task_specific_params": {
    "text-to-video": {
      "video_resolution": "1080p",
      "frame_rate": 30,
      "num_frames": 300
    }
  },
  "device_map": "auto"
}