import torch
from transformers import VisionEncoderDecoderModel

# Export the weights of a pretrained image-captioning model to a PyTorch
# state-dict file.

# Identifier passed to `from_pretrained` to select the checkpoint.
MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"
# Destination file for the serialized state dictionary.
STATE_DICT_PATH = "model.pth"

# Load the pretrained model.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)

# Save only the state dictionary rather than the whole model object; to
# reuse the file, rebuild the same architecture first and then call
# `load_state_dict` on it.
torch.save(model.state_dict(), STATE_DICT_PATH)