"""Export the weights of a pretrained image-captioning model to a file.

Run as a script, this loads the ``nlpconnect/vit-gpt2-image-captioning``
checkpoint and writes its state dictionary to ``model.pth`` in the current
working directory.
"""

import torch

# Checkpoint identifier passed to ``from_pretrained``.
MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"
# Destination file for the serialized state dictionary.
OUTPUT_PATH = "model.pth"


def main() -> None:
    """Load the pretrained model and save its state dictionary to disk."""
    # The original one-line source never imported this class; it is provided
    # by the Hugging Face ``transformers`` package. Imported here rather than
    # at module level so that merely importing this file does not pull in
    # the heavy dependency.
    from transformers import VisionEncoderDecoderModel

    # Load the model
    model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)

    # Save the model state dictionary (only what ``state_dict()`` returns,
    # not the model object itself).
    torch.save(model.state_dict(), OUTPUT_PATH)


if __name__ == "__main__":
    main()