marcusinthesky committed on
Commit
5edbd08
·
1 Parent(s): ab2ce92

Conversion script

Browse files
Files changed (1) hide show
  1. convert_open_clip_to_hf.py +265 -0
convert_open_clip_to_hf.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ import os.path
18
+
19
+ import torch
20
+
21
+ from open_clip import create_model
22
+ from transformers import CLIPConfig, CLIPVisionConfig, CLIPTextConfig, CLIPModel
23
+
24
+
25
def copy_attn_layer(hf_attn_layer, pt_attn_layer):
    """Copy attention parameters from ``pt_attn_layer`` into ``hf_attn_layer``.

    The source layer keeps its query/key/value projections stacked along
    dim 0 of ``in_proj_weight`` / ``in_proj_bias``. They are split into three
    chunks and written, in that order, to the target's separate ``q_proj``,
    ``k_proj`` and ``v_proj`` modules. The output projection is copied
    unchanged.

    Parameters are written in place with ``copy_``; the only entry point
    visible in this file runs under ``torch.no_grad()``.
    """
    assert hf_attn_layer.num_heads == pt_attn_layer.num_heads

    # Unpack explicitly so a source that does not split into exactly three
    # chunks fails here instead of being partially copied.
    q_weight, k_weight, v_weight = pt_attn_layer.in_proj_weight.chunk(3, dim=0)
    q_bias, k_bias, v_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0)

    split_params = (
        ("q_proj", q_weight, q_bias),
        ("k_proj", k_weight, k_bias),
        ("v_proj", v_weight, v_bias),
    )
    for proj_name, weight, bias in split_params:
        target_proj = getattr(hf_attn_layer, proj_name)
        target_proj.weight.copy_(weight)
        target_proj.bias.copy_(bias)

    target_out, source_out = hf_attn_layer.out_proj, pt_attn_layer.out_proj
    target_out.weight.copy_(source_out.weight)
    target_out.bias.copy_(source_out.bias)
41
+
42
+
43
def copy_mlp(hf_mlp, pt_mlp):
    """Copy the two MLP projections from ``pt_mlp`` into ``hf_mlp``.

    Source ``c_fc`` is written to target ``fc1`` and source ``c_proj`` to
    target ``fc2``, each via ``copy_linear`` (weight and bias).
    """
    name_pairs = (("fc1", "c_fc"), ("fc2", "c_proj"))
    for hf_name, pt_name in name_pairs:
        copy_linear(getattr(hf_mlp, hf_name), getattr(pt_mlp, pt_name))
46
+
47
+
48
def copy_linear(hf_linear, pt_linear):
    """Copy ``weight`` and ``bias`` from ``pt_linear`` into ``hf_linear`` in place.

    Despite the name, this file also uses it for layer-norm modules: anything
    that exposes ``weight`` and ``bias`` tensors works.
    """
    for param_name in ("weight", "bias"):
        getattr(hf_linear, param_name).copy_(getattr(pt_linear, param_name))
51
+
52
+
53
def copy_layer(hf_layer, pt_layer):
    """Copy one transformer block from ``pt_layer`` into ``hf_layer``.

    Copies, in order: the two layer norms (``ln_1`` -> ``layer_norm1``,
    ``ln_2`` -> ``layer_norm2``), the MLP, and the attention module
    (``attn`` -> ``self_attn``).
    """
    # Layer norms expose weight/bias, so the linear copier handles them.
    norm_pairs = (("layer_norm1", "ln_1"), ("layer_norm2", "ln_2"))
    for hf_norm_name, pt_norm_name in norm_pairs:
        copy_linear(getattr(hf_layer, hf_norm_name), getattr(pt_layer, pt_norm_name))

    # Feed-forward block.
    copy_mlp(hf_layer.mlp, pt_layer.mlp)

    # Self-attention block.
    copy_attn_layer(hf_layer.self_attn, pt_layer.attn)
63
+
64
+
65
def copy_layers(hf_layers, pt_layers):
    """Copy every transformer block in ``pt_layers`` into ``hf_layers`` pairwise.

    Args:
        hf_layers: Iterable of target layers.
        pt_layers: Iterable of source layers, in the same order.

    Raises:
        ValueError: If the two sequences differ in length. A bare ``zip``
            would silently stop at the shorter one and leave the remaining
            target layers with their initial weights.
    """
    hf_layers = list(hf_layers)
    pt_layers = list(pt_layers)
    if len(hf_layers) != len(pt_layers):
        raise ValueError(
            f"Layer count mismatch: target has {len(hf_layers)} layers, "
            f"source has {len(pt_layers)}; check that the config matches the checkpoint."
        )

    for hf_layer, pt_layer in zip(hf_layers, pt_layers):
        copy_layer(hf_layer, pt_layer)
68
+
69
+
70
def copy_encoder(hf_encoder, pt_model):
    """Copy the text encoder weights from ``pt_model`` into ``hf_encoder``.

    Covers the token and position embeddings, the final layer norm and all
    transformer blocks. The source ``positional_embedding`` is read directly
    as a tensor, whereas the target position embedding is a module whose
    ``weight`` is overwritten.
    """
    # Embedding tables.
    hf_embeddings = hf_encoder.embeddings
    hf_embeddings.token_embedding.weight.copy_(pt_model.token_embedding.weight)
    hf_embeddings.position_embedding.weight.copy_(pt_model.positional_embedding)

    # Final layer norm (has weight/bias, so the linear copier applies).
    copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final)

    # Transformer blocks.
    source_blocks = pt_model.transformer.resblocks
    copy_layers(hf_encoder.encoder.layers, source_blocks)
80
+
81
+
82
def copy_text_model_and_projection(hf_model, pt_model):
    """Copy the text projection and the text encoder into ``hf_model``.

    The source ``text_projection`` is transposed before being written to the
    target projection's ``weight``.
    """
    # Projection head: the source is transposed relative to the target weight.
    transposed_projection = pt_model.text_projection.T
    hf_model.text_projection.weight.copy_(transposed_projection)

    # Text encoder (embeddings, final norm, transformer blocks).
    copy_encoder(hf_model.text_model, pt_model)
88
+
89
+
90
def copy_vison_model_and_projection(hf_model, pt_model):
    """Copy the visual projection and the vision tower into ``hf_model``.

    Reads from ``pt_model.visual`` and writes to ``hf_model.vision_model``
    (plus ``hf_model.visual_projection``). The source projection is
    transposed before being written to the target ``weight``.
    """
    hf_vision = hf_model.vision_model
    pt_visual = pt_model.visual

    # Projection head.
    hf_model.visual_projection.weight.copy_(pt_visual.proj.T)

    # Pre/post layer norms. NOTE: `pre_layrnorm` is the attribute name this
    # script expects on the target model; do not "fix" the spelling here.
    copy_linear(hf_vision.pre_layrnorm, pt_visual.ln_pre)
    copy_linear(hf_vision.post_layernorm, pt_visual.ln_post)

    # Embeddings: only the patch convolution's weight is copied (no bias).
    hf_embeddings = hf_vision.embeddings
    hf_embeddings.patch_embedding.weight.copy_(pt_visual.conv1.weight)
    hf_embeddings.class_embedding.copy_(pt_visual.class_embedding)
    hf_embeddings.position_embedding.weight.copy_(pt_visual.positional_embedding)

    # Transformer blocks.
    copy_layers(hf_vision.encoder.layers, pt_visual.transformer.resblocks)
105
+
106
+
107
def _assert_allclose(label, hf_tensor, pt_tensor, atol=1e-4):
    """Raise ``AssertionError`` if the tensors differ by more than ``atol``.

    Uses an explicit raise rather than an ``assert`` statement so the check
    still runs under ``python -O``.
    """
    if not torch.allclose(hf_tensor, pt_tensor, atol=atol):
        max_abs_diff = (hf_tensor - pt_tensor).abs().max().item()
        raise AssertionError(
            f"{label} mismatch between converted and original model: "
            f"max abs diff {max_abs_diff:.3e} exceeds atol={atol}"
        )


@torch.no_grad()
def convert_clip_checkpoint(model, pretrained, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.

    Builds a ``CLIPModel`` from a config, loads the source model through
    ``create_model``, copies all weights across, checks on dummy inputs that
    both models produce the same embeddings and logits (``atol=1e-4``), and
    writes the result to disk.

    Args:
        model: Model name passed to ``create_model``.
        pretrained: Value passed as ``pretrained=`` to ``create_model``. If it
            is an existing filesystem path, only its basename without
            extension is used when deriving the default output folder name.
        pytorch_dump_folder_path: Output directory. When ``None`` the folder
            ``f'{model}-{pretrained}'`` is used.
        config_path: Optional path given to ``CLIPConfig.from_pretrained``.
            When ``None`` a built-in B/16 configuration is used.

    Raises:
        AssertionError: If the converted model's outputs deviate from the
            source model's outputs by more than the tolerance.
    """
    if config_path is not None:
        # An explicit config always wins over the built-in default.
        config = CLIPConfig.from_pretrained(config_path)
    else:
        # Built-in default: B/16.
        config = CLIPConfig(
            projection_dim=512,
            text_config_dict=dict(
                hidden_act='gelu',
            ),
            vision_config_dict=dict(
                hidden_act='gelu',
                num_hidden_layers=12,
                patch_size=16,
            ),
        )

    # Reference hyper-parameters for other architectures (every text and
    # vision config also uses hidden_act='gelu'). Supply a matching config
    # through `config_path` rather than editing the default above.
    #
    #   L14:  projection_dim=768
    #         text:   hidden_size=768, intermediate_size=3072, num_attention_heads=12
    #         vision: num_hidden_layers=24, patch_size=14, hidden_size=1024,
    #                 intermediate_size=4096, num_attention_heads=16
    #   H14:  projection_dim=1024
    #         text:   hidden_size=1024, intermediate_size=4096, num_attention_heads=16,
    #                 num_hidden_layers=24
    #         vision: num_hidden_layers=32, patch_size=14, hidden_size=1280,
    #                 intermediate_size=5120, num_attention_heads=16
    #   B16 plus (presumably; unlabeled in the original notes):
    #         projection_dim=640
    #         text:   hidden_size=640, intermediate_size=2560, num_attention_heads=10
    #         vision: num_hidden_layers=12, patch_size=16, hidden_size=896,
    #                 num_attention_heads=14, intermediate_size=3584, image_size=240
    #   g14:  projection_dim=1024
    #         text:   hidden_size=1024, intermediate_size=4096, num_attention_heads=16,
    #                 num_hidden_layers=24
    #         vision: num_hidden_layers=40, patch_size=14, hidden_size=1408,
    #                 intermediate_size=6144, num_attention_heads=16

    print(config)
    hf_model = CLIPModel(config).eval()
    print(hf_model)

    pt_model = create_model(model, pretrained=pretrained, precision='fp32')
    pt_model = pt_model.eval()
    print(pt_model)

    copy_text_model_and_projection(hf_model, pt_model)
    copy_vison_model_and_projection(hf_model, pt_model)
    hf_model.logit_scale = pt_model.logit_scale

    # Size the dummy inputs from the config so non-224px / non-77-token
    # variants can be verified too (the defaults give 77 tokens and 224px).
    seq_len = config.text_config.max_position_embeddings
    image_size = config.vision_config.image_size
    input_ids = torch.arange(0, seq_len).unsqueeze(0)
    pixel_values = torch.randn(1, 3, image_size, image_size)

    hf_image_embed = hf_model.get_image_features(pixel_values)
    hf_text_embed = hf_model.get_text_features(input_ids)

    pt_image_embed = pt_model.encode_image(pixel_values)
    pt_text_embed = pt_model.encode_text(input_ids)

    # Diagnostics printed before the hard checks so they are visible on failure.
    text_diff = pt_text_embed - hf_text_embed
    print((pt_image_embed - hf_image_embed).sum())
    print(text_diff.sum())
    print(text_diff.max(), text_diff.min())
    _assert_allclose("image embedding", hf_image_embed, pt_image_embed)
    _assert_allclose("text embedding", hf_text_embed, pt_text_embed)

    hf_logits_per_image, hf_logits_per_text = hf_model(
        input_ids=input_ids, pixel_values=pixel_values, return_dict=False
    )[:2]

    pt_image_features, pt_text_features, logit_scale = pt_model(pixel_values, input_ids)
    pt_logits_per_image = pt_image_features @ pt_text_features.T * logit_scale
    pt_logits_per_text = pt_logits_per_image.T

    _assert_allclose("logits_per_image", hf_logits_per_image, pt_logits_per_image)
    _assert_allclose("logits_per_text", hf_logits_per_text, pt_logits_per_text)

    # When `pretrained` is a checkpoint file, use its stem as the tag.
    if pretrained is not None and os.path.exists(pretrained):
        pretrained = os.path.splitext(os.path.basename(pretrained))[0]

    # Honour the requested output folder; fall back to the derived name.
    if pytorch_dump_folder_path is None:
        pytorch_dump_folder_path = f'{model}-{pretrained}'

    hf_model.save_pretrained(pytorch_dump_folder_path)

    # Keep the original state dict next to the converted model.
    torch.save(
        pt_model.state_dict(),
        os.path.join(pytorch_dump_folder_path, 'open_clip_pytorch_model.bin'),
    )
256
+
257
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    # The model name is mandatory: it is handed straight to `create_model`.
    parser.add_argument("--model", required=True, type=str, help="Name of the open_clip model to convert.")
    parser.add_argument(
        "--pretrained",
        default=None,
        type=str,
        help="open_clip `pretrained` argument: a pretrained tag or a path to a checkpoint file.",
    )
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    args = parser.parse_args()

    convert_clip_checkpoint(args.model, args.pretrained, args.pytorch_dump_folder_path, args.config_path)