shunk031 committed on
Commit
7d55fca
·
verified ·
1 Parent(s): a3146ea

Upload AestheticsPredictorV2Linear

Browse files
Files changed (3) hide show
  1. config.json +1 -1
  2. configuration_predictor.py +39 -0
  3. modeling_v2.py +138 -0
config.json CHANGED
@@ -5,7 +5,7 @@
5
  ],
6
  "attention_dropout": 0.0,
7
  "auto_map": {
8
- "AutoModel": "v2.AestheticsPredictorV2Linear"
9
  },
10
  "dropout": 0.0,
11
  "hidden_act": "quick_gelu",
 
5
  ],
6
  "attention_dropout": 0.0,
7
  "auto_map": {
8
+ "AutoModel": "modeling_v2.AestheticsPredictorV2Linear"
9
  },
10
  "dropout": 0.0,
11
  "hidden_act": "quick_gelu",
configuration_predictor.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.models.clip.configuration_clip import CLIPVisionConfig
2
+
3
+
4
class AestheticsPredictorConfig(CLIPVisionConfig):
    """Configuration for the aesthetics predictor models.

    Accepts the same vision-encoder hyperparameters as ``CLIPVisionConfig``
    and hands them to the parent constructor unchanged. The only thing this
    subclass overrides is ``model_type``.

    Any additional keyword arguments are passed through to the parent
    constructor via ``**kwargs``.
    """

    model_type = "aesthetics_predictor"

    def __init__(
        self,
        hidden_size: int = 768,
        intermediate_size: int = 3072,
        projection_dim: int = 512,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 32,
        hidden_act: str = "quick_gelu",
        layer_norm_eps: float = 0.00001,
        attention_dropout: float = 0,
        initializer_range: float = 0.02,
        initializer_factor: float = 1,
        **kwargs,
    ):
        # Forward by keyword rather than by position: each value then binds to
        # the parent parameter of the same name, so a parent signature with a
        # different positional order (or an extra slot in the middle) cannot
        # silently shift values into the wrong hyperparameter.
        super().__init__(
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            projection_dim=projection_dim,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_channels=num_channels,
            image_size=image_size,
            patch_size=patch_size,
            hidden_act=hidden_act,
            layer_norm_eps=layer_norm_eps,
            attention_dropout=attention_dropout,
            initializer_range=initializer_range,
            initializer_factor=initializer_factor,
            **kwargs,
        )
modeling_v2.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ from typing import Dict, Final, Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from transformers import CLIPVisionModelWithProjection, logging
7
+ from transformers.modeling_outputs import ImageClassifierOutputWithNoAttention
8
+
9
+ from .configuration_predictor import AestheticsPredictorConfig
10
+
11
# Import-time side effect: sets the `transformers` logging verbosity to the
# error level as soon as this module is loaded.
# NOTE(review): this presumably affects transformers logging for the whole
# process, not only this module — confirm that is intended.
+ logging.set_verbosity_error()
12
+
13
# Checkpoint download URLs for the linear predictor heads, keyed by the head
# name that `convert_v2_linear_from_openai_clip` accepts.
URLS_LINEAR: Final[Dict[str, str]] = {
    "sac+logos+ava1-l14-linearMSE": (
        "https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
    ),
    "ava+logos-l14-linearMSE": (
        "https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/ava%2Blogos-l14-linearMSE.pth"
    ),
}


# Checkpoint download URLs for the ReLU predictor heads, keyed by the head
# name that `convert_v2_relu_from_openai_clip` accepts.
URLS_RELU: Final[Dict[str, str]] = {
    "ava+logos-l14-reluMSE": (
        "https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/ava%2Blogos-l14-reluMSE.pth"
    ),
}
22
+
23
+
24
class AestheticsPredictorV2Linear(CLIPVisionModelWithProjection):
    """CLIP vision encoder (with projection) followed by a linear regression head.

    The head is a stack of ``nn.Linear`` layers with dropout and no
    non-linearities. It maps the projected image embedding
    (``config.projection_dim`` features) to a single value per image.
    """

    def __init__(self, config: AestheticsPredictorConfig) -> None:
        super().__init__(config)
        # Module order matters: the converter in this file loads pretrained
        # head weights into `self.layers` by their index inside the Sequential.
        self.layers = nn.Sequential(
            nn.Linear(config.projection_dim, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        )
        self.post_init()

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        """Embed the images, normalise the embedding and run the prediction head.

        Args:
            pixel_values: Image batch, passed straight to the parent
                ``CLIPVisionModelWithProjection.forward``.
            output_attentions: Passed straight to the parent forward.
            output_hidden_states: Passed straight to the parent forward.
            labels: Optional regression targets, one value per image. When
                given, a mean-squared-error loss against the prediction is
                computed.
            return_dict: Whether to return a model-output object instead of a
                tuple. Falls back to ``self.config.use_return_dict`` when
                ``None``.

        Returns:
            ``(loss, prediction, image_embeds)`` when ``return_dict`` is false,
            otherwise an ``ImageClassifierOutputWithNoAttention`` whose
            ``logits`` is the prediction and whose ``hidden_states`` is the
            normalised image embedding. ``loss`` is ``None`` when no labels
            were supplied.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = super().forward(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        image_embeds = outputs[0]  # image_embeds
        # Scale each embedding to unit L2 norm. Done out of place: an in-place
        # division would overwrite the tensor `norm` has just consumed, which
        # autograd may still need when back-propagating during training.
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)

        prediction = self.layers(image_embeds)

        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()
            # Flatten both sides so labels shaped (batch,) and (batch, 1) are
            # compared element-wise instead of being broadcast against each
            # other, and align the labels' device/dtype with the prediction.
            targets = labels.to(device=prediction.device, dtype=prediction.dtype)
            loss = loss_fct(prediction.reshape(-1), targets.reshape(-1))

        if not return_dict:
            return (loss, prediction, image_embeds)

        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=prediction,
            hidden_states=image_embeds,
        )
75
+
76
+
77
class AestheticsPredictorV2ReLU(AestheticsPredictorV2Linear):
    """Variant of the linear predictor whose head uses ReLU activations.

    The parent constructor runs first; the head it created in ``self.layers``
    is then replaced by one with a ``nn.ReLU`` after every linear layer except
    the final one.
    """

    def __init__(self, config: AestheticsPredictorConfig) -> None:
        super().__init__(config)

        # Feature widths of consecutive hidden layers and the dropout rate
        # applied after each one (None = no dropout after that layer).
        widths = (config.projection_dim, 1024, 128, 64, 16)
        dropout_rates = (0.2, 0.2, 0.1, None)

        head_modules = []
        for in_features, out_features, rate in zip(widths, widths[1:], dropout_rates):
            head_modules.append(nn.Linear(in_features, out_features))
            head_modules.append(nn.ReLU())
            if rate is not None:
                head_modules.append(nn.Dropout(rate))
        # Final projection to a single output value, with no activation.
        head_modules.append(nn.Linear(16, 1))

        self.layers = nn.Sequential(*head_modules)
        self.post_init()
95
+
96
+
97
def convert_v2_linear_from_openai_clip(
    predictor_head_name: str,
    openai_model_name: str = "openai/clip-vit-large-patch14",
) -> AestheticsPredictorV2Linear:
    """Build a linear-head predictor from a CLIP checkpoint plus a pretrained head.

    Args:
        predictor_head_name: Key into ``URLS_LINEAR`` selecting which head
            checkpoint to download.
        openai_model_name: Model identifier handed to ``from_pretrained`` to
            load the CLIP weights.

    Returns:
        The assembled model, switched to evaluation mode.

    Raises:
        KeyError: If ``predictor_head_name`` is not a key of ``URLS_LINEAR``.
        AssertionError: If the downloaded object is not an ``OrderedDict``.
    """
    predictor = AestheticsPredictorV2Linear.from_pretrained(openai_model_name)

    checkpoint_url = URLS_LINEAR[predictor_head_name]
    raw_weights = torch.hub.load_state_dict_from_url(
        checkpoint_url, map_location="cpu"
    )
    assert isinstance(raw_weights, OrderedDict)

    # The checkpoint keys carry a `layers.` component; drop it so the keys
    # line up with the Sequential head that the weights are loaded into.
    head_weights = OrderedDict()
    for key, tensor in raw_weights.items():
        head_weights[key.replace("layers.", "")] = tensor
    predictor.layers.load_state_dict(head_weights)

    predictor.eval()
    return predictor
117
+
118
+
119
def convert_v2_relu_from_openai_clip(
    predictor_head_name: str,
    openai_model_name: str = "openai/clip-vit-large-patch14",
) -> AestheticsPredictorV2ReLU:
    """Build a ReLU-head predictor from a CLIP checkpoint plus a pretrained head.

    Args:
        predictor_head_name: Key into ``URLS_RELU`` selecting which head
            checkpoint to download.
        openai_model_name: Model identifier handed to ``from_pretrained`` to
            load the CLIP weights.

    Returns:
        The assembled model, switched to evaluation mode.

    Raises:
        KeyError: If ``predictor_head_name`` is not a key of ``URLS_RELU``.
        AssertionError: If the downloaded object is not an ``OrderedDict``.
    """
    predictor = AestheticsPredictorV2ReLU.from_pretrained(openai_model_name)

    checkpoint_url = URLS_RELU[predictor_head_name]
    raw_weights = torch.hub.load_state_dict_from_url(
        checkpoint_url, map_location="cpu"
    )
    assert isinstance(raw_weights, OrderedDict)

    # The checkpoint keys carry a `layers.` component; drop it so the keys
    # line up with the Sequential head that the weights are loaded into.
    head_weights = OrderedDict()
    for key, tensor in raw_weights.items():
        head_weights[key.replace("layers.", "")] = tensor
    predictor.layers.load_state_dict(head_weights)

    predictor.eval()
    return predictor