cct_14_7x2_384 / configuration_cct.py
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" CCT model configuration"""
from transformers import PretrainedConfig
CCT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"rishabbala/cct_14_7x2_384": "https://huggingface.co./rishabbala/cct_14_7x2_384/blob/main/config.json",
}
class CctConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`CctModel`]. It is used to instantiate a CCT model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the CCT
[rishabbala/cct](https://huggingface.co./rishabbala/cct) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
        img_size (`int`, *optional*, defaults to 384):
            The size (resolution) of the input image.
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`List[int]`, *optional*, defaults to `[64, 384]`):
            The number of output channels of each convolutional layer in patch embedding.
conv_kernel_size (`int`, *optional*, defaults to 7):
The kernel size of convolutional layers in patch embedding.
conv_stride (`int`, *optional*, defaults to 2):
The stride size of convolutional layers in patch embedding.
conv_padding (`int`, *optional*, defaults to 3):
The padding size of convolutional layers in patch embedding.
        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether the convolutional layers in patch embedding use a bias term.
        pool_kernel_size (`int`, *optional*, defaults to 3):
            The kernel size of max pool layers in patch embedding.
        pool_stride (`int`, *optional*, defaults to 2):
            The stride size of max pool layers in patch embedding.
        pool_padding (`int`, *optional*, defaults to 1):
            The padding size of max pool layers in patch embedding.
        num_conv_layers (`int`, *optional*, defaults to 2):
            Number of convolutional embedding layers; together with the convolution and pooling strides, these
            determine the token sequence length (see the sketch after this argument list).
embed_dim (`int`, *optional*, defaults to 384):
Dimension of each of the encoder blocks.
num_heads (`int`, *optional*, defaults to 6):
Number of attention heads for each attention layer in each block of the Transformer encoder.
mlp_ratio (`float`, *optional*, defaults to 3.0):
Ratio of the size of the hidden layer compared to the size of the input layer of the FFNs in the encoder
blocks.
attention_drop_rate (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
drop_rate (`float`, *optional*, defaults to 0.0):
The dropout ratio following linear projections.
drop_path_rate (`float`, *optional*, defaults to `0.0`):
The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
        num_transformer_layers (`int`, *optional*, defaults to 14):
            Number of transformer self-attention layers.
        pos_emb_type (`str`, *optional*, defaults to `"learnable"`):
            Type of positional embedding used. Can be either `"learnable"` or `"sinusoidal"`.
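
    The convolutional patch embedding determines the sequence length seen by the Transformer encoder. A rough
    sketch of that relationship under the default settings (a simplification that assumes the usual conv/pool
    output-size arithmetic with "same"-style padding; the exact values are computed by the model code, not by
    this configuration class):

    ```python
    >>> # Each of the two embedding stages applies a stride-2 convolution followed by a stride-2 max pool,
    >>> # so a 384x384 input is downsampled by (2 * 2) ** 2 = 16 along each spatial dimension.
    >>> img_size, conv_stride, pool_stride, num_conv_layers = 384, 2, 2, 2
    >>> grid = img_size // (conv_stride * pool_stride) ** num_conv_layers
    >>> grid, grid * grid  # spatial grid and resulting token sequence length
    (24, 576)
    ```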
Example:
```python
>>> from transformers import CctConfig, CctModel
    >>> # Initializing a CCT rishabbala/cct_14_7x2_384 style configuration
    >>> configuration = CctConfig()
    >>> # Initializing a model (with random weights) from the rishabbala/cct_14_7x2_384 style configuration
>>> model = CctModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""

    model_type = "cct"

def __init__(
self,
img_size=384,
in_channels=3,
out_channels=[64, 384],
conv_kernel_size=7,
conv_stride=2,
conv_padding=3,
conv_bias=False,
pool_kernel_size=3,
pool_stride=2,
pool_padding=1,
num_conv_layers=2,
embed_dim=384,
num_heads=6,
        mlp_ratio=3.0,
attention_drop_rate=0.1,
drop_rate=0.0,
drop_path_rate=0.0,
num_transformer_layers=14,
pos_emb_type="learnable",
**kwargs,
):
super().__init__(**kwargs)
self.img_size = img_size
self.in_channels = in_channels
self.out_channels = out_channels
self.num_channels = out_channels[-1]
self.conv_kernel_size = conv_kernel_size
self.conv_stride = conv_stride
self.conv_padding = conv_padding
self.conv_bias = conv_bias
self.pool_kernel_size = pool_kernel_size
self.pool_stride = pool_stride
self.pool_padding = pool_padding
self.num_conv_layers = num_conv_layers
self.embed_dim = embed_dim
self.num_heads = num_heads
self.mlp_ratio = mlp_ratio
self.attention_drop_rate = attention_drop_rate
self.drop_rate = drop_rate
self.drop_path_rate = drop_path_rate
self.num_transformer_layers = num_transformer_layers
self.pos_emb_type = pos_emb_type
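

# Minimal usage sketch (illustrative only; the example values below are hypothetical): build a configuration
# with non-default settings and round-trip it through the generic `PretrainedConfig` serialization helpers.
if __name__ == "__main__":
    config = CctConfig(img_size=224, drop_path_rate=0.1)

    # `to_dict` / `from_dict` are inherited from `PretrainedConfig`, so the round trip should be lossless.
    restored = CctConfig.from_dict(config.to_dict())
    assert restored.img_size == 224 and restored.num_channels == 384

    # Loading the published configuration from the Hub would look roughly like the line below
    # (requires network access and trusting the custom code in this repository):
    #   transformers.AutoConfig.from_pretrained("rishabbala/cct_14_7x2_384", trust_remote_code=True)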