# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" CCT model configuration"""

from transformers import PretrainedConfig


CCT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "rishabbala/cct_14_7x2_384": "https://huggingface.co./rishabbala/cct_14_7x2_384/blob/main/config.json",
}


class CctConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CctModel`]. It is used to instantiate a CCT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the CCT
    [rishabbala/cct](https://huggingface.co./rishabbala/cct) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        img_size (`int`, *optional*, defaults to 384):
            The size of the input image.
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`List[int]`, *optional*, defaults to `[64, 384]`):
            The number of output channels of each convolutional layer in the patch embedding.
        conv_kernel_size (`int`, *optional*, defaults to 7):
            The kernel size of the convolutional layers in the patch embedding.
        conv_stride (`int`, *optional*, defaults to 2):
            The stride of the convolutional layers in the patch embedding.
        conv_padding (`int`, *optional*, defaults to 3):
            The padding of the convolutional layers in the patch embedding.
        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether the convolutional layers use a bias term.
        pool_kernel_size (`int`, *optional*, defaults to 3):
            The kernel size of the max pooling layers in the patch embedding.
        pool_stride (`int`, *optional*, defaults to 2):
            The stride of the max pooling layers in the patch embedding.
        pool_padding (`int`, *optional*, defaults to 1):
            The padding of the max pooling layers in the patch embedding.
        num_conv_layers (`int`, *optional*, defaults to 2):
            The number of convolutional embedding layers.
        embed_dim (`int`, *optional*, defaults to 384):
            Dimensionality of the encoder blocks.
        num_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in each block of the Transformer encoder.
        mlp_ratio (`float`, *optional*, defaults to 3.0):
            Ratio of the size of the hidden layer compared to the size of the input layer of the FFNs in the encoder
            blocks.
        attention_drop_rate (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        drop_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio following linear projections.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
        num_transformer_layers (`int`, *optional*, defaults to 14):
            The number of Transformer self-attention layers.
        pos_emb_type (`str`, *optional*, defaults to `"learnable"`):
            The type of positional embedding used. The alternative is `"sinusoidal"`.

    Example:

    ```python
    >>> from transformers import CctConfig, CctModel

    >>> # Initializing a CCT rishabbala/cct_14_7x2_384 style configuration
    >>> configuration = CctConfig()

    >>> # Initializing a model (with random weights) from the rishabbala/cct_14_7x2_384 style configuration
    >>> model = CctModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "cct"

    def __init__(
        self,
        img_size=384,
        in_channels=3,
        out_channels=[64, 384],
        conv_kernel_size=7,
        conv_stride=2,
        conv_padding=3,
        conv_bias=False,
        pool_kernel_size=3,
        pool_stride=2,
        pool_padding=1,
        num_conv_layers=2,
        embed_dim=384,
        num_heads=6,
        mlp_ratio=3.0,
        attention_drop_rate=0.1,
        drop_rate=0.0,
        drop_path_rate=0.0,
        num_transformer_layers=14,
        pos_emb_type="learnable",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.img_size = img_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_channels = out_channels[-1]
        self.conv_kernel_size = conv_kernel_size
        self.conv_stride = conv_stride
        self.conv_padding = conv_padding
        self.conv_bias = conv_bias
        self.pool_kernel_size = pool_kernel_size
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.num_conv_layers = num_conv_layers
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.attention_drop_rate = attention_drop_rate
        self.drop_rate = drop_rate
        self.drop_path_rate = drop_path_rate
        self.num_transformer_layers = num_transformer_layers
        self.pos_emb_type = pos_emb_type
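

# The following demo is a hedged sketch, not part of the configuration API: it illustrates how the
# patch-embedding hyperparameters above determine the token sequence length fed to the Transformer
# encoder. It assumes the standard conv/max-pool output-size formula
# out = floor((in + 2 * padding - kernel) / stride) + 1, applied once per convolutional embedding
# layer (conv followed by max pool); the actual `CctModel` tokenizer may differ in its details.
if __name__ == "__main__":
    config = CctConfig()

    def _down(size: int, kernel: int, stride: int, padding: int) -> int:
        # Standard output-size arithmetic for a convolution or pooling layer.
        return (size + 2 * padding - kernel) // stride + 1

    spatial = config.img_size
    for _ in range(config.num_conv_layers):
        spatial = _down(spatial, config.conv_kernel_size, config.conv_stride, config.conv_padding)
        spatial = _down(spatial, config.pool_kernel_size, config.pool_stride, config.pool_padding)

    # With the defaults (384x384 input, two conv+pool stages, each downsampling by a factor of 4),
    # this prints a 24x24 grid, i.e. 576 tokens of dimension `embed_dim` = 384.
    print(f"token grid: {spatial}x{spatial}, sequence length: {spatial * spatial}")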