""" CCT model configuration""" |
|
|
|
from transformers import PretrainedConfig |
|
|
|
CCT_PRETRAINED_CONFIG_ARCHIVE_MAP = { |
|
"rishabbala/cct_14_7x2_384": "https://huggingface.co./rishabbala/cct_14_7x2_384/blob/main/config.json", |
|
} |
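
# Hedged usage note (illustrative, not part of the original module): the
# configuration matching the checkpoint listed in the archive map above can be
# fetched with the standard `PretrainedConfig` loading API, e.g.:
#
#     config = CctConfig.from_pretrained("rishabbala/cct_14_7x2_384")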
|
|
class CctConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CctModel`]. It is used to instantiate a CCT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the CCT
    [rishabbala/cct_14_7x2_384](https://huggingface.co./rishabbala/cct_14_7x2_384) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        img_size (`int`, *optional*, defaults to 384):
            The size (resolution) of the input images.
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`List[int]`, *optional*, defaults to `[64, 384]`):
            The number of output channels of each convolutional layer in the patch embedding.
        conv_kernel_size (`int`, *optional*, defaults to 7):
            The kernel size of the convolutional layers in the patch embedding.
        conv_stride (`int`, *optional*, defaults to 2):
            The stride of the convolutional layers in the patch embedding.
        conv_padding (`int`, *optional*, defaults to 3):
            The padding of the convolutional layers in the patch embedding.
        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether the convolutional layers use a bias term.
        pool_kernel_size (`int`, *optional*, defaults to 3):
            The kernel size of the max pooling layers in the patch embedding.
        pool_stride (`int`, *optional*, defaults to 2):
            The stride of the max pooling layers in the patch embedding.
        pool_padding (`int`, *optional*, defaults to 1):
            The padding of the max pooling layers in the patch embedding.
        num_conv_layers (`int`, *optional*, defaults to 2):
            The number of convolutional layers in the patch embedding.
        embed_dim (`int`, *optional*, defaults to 384):
            Dimensionality of the Transformer encoder blocks.
        num_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in each block of the Transformer encoder.
        mlp_ratio (`float`, *optional*, defaults to 3.0):
            Ratio of the hidden layer size to the input layer size in the feed-forward networks of the encoder
            blocks.
        attention_drop_rate (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        drop_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio following linear projections.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
        num_transformer_layers (`int`, *optional*, defaults to 14):
            Number of self-attention layers in the Transformer encoder.
        pos_emb_type (`str`, *optional*, defaults to `"learnable"`):
            The type of positional embedding to use. Can be either `"learnable"` or `"sinusoidal"`.

    Example:

    ```python
    >>> from transformers import CctConfig, CctModel

    >>> # Initializing a CCT rishabbala/cct_14_7x2_384 style configuration
    >>> configuration = CctConfig()

    >>> # Initializing a model (with random weights) from the rishabbala/cct_14_7x2_384 style configuration
    >>> model = CctModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
|
    model_type = "cct"

    def __init__(
        self,
        img_size=384,
        in_channels=3,
        out_channels=[64, 384],
        conv_kernel_size=7,
        conv_stride=2,
        conv_padding=3,
        conv_bias=False,
        pool_kernel_size=3,
        pool_stride=2,
        pool_padding=1,
        num_conv_layers=2,
        embed_dim=384,
        num_heads=6,
        mlp_ratio=3.0,
        attention_drop_rate=0.1,
        drop_rate=0.0,
        drop_path_rate=0.0,
        num_transformer_layers=14,
        pos_emb_type="learnable",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.img_size = img_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        # Channel dimension produced by the last convolutional embedding layer,
        # exposed as a convenience attribute.
        self.num_channels = out_channels[-1]
        self.conv_kernel_size = conv_kernel_size
        self.conv_stride = conv_stride
        self.conv_padding = conv_padding
        self.conv_bias = conv_bias
        self.pool_kernel_size = pool_kernel_size
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.num_conv_layers = num_conv_layers
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.attention_drop_rate = attention_drop_rate
        self.drop_rate = drop_rate
        self.drop_path_rate = drop_path_rate
        self.num_transformer_layers = num_transformer_layers
        self.pos_emb_type = pos_emb_type
|
|
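# ---------------------------------------------------------------------------
# A minimal usage sketch (illustrative, not part of the original module). It
# shows how the config could be customized and round-tripped through the
# standard `PretrainedConfig` serialization API; the argument values below are
# assumptions for demonstration, not recommended settings. It also applies
# standard convolution output-size arithmetic to illustrate how the patch
# embedding parameters determine the Transformer sequence length.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    config = CctConfig(img_size=224, num_transformer_layers=7, num_heads=4)
    config.save_pretrained("./cct-custom")  # writes ./cct-custom/config.json
    reloaded = CctConfig.from_pretrained("./cct-custom")
    assert reloaded.num_transformer_layers == 7

    # Each embedding layer applies a convolution followed by a max pool; with
    # the defaults, each step roughly halves the spatial resolution.
    size = config.img_size
    for _ in range(config.num_conv_layers):
        size = (size + 2 * config.conv_padding - config.conv_kernel_size) // config.conv_stride + 1
        size = (size + 2 * config.pool_padding - config.pool_kernel_size) // config.pool_stride + 1
    print(f"sequence length: {size * size} tokens of dim {config.embed_dim}")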