---
datasets:
- ddrg/named_math_formulas
- ddrg/math_formula_retrieval
- ddrg/math_formulas
- ddrg/math_text
---
|
Pretrained model based on [microsoft/deberta-v3-base](https://huggingface.co./microsoft/deberta-v3-base) with further mathematical pre-training.

Compared to deberta-v3-base, 300 additional mathematical LaTeX tokens were added to the vocabulary before the mathematical pre-training. Because this additional pre-training used NSP-like tasks, a pooling layer (`bias` and `weight`) has been added to the model. If you don't need this pooling layer, just use the standard transformers DeBERTa model. If you want to use the additional pooling layer in the same way as BERT's pooler, a wrapper class like the following may be used:
```python
from typing import Any, Mapping

import torch
from torch import nn
from transformers import AutoConfig, AutoTokenizer, DebertaV2Model, DebertaV2Tokenizer


class DebertaV2ModelWithPoolingLayer(nn.Module):

    def __init__(self, pretrained_model_name):
        super().__init__()

        # Load the DeBERTa backbone and tokenizer
        self.deberta = DebertaV2Model.from_pretrained(pretrained_model_name)
        self.tokenizer = DebertaV2Tokenizer.from_pretrained(pretrained_model_name)

        # Add a BERT-style pooling layer (Linear + tanh activation) for the CLS token
        self.pooling_layer = nn.Sequential(
            nn.Linear(self.deberta.config.hidden_size, self.deberta.config.hidden_size),
            nn.Tanh(),
        )

        self.config = self.deberta.config
        self.embeddings = self.deberta.embeddings

    def forward(self, input_ids, attention_mask, *args, **kwargs):
        # Forward pass through the DeBERTa model
        outputs = self.deberta(input_ids, *args, attention_mask=attention_mask, **kwargs)

        # Extract the hidden states from the output
        hidden_states = outputs.last_hidden_state

        # Get the CLS token representation (first token)
        cls_token = hidden_states[:, 0, :]

        # Apply the pooling layer to the CLS token representation
        pooled_output = self.pooling_layer(cls_token)

        # Expose the pooled output under the same key BERT-style models use
        outputs["pooler_output"] = pooled_output

        return outputs

    def save_pretrained(self, path):
        # Save the model's state_dict (backbone plus pooler), configuration, and tokenizer
        state_dict = self.deberta.state_dict()
        state_dict.update(self.pooling_layer[0].state_dict())

        torch.save(state_dict, f"{path}/pytorch_model.bin")
        self.deberta.config.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
        # The pooler parameters are stored under the bare keys `bias` and `weight`;
        # all remaining keys belong to the DeBERTa backbone.
        pooler_keys = ["bias", "weight"]
        deberta_state_dict = {k: v for k, v in state_dict.items() if k not in pooler_keys}
        pooler_state_dict = {k: v for k, v in state_dict.items() if k in pooler_keys}
        self.deberta.load_state_dict(deberta_state_dict, strict=strict)
        self.pooling_layer[0].load_state_dict(pooler_state_dict)

    @classmethod
    def from_pretrained(cls, name):
        # Initialize the instance
        instance = cls(name)

        try:
            # Load the full state_dict, including the pooler's `bias` and `weight`
            instance.load_state_dict(torch.load(f"{name}/pytorch_model.bin"))
        except FileNotFoundError:
            print("Could not find DeBERTa pooling layer. Initializing new values.")

        # Load the configuration and tokenizer
        instance.deberta.config = AutoConfig.from_pretrained(name)
        instance.tokenizer = AutoTokenizer.from_pretrained(name)

        return instance
```
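
A minimal usage sketch of the wrapper is shown below. The local path `./math-deberta` is a placeholder, not the actual model name; point it at wherever this checkpoint is stored. The pooled CLS representation is then available under `pooler_output`, just as with BERT:

```python
# Minimal usage sketch; "./math-deberta" is a placeholder path for this checkpoint.
import torch

model = DebertaV2ModelWithPoolingLayer.from_pretrained("./math-deberta")
model.eval()

inputs = model.tokenizer(r"Pythagorean theorem: $a^2 + b^2 = c^2$", return_tensors="pt")

with torch.no_grad():
    outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])

print(outputs.last_hidden_state.shape)  # token-level states: (1, seq_len, hidden_size)
print(outputs["pooler_output"].shape)   # pooled CLS vector: (1, hidden_size)
```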