suayptalha committed
Commit e55adf4 · verified · 1 Parent(s): a32544d

Update modeling_minGRULM.py

Files changed (1)
  1. modeling_minGRULM.py +3 -21
modeling_minGRULM.py CHANGED
@@ -8,7 +8,6 @@ from .configuration_minGRULM import MinGRULMConfig
 from minGRU_pytorch.minGRULM import minGRULM
 
 
-# Wrapper class for device compatibility
 class MinGRULMWrapped(nn.Module):
     def __init__(self, min_gru_model):
         super().__init__()
@@ -16,13 +15,11 @@ class MinGRULMWrapped(nn.Module):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     def forward(self, *args, **kwargs):
-        # Move input tensors to the correct device
         args = [arg.to(self.device) if isinstance(arg, torch.Tensor) else arg for arg in args]
         kwargs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}
         return self.min_gru_model(*args, **kwargs)
 
     def to(self, device):
-        # Update device information
         self.device = device
         self.min_gru_model.to(device)
         return self
@@ -47,11 +44,10 @@ class MinGRULMPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
-        # NaN check: if any parameter contains NaN, fix it using `torch.nan_to_num`
         for name, param in module.named_parameters():
             if torch.isnan(param).any():
                 print(f"NaN detected in parameter {name}. Replacing with a safe number.")
-                param.data = torch.nan_to_num(param.data, nan=1e-6)  # replace NaNs with 1e-6
+                param.data = torch.nan_to_num(param.data, nan=1e-6)
 
 
 class MinGRULMForCausalLM(PreTrainedModel):
@@ -61,7 +57,6 @@ class MinGRULMForCausalLM(PreTrainedModel):
     def __init__(self, config: MinGRULMConfig):
         super().__init__(config)
 
-        # Load model from minGRULM library and wrap it
         raw_min_gru = minGRULM(
             num_tokens=config.vocab_size,
             dim=config.d_model,
@@ -72,18 +67,15 @@ class MinGRULMForCausalLM(PreTrainedModel):
         )
         self.model = MinGRULMWrapped(raw_min_gru)
 
-        # Language modeling head
         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
 
         self.post_init()
 
     def post_init(self):
-        # Ensure tied weights and any additional setup
        super().post_init()
        self.tie_weights()
 
     def tie_weights(self):
-        # Tie lm_head weights to the embedding layer weights
         self.lm_head.weight = self.model.min_gru_model.token_emb.weight
 
     def get_input_embeddings(self):
@@ -96,17 +88,14 @@ class MinGRULMForCausalLM(PreTrainedModel):
         return self.lm_head
 
     def prepare_inputs_for_generation(self, input_ids: torch.LongTensor, **kwargs):
-        # Ensure that inputs for generation are properly handled
         return {"input_ids": input_ids, "attention_mask": kwargs.get("attention_mask", None)}
 
     def forward(self, input_ids: torch.LongTensor, labels: Optional[torch.LongTensor] = None, return_dict: Optional[bool] = True, **kwargs):
-        # Forward pass through the wrapped model
         logits = self.model(input_ids)
 
-        # NaN check: if the logits contain NaN, fix them using `torch.nan_to_num`
         if torch.isnan(logits).any():
             print("NaN detected in logits! Replacing with a safe number.")
-            logits = torch.nan_to_num(logits, nan=1e-6)  # replace NaNs with 1e-6
+            logits = torch.nan_to_num(logits, nan=1e-6)
 
         loss = None
         if labels is not None:
@@ -118,10 +107,9 @@ class MinGRULMForCausalLM(PreTrainedModel):
                 shift_labels.view(-1),
             )
 
-            # NaN check: if the loss contains NaN, fix it using `torch.nan_to_num`
             if torch.isnan(loss).any():
                 print("NaN detected in loss! Replacing with a safe number.")
-                loss = torch.nan_to_num(loss, nan=1e-6)  # replace NaNs with 1e-6
+                loss = torch.nan_to_num(loss, nan=1e-6)
 
         if not return_dict:
             return (loss, logits) if loss is not None else (logits,)
@@ -148,15 +136,11 @@ class MinGRULMForCausalLM(PreTrainedModel):
             save_directory (str): Directory to save the model.
             safe_serialization (bool, optional): Whether to use safe serialization. Defaults to True.
         """
-        # Create the save directory if it doesn't exist
         os.makedirs(save_directory, exist_ok=True)
 
-        # Check if safe_serialization is enabled
         if safe_serialization:
             print("Saving with safe serialization.")
 
-            # Save the model's state_dict (model weights)
-            #state_dict = self.state_dict()
             state_dict = {}
 
             for name, param in self.model.min_gru_model.named_parameters():
@@ -168,9 +152,7 @@ class MinGRULMForCausalLM(PreTrainedModel):
             state_dict['config'] = self.config.__dict__
             torch.save(state_dict, os.path.join(save_directory, "pytorch_model.bin"))
 
-            # Save the configuration
             self.config.save_pretrained(save_directory)
         else:
             print("Saving without safe serialization.")
-            # If not safe_serialization, use the default save mechanism from the base class
             super().save_pretrained(save_directory)
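
The commit strips comments but keeps the NaN guards intact. Below is a minimal standalone sketch of that guard pattern; the tensor values are made up for the demo, and only the `torch.isnan` check and `torch.nan_to_num(..., nan=1e-6)` call mirror the diff.

import torch

# Illustrative logits containing a NaN (hypothetical values).
logits = torch.tensor([[0.5, float("nan"), -1.2]])

if torch.isnan(logits).any():
    print("NaN detected in logits! Replacing with a safe number.")
    logits = torch.nan_to_num(logits, nan=1e-6)  # NaN -> 1e-6, as in the diff

print(logits)  # the NaN entry is now 1e-6; finite values are unchanged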
 
8
  from minGRU_pytorch.minGRULM import minGRULM
9
 
10
 
 
11
  class MinGRULMWrapped(nn.Module):
12
  def __init__(self, min_gru_model):
13
  super().__init__()
 
15
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
 
17
  def forward(self, *args, **kwargs):
 
18
  args = [arg.to(self.device) if isinstance(arg, torch.Tensor) else arg for arg in args]
19
  kwargs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}
20
  return self.min_gru_model(*args, **kwargs)
21
 
22
  def to(self, device):
 
23
  self.device = device
24
  self.min_gru_model.to(device)
25
  return self
 
44
  module.bias.data.zero_()
45
  module.weight.data.fill_(1.0)
46
 
 
47
  for name, param in module.named_parameters():
48
  if torch.isnan(param).any():
49
  print(f"NaN detected in parameter {name}. Replacing with a safe number.")
50
+ param.data = torch.nan_to_num(param.data, nan=1e-6)
51
 
52
 
53
  class MinGRULMForCausalLM(PreTrainedModel):
 
57
  def __init__(self, config: MinGRULMConfig):
58
  super().__init__(config)
59
 
 
60
  raw_min_gru = minGRULM(
61
  num_tokens=config.vocab_size,
62
  dim=config.d_model,
 
67
  )
68
  self.model = MinGRULMWrapped(raw_min_gru)
69
 
 
70
  self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
71
 
72
  self.post_init()
73
 
74
  def post_init(self):
 
75
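For context, the `tie_weights` method shown above shares a single weight matrix between the token embedding and the LM head. A minimal sketch of that pattern, with hypothetical layer sizes (the real model ties `lm_head.weight` to `token_emb.weight`):

import torch.nn as nn

vocab_size, d_model = 100, 16  # hypothetical sizes for the demo

token_emb = nn.Embedding(vocab_size, d_model)          # weight: (vocab_size, d_model)
lm_head = nn.Linear(d_model, vocab_size, bias=False)   # weight: (vocab_size, d_model)

# Tying: both modules now hold the same Parameter object,
# so any update to one is immediately seen by the other.
lm_head.weight = token_emb.weight
assert lm_head.weight is token_emb.weight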
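The custom `save_pretrained` path builds a plain state dict from `named_parameters` and serializes it with `torch.save`. A rough standalone equivalent, using a toy module in place of the wrapped minGRULM and a hypothetical output path:

import os
import torch
import torch.nn as nn

model = nn.Linear(8, 8)          # toy stand-in for self.model.min_gru_model
save_directory = "./checkpoint"  # hypothetical path

os.makedirs(save_directory, exist_ok=True)

# Collect parameters by name and move them to CPU before saving,
# mirroring the manual serialization path in save_pretrained.
state_dict = {name: param.detach().cpu() for name, param in model.named_parameters()}
torch.save(state_dict, os.path.join(save_directory, "pytorch_model.bin"))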