import torch
import os
import logging
logger = logging.getLogger(__name__)
def partial_freeze_weights(model, original_vocabsize, total_vocabsize):
    if int(os.environ.get("RANK", "0")) == 0:
        logger.info("Only training partial embedding layer")

    # Rows [original_vocabsize, total_vocabsize) of the output head are trainable.
    trainable_range = (original_vocabsize, total_vocabsize)

    # Hook that zeroes the gradient for rows outside the trainable range during
    # the backward pass, so only the newly added rows are updated.
    def zero_out_gradient(grad):
        grad[:trainable_range[0], :] = 0
        grad[trainable_range[1]:, :] = 0
        return grad

    # Freeze all layers first.
    for param in model.parameters():
        param.requires_grad = False

    # Assuming the output layer is `lm_head` (and has no bias, so every
    # parameter here is the 2-D weight matrix).
    for param in model.llm.lm_head.parameters():
        # Compute the standard deviation for He initialization.
        std_dev = (2.0 / param.size(1)) ** 0.5
        # Re-initialize the newly added rows with He initialization, matching
        # the parameter's device and dtype.
        param[original_vocabsize:total_vocabsize] = (
            torch.randn(
                (trainable_range[1] - trainable_range[0], param.size(1)),
                device=param.device,
                dtype=param.dtype,
            )
            * std_dev
        )
        param.requires_grad = True
        # Register the hook on the weight tensor so frozen rows keep zero gradient.
        param.register_hook(zero_out_gradient)
def train_embedding_layer_only(model):
    if int(os.environ.get("RANK", "0")) == 0:
        logger.info("Only training embedding layer")

    # Freeze everything, then unfreeze only the output head (which is often
    # weight-tied to the input embedding).
    for param in model.parameters():
        param.requires_grad = False
    for param in model.llm.lm_head.parameters():
        param.requires_grad = True
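

# --- Minimal usage sketch ----------------------------------------------------
# A self-contained check of the gradient-masking idea above. `_ToyLLM` and
# `_ToyWrapper` are hypothetical stand-ins for whatever wrapper class actually
# exposes `model.llm.lm_head` in this project, and the sizes are made up.
if __name__ == "__main__":
    import torch.nn as nn

    class _ToyLLM(nn.Module):
        def __init__(self, hidden=16, vocab=12):
            super().__init__()
            self.lm_head = nn.Linear(hidden, vocab, bias=False)

    class _ToyWrapper(nn.Module):
        def __init__(self):
            super().__init__()
            self.llm = _ToyLLM()

    model = _ToyWrapper()
    # Pretend the original vocabulary had 8 entries and 4 new tokens were added.
    partial_freeze_weights(model, original_vocabsize=8, total_vocabsize=12)

    # One dummy forward/backward pass: only rows 8..11 should receive gradient.
    model.llm.lm_head(torch.randn(4, 16)).sum().backward()
    grad = model.llm.lm_head.weight.grad
    assert torch.all(grad[:8] == 0), "frozen rows should have zero gradient"
    print("new-row gradient norm:", grad[8:].norm().item())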