import logging
import os

import torch

logger = logging.getLogger(__name__)

def partial_freeze_weights(model, original_vocabsize, total_vocabsize):
    """Freeze the model except the output-head rows for newly added tokens.

    Rows ``[original_vocabsize, total_vocabsize)`` of ``model.llm.lm_head`` are
    re-initialized and kept trainable; gradients for all other rows are zeroed.
    """
    if int(os.environ.get("RANK", "0")) == 0:
        logger.info("Only training partial embedding layer")

    # Half-open row range [original_vocabsize, total_vocabsize) covering the newly added tokens
    trainable_range = (original_vocabsize, total_vocabsize)

    # Hook that zeroes the gradient for rows outside the trainable range during
    # the backward pass, so only the new-token rows are updated
    def zero_out_gradient(grad):
        grad[:trainable_range[0], :] = 0
        grad[trainable_range[1]:, :] = 0  # half-open range: rows from total_vocabsize onward stay frozen
        return grad

    # Freeze all layers first
    for param in model.parameters():
        param.requires_grad = False

    # Assuming the output layer is `model.llm.lm_head`, a bias-free Linear whose
    # weight has shape (vocab_size, hidden_size)
    for param in model.llm.lm_head.parameters():
        # Standard deviation for He initialization: sqrt(2 / fan_in)
        std_dev = (2.0 / param.size(1)) ** 0.5

        # Re-initialize the rows for the newly added tokens with He initialization
        param[original_vocabsize:total_vocabsize] = (
            torch.randn((trainable_range[1] - trainable_range[0], param.size(1))) * std_dev
        )
        param.requires_grad = True

        # Register the hook on the weight tensor so frozen rows receive zero gradient
        param.register_hook(zero_out_gradient)

def train_embedding_layer_only(model):
    """Freeze the model and train only the output head (`model.llm.lm_head`)."""
    if int(os.environ.get("RANK", "0")) == 0:
        logger.info("Only training embedding layer")

    # Freeze all layers first
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the full output projection
    for param in model.llm.lm_head.parameters():
        param.requires_grad = True
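
# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original module). It
# builds a hypothetical wrapper exposing `model.llm.lm_head`, mirroring the
# attribute path the helpers above expect, and checks that after
# `partial_freeze_weights` only the rows for newly added tokens get gradients.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import torch.nn as nn

    class _DummyLLM(nn.Module):
        def __init__(self, hidden_size, vocab_size):
            super().__init__()
            self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

    class _DummyModel(nn.Module):
        def __init__(self, hidden_size, vocab_size):
            super().__init__()
            self.llm = _DummyLLM(hidden_size, vocab_size)

    original_vocab, total_vocab, hidden = 100, 110, 32
    model = _DummyModel(hidden, total_vocab)
    partial_freeze_weights(model, original_vocab, total_vocab)

    # Dummy forward/backward pass through the output head only
    logits = model.llm.lm_head(torch.randn(4, hidden))
    logits.sum().backward()

    grad = model.llm.lm_head.weight.grad
    assert torch.all(grad[:original_vocab] == 0), "old-token rows should stay frozen"
    assert torch.any(grad[original_vocab:total_vocab] != 0), "new-token rows should train"

    # `train_embedding_layer_only` instead unfreezes the entire output head
    train_embedding_layer_only(model)
    assert model.llm.lm_head.weight.requires_grad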