File size: 5,665 Bytes

6cd1d54

import gguf
import numpy as np
from sklearn.decomposition import PCA
import tqdm


def load_hidden_states(path):
    """Load hidden states produced by the llama.cpp ./repeng tool.

    Reads the GGUF file at ``path`` and collects every tensor whose name
    starts with ``l_out-``; the integer following that prefix is used as the
    layer index.

    Args:
        path: Location of the GGUF file to read.

    Returns:
        A dict mapping layer index (int) to that layer's tensor data,
        reshaped to ``(t.shape[1], t.shape[0])``.

    Raises:
        AssertionError: If two tensors resolve to the same layer index.
        ValueError: If the text after ``l_out-`` is not an integer.
    """
    gguf_file = gguf.GGUFReader(path)

    prefix = 'l_out-'
    hidden_states = {}
    for t in gguf_file.tensors:
        # Tensors without the prefix are not hidden-state outputs; skip them.
        if not t.name.startswith(prefix):
            continue
        layer = int(t.name[len(prefix):])
        assert layer not in hidden_states, 'duplicate hidden states for layer %d' % layer
        # The two reported dimensions are swapped for the NumPy view --
        # presumably because GGUF lists dimensions in the reverse of NumPy's
        # order (TODO confirm against the gguf-py reader).
        hidden_states[layer] = t.data.reshape((t.shape[1], t.shape[0]))

    return hidden_states

def project_onto_direction(H, direction):
    """Return the scalar projection of each row of H onto ``direction``.

    H is a matrix of shape (n, d) and ``direction`` a vector of shape (d,);
    the result is ``H @ direction`` divided by the Euclidean norm of
    ``direction``.

    Raises:
        AssertionError: If the norm of ``direction`` is infinite.
    """
    norm = np.linalg.norm(direction)
    # An infinite norm would collapse every projection; refuse it up front.
    assert not np.isinf(norm)
    dot_products = np.matmul(H, direction)
    return dot_products / norm

def read_representations(
    layer_hiddens: dict[int, np.ndarray],
) -> dict[int, np.ndarray]:
    """
    Compute one direction vector per layer from a contrast dataset.

    Each array in ``layer_hiddens`` is read as interleaved rows in the order
    [positive, negative, positive, negative, ...]. For every layer the
    per-pair differences are mean-centred, a one-component PCA is fitted to
    them, and the resulting component is sign-flipped when the positive rows
    project lower than their negative partners more often than higher.

    Returns a dict mapping layer index to a float32 direction vector.
    """

    ordered_layers = sorted(layer_hiddens.keys())
    first_layer_states = next(iter(layer_hiddens.values()))
    num_inputs = first_layer_states.shape[0] // 2
    print('%d inputs' % num_inputs)

    # Differences between each (positive, negative) pair, for every layer.
    pair_deltas = {
        layer: layer_hiddens[layer][::2] - layer_hiddens[layer][1::2]
        for layer in ordered_layers
    }

    # One PCA direction per layer.
    directions: dict[int, np.ndarray] = {}
    for layer in tqdm.tqdm(ordered_layers):
        states = layer_hiddens[layer]
        assert states.shape[0] == num_inputs * 2

        # Centre the pair differences and fit a single principal component.
        deltas = pair_deltas[layer]
        centered = np.vstack(deltas - deltas.mean(axis=0, keepdims=True))
        fitted = PCA(n_components=1, whiten=False).fit(centered)
        # shape (n_features,)
        direction = fitted.components_.astype(np.float32).squeeze(axis=0)

        # Decide the sign by voting over the pairs: even rows are the
        # positive examples, odd rows their negative partners.
        scores = project_onto_direction(states, direction)
        positive_scores = scores[::2]
        negative_scores = scores[1::2]
        positive_smaller_mean = np.mean(positive_scores < negative_scores)
        positive_larger_mean = np.mean(positive_scores > negative_scores)

        # Flip so that positive examples tend to project higher.
        if positive_smaller_mean > positive_larger_mean:
            direction *= -1

        directions[layer] = direction

    return directions

def export_gguf(directions, path: str):
    """
    Export per-layer direction vectors to a llama.cpp control-vector .gguf file.

    Args:
        directions: Mapping from layer index to the direction array for that
            layer. Each entry except layer 0 is written as a tensor named
            ``direction.<layer>``.
        path: Destination path of the .gguf file.

    The writer is always closed, even if adding a tensor or writing a
    section raises.
    """

    arch = "controlvector"
    writer = gguf.GGUFWriter(path, arch)
    try:
        # NOTE: the "<arch>.model_hint" and "<arch>.layer_count" metadata keys
        # are intentionally not written here (no model type is available in
        # this function).
        for layer, direction in directions.items():
            if layer == 0:
                # For some reason, llama.cpp bails out if it sees a direction.0
                # tensor.
                continue
            writer.add_tensor(f"direction.{layer}", direction)
        writer.write_header_to_file()
        writer.write_kv_data_to_file()
        writer.write_tensors_to_file()
    finally:
        # Release the output file handle regardless of success or failure.
        writer.close()

def test_model(model_name, directions):
    """
    Generate text with and without the control vector applied, for comparison.

    Loads ``model_name`` in float16 via transformers, wraps it in a repeng
    ``ControlModel`` over layers -5 through -17, and prints three greedy
    generations for a fixed prompt: baseline, control strength +1.0, and
    control strength -1.0. The control is reset before returning.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from repeng import ControlVector, ControlModel

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token_id = 0

    base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

    # Prefer CUDA, then Apple MPS, falling back to CPU.
    if torch.cuda.is_available():
        device = "cuda:0"
    elif torch.backends.mps.is_available():
        device = "mps:0"
    else:
        device = "cpu"
    model = ControlModel(base_model.to(device), list(range(-5, -18, -1)))

    control_vector = ControlVector(model.config.model_type, directions)

    user_tag, asst_tag = "[INST]", "[/INST]"

    # the question to ask the modified model
    # don't forget the space after {user_tag} and before {asst_tag}!
    prompt = f"{user_tag} What are human beings like? {asst_tag}"

    # tokenizer and generation settings
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id,  # silence warning
        "do_sample": False,  # temperature=0
        "max_new_tokens": 128,
        "repetition_penalty": 1.1,  # reduce control jank
    }

    def _print_generation():
        # Run one generation with whatever control is currently set.
        generated = model.generate(**encoded, **settings)
        print(tokenizer.decode(generated.squeeze()))

    print("==baseline")
    model.reset()
    _print_generation()

    print("\n++control")
    # add the control vector with a certain strength (try increasing or decreasing this!)
    model.set_control(control_vector, 1.0)
    _print_generation()

    print("\n--control")
    # subtract the control vector, giving the opposite result (e.g. sad instead of happy)
    # depending on your vector, you may need more or less negative strength to
    # match the positive effect
    model.set_control(control_vector, -1.0)
    _print_generation()
    model.reset()


# Script entry: runs on import/execution of this module (no __main__ guard).
# Pipeline: load hidden states from 'control_vector_data.gguf', derive one
# direction per layer, and write them to 'control_vector.gguf'.
# NOTE(review): the progress messages below are informal; consider replacing
# them with neutral wording or the logging module.
print("\nLoad hidden shit\n")
hidden_states = load_hidden_states('control_vector_data.gguf')
print("\nHidden shit loaded\n")
directions = read_representations(hidden_states)
print("\nExport this motherfucker\n")
export_gguf(directions, 'control_vector.gguf')

# Model used by the optional smoke test below; the call is disabled by
# default. Uncomment it to compare baseline and controlled generations.
TEST_MODEL_NAME = 'mistralai/Mistral-7B-Instruct-v0.1'
#test_model(TEST_MODEL_NAME, directions)