# Patches for llama.cpp and repeng to run this stuff.
import gguf
import numpy as np
from sklearn.decomposition import PCA
import tqdm
def load_hidden_states(path):
    """Load hidden states produced by the llama.cpp ./repeng tool."""
    gguf_file = gguf.GGUFReader(path)
hidden_states = {}
for t in gguf_file.tensors:
if not t.name.startswith('l_out-'):
continue
layer = int(t.name[len('l_out-'):])
assert layer not in hidden_states, 'duplicate hidden states for layer %d' % layer
        # GGUF stores tensor dimensions in reverse order relative to numpy's
        # row-major layout, so swap them when reshaping the flat data
        data = t.data.reshape((t.shape[1], t.shape[0]))
hidden_states[layer] = data
return hidden_states
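# The returned dict maps layer index -> (n_rows, n_embd) matrix; rows are
# assumed to alternate positive/negative prompts from the contrast dataset
# (see read_representations below).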
def project_onto_direction(H, direction):
    """Project each row of matrix H (n, d) onto direction vector (d,)."""
mag = np.linalg.norm(direction)
assert not np.isinf(mag)
return (H @ direction) / mag
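# Illustrative example (made-up values): each row of H is dotted with the
# direction and divided by the direction's norm.
#   >>> project_onto_direction(np.array([[1.0, 0.0], [0.0, 2.0]]), np.array([1.0, 0.0]))
#   array([1., 0.])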
def read_representations(
layer_hiddens: dict[int, np.ndarray],
) -> dict[int, np.ndarray]:
"""
Extract the representations based on the contrast dataset.
"""
hidden_layers = sorted(layer_hiddens.keys())
    # rows come in (positive, negative) pairs, so the number of contrast
    # inputs is half the number of rows
    num_inputs = next(iter(layer_hiddens.values())).shape[0] // 2
    print('%d inputs' % num_inputs)
# get differences between (positive, negative) pairs
relative_layer_hiddens = {}
for layer in hidden_layers:
relative_layer_hiddens[layer] = (
layer_hiddens[layer][::2] - layer_hiddens[layer][1::2]
)
# get directions for each layer using PCA
directions: dict[int, np.ndarray] = {}
for layer in tqdm.tqdm(hidden_layers):
assert layer_hiddens[layer].shape[0] == num_inputs * 2
# fit layer directions
        # center the pairwise differences before fitting PCA
        train = (
            relative_layer_hiddens[layer]
            - relative_layer_hiddens[layer].mean(axis=0, keepdims=True)
        )
pca_model = PCA(n_components=1, whiten=False).fit(train)
# shape (n_features,)
directions[layer] = pca_model.components_.astype(np.float32).squeeze(axis=0)
# calculate sign
projected_hiddens = project_onto_direction(
layer_hiddens[layer], directions[layer]
)
# order is [positive, negative, positive, negative, ...]
positive_smaller_mean = np.mean(
[
projected_hiddens[i] < projected_hiddens[i + 1]
for i in range(0, num_inputs * 2, 2)
]
)
positive_larger_mean = np.mean(
[
projected_hiddens[i] > projected_hiddens[i + 1]
for i in range(0, num_inputs * 2, 2)
]
)
if positive_smaller_mean > positive_larger_mean: # type: ignore
directions[layer] *= -1
return directions
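# Note: sklearn returns PCA components as unit vectors, so each
# directions[layer] has norm 1; the sign check above orients it so that
# positive prompts project onto the positive side of the direction.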
def export_gguf(directions, path: str):
"""
Export a trained ControlVector to a llama.cpp .gguf file.
"""
arch = "controlvector"
writer = gguf.GGUFWriter(path, arch)
#writer.add_string(f"{arch}.model_hint", model_type)
#writer.add_uint32(f"{arch}.layer_count", len(directions))
    for layer in directions:
        if layer == 0:
            # llama.cpp bails out if it sees a direction.0 tensor, so skip
            # layer 0
            continue
writer.add_tensor(f"direction.{layer}", directions[layer])
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
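# The exported file can then be applied at inference time; recent llama.cpp
# builds (and the patched build this script targets) accept it via a flag
# along the lines of:
#   ./main -m model.gguf --control-vector control_vector.gguf -p "..."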
def test_model(model_name, directions):
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from repeng import ControlVector, ControlModel
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to("cuda:0" if torch.cuda.is_available()
else "mps:0" if torch.backends.mps.is_available()
else "cpu")
model = ControlModel(model, list(range(-5, -18, -1)))
control_vector = ControlVector(model.config.model_type, directions)
user_tag, asst_tag = "[INST]", "[/INST]"
    # the question to ask the modified model
    # don't forget the space after {user_tag} and before {asst_tag}!
    prompt = f"{user_tag} What are human beings like? {asst_tag}"
    # tokenizer and generation settings
    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
settings = {
"pad_token_id": tokenizer.eos_token_id, # silence warning
"do_sample": False, # temperature=0
"max_new_tokens": 128,
"repetition_penalty": 1.1, # reduce control jank
}
print("==baseline")
model.reset()
print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
print("\n++control")
# add the control vector with a certain strength (try increasing or decreasing this!)
model.set_control(control_vector, 1.0)
print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
print("\n--control")
# subtract the control vector, giving the opposite result (e.g. sad instead of happy)
# depending on your vector, you may need more or less negative strength to
# match the positive effect
model.set_control(control_vector, -1.0)
print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
model.reset()
print("\nLoad hidden shit\n")
hidden_states = load_hidden_states('control_vector_data.gguf')
print("\nHidden shit loaded\n")
directions = read_representations(hidden_states)
print("\nExport this motherfucker\n")
export_gguf(directions, 'control_vector.gguf')
TEST_MODEL_NAME = 'mistralai/Mistral-7B-Instruct-v0.1'
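# Uncomment to sanity-check the exported directions against the HF model
# (requires torch, transformers, and repeng installed).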
#test_model(TEST_MODEL_NAME, directions)