# Patches for llama.cpp and repeng to run this stuff.
import gguf
import numpy as np
from sklearn.decomposition import PCA
import tqdm
def load_hidden_states(path):
    """Load hidden states produced by the llama.cpp ./repeng tool."""
    gguf_file = gguf.GGUFReader(path)
    hidden_states = {}
    for t in gguf_file.tensors:
        if not t.name.startswith('l_out-'):
            continue
        layer = int(t.name[len('l_out-'):])
        assert layer not in hidden_states, 'duplicate hidden states for layer %d' % layer
        # GGUF reports dimensions in reverse order relative to numpy, so swap
        # them back to get (n_samples, hidden_dim).
        data = t.data.reshape((t.shape[1], t.shape[0]))
        hidden_states[layer] = data
    return hidden_states
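
# Illustrative note (not in the original script): for a data file holding tensors
# "l_out-0" ... "l_out-N", load_hidden_states() returns a dict mapping each layer
# index to an array of shape (n_samples, hidden_dim), where rows alternate
# positive/negative prompts from the contrast dataset (see read_representations).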
def project_onto_direction(H, direction):
    """Project matrix H of shape (n, d) onto a direction vector of shape (d,)."""
    mag = np.linalg.norm(direction)
    assert not np.isinf(mag)
    return (H @ direction) / mag
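
# Worked example (illustrative only): projecting two 2-D rows onto a direction
# of norm 5.
#   H = np.array([[1.0, 0.0], [0.0, 2.0]])
#   d = np.array([3.0, 4.0])
#   project_onto_direction(H, d)  # -> array([0.6, 1.6])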
def read_representations(
    layer_hiddens: dict[int, np.ndarray],
) -> dict[int, np.ndarray]:
    """
    Extract the representations based on the contrast dataset.
    """
    hidden_layers = sorted(layer_hiddens.keys())
    num_inputs = next(iter(layer_hiddens.values())).shape[0] // 2
    print('%d inputs' % num_inputs)

    # get differences between (positive, negative) pairs
    relative_layer_hiddens = {}
    for layer in hidden_layers:
        relative_layer_hiddens[layer] = (
            layer_hiddens[layer][::2] - layer_hiddens[layer][1::2]
        )

    # get directions for each layer using PCA
    directions: dict[int, np.ndarray] = {}
    for layer in tqdm.tqdm(hidden_layers):
        assert layer_hiddens[layer].shape[0] == num_inputs * 2

        # fit layer directions
        train = np.vstack(
            relative_layer_hiddens[layer]
            - relative_layer_hiddens[layer].mean(axis=0, keepdims=True)
        )
        pca_model = PCA(n_components=1, whiten=False).fit(train)
        # shape (n_features,)
        directions[layer] = pca_model.components_.astype(np.float32).squeeze(axis=0)

        # calculate sign
        projected_hiddens = project_onto_direction(
            layer_hiddens[layer], directions[layer]
        )

        # order is [positive, negative, positive, negative, ...]
        positive_smaller_mean = np.mean(
            [
                projected_hiddens[i] < projected_hiddens[i + 1]
                for i in range(0, num_inputs * 2, 2)
            ]
        )
        positive_larger_mean = np.mean(
            [
                projected_hiddens[i] > projected_hiddens[i + 1]
                for i in range(0, num_inputs * 2, 2)
            ]
        )

        if positive_smaller_mean > positive_larger_mean:  # type: ignore
            directions[layer] *= -1

    return directions
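
# Sketch (an assumption, not part of the original pipeline): the per-layer
# directions returned above act as a control vector by being added, scaled by a
# strength factor, to that layer's hidden states at inference time. This mirrors
# what repeng's ControlModel and the patched llama.cpp do internally; the helper
# below is illustrative only and is never called in this script.
def apply_direction(hidden: np.ndarray, direction: np.ndarray, strength: float = 1.0) -> np.ndarray:
    """Return hidden states of shape (n, d) shifted by strength * direction (d,)."""
    return hidden + strength * direction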
def export_gguf(directions, path: str):
    """
    Export a trained ControlVector to a llama.cpp .gguf file.
    """
    arch = "controlvector"
    writer = gguf.GGUFWriter(path, arch)
    #writer.add_string(f"{arch}.model_hint", model_type)
    #writer.add_uint32(f"{arch}.layer_count", len(directions))
    for layer in directions.keys():
        if layer == 0:
            # For some reason, llama.cpp bails out if it sees a direction.0
            # tensor.
            continue
        writer.add_tensor(f"direction.{layer}", directions[layer])
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()
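
# Resulting file layout (from the code above): a GGUF file written with
# architecture "controlvector", containing one float32 tensor named
# "direction.<layer>" per layer, with layer 0 skipped.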
def test_model(model_name, directions):
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from repeng import ControlVector, ControlModel

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token_id = 0
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
    model = model.to("cuda:0" if torch.cuda.is_available()
                     else "mps:0" if torch.backends.mps.is_available()
                     else "cpu")
    model = ControlModel(model, list(range(-5, -18, -1)))

    control_vector = ControlVector(model.config.model_type, directions)

    user_tag, asst_tag = "[INST]", "[/INST]"

    # the question to ask the modified model
    # don't forget the space after {user_tag} and before {asst_tag}!
    prompt = f"{user_tag} What are human beings like? {asst_tag}"

    # tokenizer and generation settings
    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id,  # silence warning
        "do_sample": False,  # temperature=0
        "max_new_tokens": 128,
        "repetition_penalty": 1.1,  # reduce control jank
    }

    print("==baseline")
    model.reset()
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))

    print("\n++control")
    # add the control vector with a certain strength (try increasing or decreasing this!)
    model.set_control(control_vector, 1.0)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))

    print("\n--control")
    # subtract the control vector, giving the opposite result (e.g. sad instead of happy)
    # depending on your vector, you may need more or less negative strength to
    # match the positive effect
    model.set_control(control_vector, -1.0)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
    model.reset()
print("\nLoad hidden shit\n")
hidden_states = load_hidden_states('control_vector_data.gguf')
print("\nHidden shit loaded\n")
directions = read_representations(hidden_states)
print("\nExport this motherfucker\n")
export_gguf(directions, 'control_vector.gguf')
TEST_MODEL_NAME = 'mistralai/Mistral-7B-Instruct-v0.1'
#test_model(TEST_MODEL_NAME, directions)
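
# Hedged usage note (assumption, exact flags may differ with this repo's patches):
# the exported control_vector.gguf is meant to be consumed by a llama.cpp build
# with control-vector support, e.g. something like
#   ./main -m model.gguf --control-vector control_vector.gguf -p "[INST] ... [/INST]"
# or, with an explicit strength, --control-vector-scaled control_vector.gguf 0.8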