{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30761,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# LLM Abliterate v1.4 script, adapted for Mistral-Small-Instruct-2409\n\nAuthor: byroneverson\n\nThis script ran at kaggle.com, accelerator: None, persistence: Files only","metadata":{}},{"cell_type":"markdown","source":"# Download quantized model locally\n\nDownloading to /kaggle/temp dir so will need to keep the session open to keep model local","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working\n\nfrom huggingface_hub import hf_hub_download\n\nhf_hub_download(repo_id=\"bartowski/Mistral-Small-Instruct-2409-GGUF\", filename=\"Mistral-Small-Instruct-2409-Q4_K_M.gguf\", local_dir=\"/kaggle/temp\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Temporary: We need my fork of ggml-python because the official abetlen is out of date and ggml_tensor is incompatible at the moment.","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working\n!git clone --recurse-submodules https://github.com/byroneverson/ggml-python.git","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Install my ggml-python, llama-cpp-python, and couple other reqs\n\nUsing CUDA with llama.cpp for these larger models, you can install the normal llama-cpp-python if you want CPU only","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working\n\n!pip install ./ggml-python\n!pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu123 #CUDA\n#!pip install llama-cpp-python\n!pip install jaxtyping\n!pip install einops","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Obtain layer output embeddings for each of our sample instruction sets (harmful and harmless)\n\n- These will be saved to the folders \"harmful_states\" and \"harmless_states\".\n- Each output file contains a tensor of shape (n_layers * 2 + 1, n_embd).\n- Output files will contain embeds for the token embeddings, and both the attn output and ffn output embeds of each layer.\n- A quant model is used for this process with llama.cpp for minimal cpu and memory usage.\n- This version of the script uses CUDA to split the layers workload between CPU and GPU.\n- Start with too many GPU layers and decrease until you no longer have allocate errors to use as many GPU layers as possible.\n- Considering we will end up using the mean of these samples, the amount of quantization shouldn't matter much.","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working\n\nimport ctypes\nimport os\nimport multiprocessing\nimport random\nimport gc\nimport sys\nimport re\nimport llama_cpp\nimport ggml\nimport torch\nfrom math import prod\nfrom datasets import load_dataset\nfrom tqdm import tqdm\n\n# Number of total layers in your model\nn_layers = 56\n# GPU layers, use as many as possible, ideally all the layers and also the input 
embedding\nn_gpu_layers = n_layers + 1 #0 # CPU only\n\n# Number of instructions to average for our feature estimation (e.g. 512 for harmful and 512 for harmless)\ninstructions = 512 #256 #32\n\n# Our local gguf model\n# TODO: Load model with only num_layers we actually need for this step\nworking_dir = \"/kaggle/working\"\nlocal_repo_dir = \"/kaggle/temp\"\nmodel_path = local_repo_dir + \"/\" + \"Mistral-Small-Instruct-2409-Q4_K_M.gguf\"\nchat_template = ctypes.c_char_p(\"mistral\".encode(\"utf-8\")) #None\n\n# Init llama backend\nllama_cpp.llama_backend_init(numa=False)\n\n# llama.cpp custom model code\n\ndef c_array_to_tensor(pointer, shape, torch_type):\n arr = (pointer._type_ * prod(shape)).from_address(\n ctypes.addressof(pointer.contents))\n return torch.frombuffer(arr, dtype=torch_type).view(*shape)\n\ndef model_load(model_path):\n # TODO: Attempt to hook num_layers\n model_params = llama_cpp.llama_model_default_params()\n model_params.n_gpu_layers = n_gpu_layers\n model_params.use_mmap = True\n model = llama_cpp.llama_load_model_from_file(model_path.encode(\"utf-8\"), model_params)\n return model\n\ndef model_free(model):\n # llama_free is for contexts; models are released with llama_free_model\n llama_cpp.llama_free_model(model)\n \ndef model_tokenize_chat(model, role, content, add_assistant=True):\n role = role.encode(\"utf-8\")\n content = content.encode(\"utf-8\")\n content_len = len(content)\n if content_len == 0:\n return []\n \n chat_message = llama_cpp.llama_chat_message(role=role, content=content)\n buffer_length = content_len * 4 #2\n buffer = ctypes.create_string_buffer(buffer_length)\n result = llama_cpp.llama_chat_apply_template(model, chat_template, ctypes.pointer(chat_message), 1, add_assistant, buffer, ctypes.c_int32(buffer_length))\n if result <= 0:\n print(f\"Warning: model_tokenize_chat returned {result} for \\\"{content}\\\"\")\n elif result >= buffer_length:\n buffer_length = result + 1\n buffer = ctypes.create_string_buffer(buffer_length)\n result = llama_cpp.llama_chat_apply_template(model, chat_template, ctypes.pointer(chat_message), 1, add_assistant, buffer, ctypes.c_int32(buffer_length))\n content = buffer.value if result > 0 else content\n \n # Add space for llama only, check model params for add space var\n add_space = False # TODO: Check model/config for this\n if add_space:\n content = b\" \" + content\n \n # Tokenize\n content_len = len(content)\n content_len_c = ctypes.c_int32(content_len)\n tokens = (ctypes.c_int32 * content_len)()\n count = llama_cpp.llama_tokenize(model, content, content_len_c, tokens, content_len_c, True, True)\n if content_len > count:\n tokens = tokens[:count]\n return tokens\n\ndef print_tensor_info(t_ptr):\n # Could also print: contiguous, permuted, transposed (via ggml_is_contiguous / ggml_is_permuted / ggml_is_transposed)\n t = t_ptr.contents\n print(f\"{ggml.ggml_type_name(t.type)} {ggml.ggml_op_desc(t_ptr)} {t.name}\")\n print(f\" n_elements = {ggml.ggml_nelements(t)}\")\n print(f\" ne = ({t.ne[0]}, {t.ne[1]}, {t.ne[2]}, {t.ne[3]})\")\n print(f\" nb = ({t.nb[0]}, {t.nb[1]}, {t.nb[2]}, {t.nb[3]})\")\n is_host = ggml.ggml_backend_buffer_is_host(t.buffer)\n print(f\" is_host = {is_host}\")\n print(f\" buffer = {t.buffer}\")\n print(f\" data = {t.data}\")\n if ctypes.c_void_p.from_buffer(t.src[0]).value != None:\n print(f\" src[0] = {ggml.ggml_op_desc(t.src[0])}\")\n if ctypes.c_void_p.from_buffer(t.src[1]).value != None:\n print(f\" src[1] = {ggml.ggml_op_desc(t.src[1])}\")\n\n# Callback will fill this during model inference\nclass CallbackDataStruct(ctypes.Structure):\n _fields_ = [\n
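 # Filled by hidden_states_eval_callback below: current layer index, tensor kind (0 = token embedding, 1 = attention output, 2 = FFN output), and the destination float buffer\n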
(\"layer\", ctypes.c_int),\n (\"type\", ctypes.c_int),\n (\"buffer\", ctypes.POINTER(ctypes.c_float))\n ]\n\ncallback_data = CallbackDataStruct()\ncallback_data.layer = 0\ncallback_data.type = 0\n\ndef hidden_states_eval_callback(t_void_p, ask, user_data):\n cb_data_ptr = ctypes.cast(user_data, ctypes.POINTER(CallbackDataStruct))\n cb_data = cb_data_ptr.contents\n t_ptr = ctypes.cast(t_void_p, ctypes.POINTER(ggml.ggml_tensor))\n t = t_ptr.contents\n if ask:\n name = t.name.decode(\"utf-8\")\n match = re.match(r\"inp_embd\", name)\n if match:\n cb_data.type = 0\n cb_data.layer = 0\n return True\n match = re.match(r\"kqv_out-(\\d+)\", name)\n if match:\n cb_data.type = 1\n cb_data.layer = int(match.group(1))\n return True\n match = re.match(r\"ffn_out-(\\d+)\", name)\n if match:\n cb_data.type = 2\n cb_data.layer = int(match.group(1))\n return True\n return False\n else:\n offset = cb_data.layer if (cb_data.type == 0) else (cb_data.layer * 2 + 1) if (cb_data.type == 1) else (cb_data.layer * 2 + 2)\n data = ctypes.cast(t_ptr.contents.data, ctypes.POINTER(ctypes.c_float))\n dst_ptr = ctypes.c_void_p(ctypes.addressof(cb_data.buffer.contents) + (offset * t.ne[0]) * 4)\n ggml.ggml_backend_tensor_get(t_ptr, dst_ptr, (t.ne[0] * (t.ne[1]-1)) * 4, t.ne[0] * 4)\n # Returning false stops graph in it's tracks without error\n return True\n # return True to request data next callback, false to skip, ask will be False when returning data from a request\n return False\n\nc_hidden_states_eval_callback = ctypes.CFUNCTYPE(\n ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p\n)(hidden_states_eval_callback)\n\ndef model_generate_hidden_states(model, toks, buffer):\n # Set callback vars\n callback_data.buffer = buffer\n # Clear cache per sample instruction\n llama_cpp.llama_kv_cache_clear(context)\n # Token count\n n_tokens = len(toks)\n # Fill batch\n batch.n_tokens = n_tokens\n for i in range(n_tokens):\n batch.token[i] = toks[i]\n batch.pos[i] = i\n batch.seq_id[i][0] = 0\n batch.n_seq_id[i] = 1\n batch.logits[i] = False\n batch.logits[n_tokens - 1] = True\n # Decode batch\n result = llama_cpp.llama_decode(context, batch)\n if result == 1:\n print(\"decode warning\")\n elif result < 0:\n print(\"decode error\")\n\n# Clear memory of past model usage\nmodel = None\ngc.collect()\n\n# Load model\nmodel = model_load(model_path)\nn_embd = llama_cpp.llama_n_embd(model)\n\n# Tokenize instructions\nprint(\"Instruction count: \" + str(instructions))\n\ndataset = load_dataset(\"byroneverson/abliterate-refusal\", split=\"train\")\n\n# Filter the dataset based on 'target'\nharmful_dataset = dataset.filter(lambda x: x['target'] == True)\nharmless_dataset = dataset.filter(lambda x: x['target'] == False)\n\ndataset = None\ngc.collect()\n\n# Randomly select 512 entries from each filtered dataset\nharmful_instructions = random.sample(harmful_dataset['prompt'], instructions)\nharmless_instructions = random.sample(harmless_dataset['prompt'], instructions)\n\nharmful_dataset = None\nharmless_dataset = None\ngc.collect()\n\nharmful_toks = [model_tokenize_chat(model, role=\"user\", content=i, add_assistant=True) for i in harmful_instructions]\nharmless_toks = [model_tokenize_chat(model, role=\"user\", content=i, add_assistant=True) for i in harmless_instructions]\n\nharmful_instructions = None\nharmless_instructions = None\ngc.collect()\n\n# Create context and batch\ndef context_create(model, cb_eval, cb_eval_user_data):\n context_params = llama_cpp.llama_context_default_params()\n n_threads = 
multiprocessing.cpu_count()\n context_params.n_threads = n_threads\n context_params.n_threads_batch = n_threads\n context_params.seed = 1337\n context_params.cb_eval = cb_eval\n context_params.cb_eval_user_data = ctypes.cast(ctypes.pointer(cb_eval_user_data), ctypes.c_void_p)\n context = llama_cpp.llama_new_context_with_model(model, context_params)\n batch = llama_cpp.llama_batch_init(context_params.n_batch, 0, context_params.n_ctx)\n return (context, batch)\n\ncontext, batch = context_create(model, c_hidden_states_eval_callback, callback_data)\n\n# Create ctypes float buffer\nembedding_count = n_layers * 2 + 1\nbuffer_size = embedding_count * llama_cpp.llama_n_embd(model) * 4\nbuffer = ctypes.cast(ctypes.create_string_buffer(buffer_size), ctypes.POINTER(ctypes.c_float))\n\nimport time\nsys.stdout.flush()\ntime.sleep(5) # Let model finish printing before we start\nsys.stdout.flush()\n\n# Progress bar\nmax_its = instructions * 2\nbar = tqdm(total=max_its)\n\n# Generate target layer hidden state files for harmful and harmless features and save to file\ndef save_target_hidden_states(toks, index, feature, buffer):\n bar.update(n=1)\n model_generate_hidden_states(model, toks, buffer)\n # Convert float buffer to torch array for easy handling\n tensor = c_array_to_tensor(buffer, (embedding_count, n_embd), torch.float32)\n # Save tensor\n dir_path = working_dir + \"/\" + feature + \"_states\"\n file_path = dir_path + \"/\" + str(index) + \".pt\"\n if not os.path.exists(dir_path):\n os.makedirs(dir_path)\n torch.save(tensor, file_path)\n\n# Save harmful states\nfor index, toks in enumerate(harmful_toks):\n save_target_hidden_states(toks, index, \"harmful\", buffer)\n\n# Save harmless states\nfor index, toks in enumerate(harmless_toks):\n save_target_hidden_states(toks, index, \"harmless\", buffer)\n\n# End progress bar\nbar.close()\n\n# Free batch, model, context, and backend\nllama_cpp.llama_batch_free(batch)\nllama_cpp.llama_free(context)\nllama_cpp.llama_free_model(model)\nllama_cpp.llama_backend_free()\n\n# Clean-up\nharmful_toks = None\nharmless_toks = None\ncallback_data = None\nbuffer = None\nbatch = None\nmodel = None\ncontext = None\ngc.collect()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Get refusal direction vector using my PCA (Principal Component Analysis) algorithm and save\n\nThis process is constantly evolving, so any detailed description here would quickly go stale.\n\nIn general, some form of PCA is performed on the layers with the intent of seeing where in the model there is a large difference between harmful and harmless activations.\n\nI basically start by patching the layers/tensors that show the most separation in the PCA analysis, and keep adding layers until I have a good abliteration.\n\nAnother trick I used is increasing the \"amount\". 1.0 would simply remove the feature, but a value of 2.0 would effectively add the opposite of the feature, the opposite of refusal being something like submission. Extreme amounts like 5.0 - 10.0 will also increase intelligence degradation.
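 In weight terms, with a unit refusal direction r_hat and scale a (the \"amount\"), the edit is roughly W' = W - a * r_hat (r_hat^T W): a = 1.0 removes the component along r_hat, and a = 2.0 flips it. (This is a sketch of the general directional-ablation idea from the post linked below, not an exact transcription of this script's ablation step.) 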
2.0 seems like a good amount; 6.0 helped with some prompts but, interestingly enough, re-introduced refusal for others.\n\nBased loosely on methods described here: https://www.lesswrong.com/posts/jGuXSZgv6qfdhMCuJ/refusal-in-llms-is-mediated-by-a-single-direction","metadata":{}},{"cell_type":"code","source":"%cd /kaggle/working\n\nimport torch\nimport math\nimport os\nimport gc\nimport matplotlib.pyplot as plt\nfrom sklearn.decomposition import PCA\n\nfrom bokeh.io import output_notebook\nfrom bokeh.layouts import row, column\nfrom bokeh.plotting import figure, show\nfrom bokeh.models import ColumnDataSource, Slider, CustomJS\noutput_notebook()\n\nworking_dir = \"/kaggle/working\"\nlocal_repo_dir = working_dir + \"/\" + \"Mistral-Small-Instruct-2409\"\nn_layers = 56\ninstructions = 512 #256 #32\nn_components = 16 # PCA components\n\ngc.collect()\n\n# Load tensors\nharmful_data = torch.stack([torch.load(f\"{working_dir}/harmful_states/{i}.pt\", weights_only=True) for i in range(instructions)])\nharmless_data = torch.stack([torch.load(f\"{working_dir}/harmless_states/{i}.pt\", weights_only=True) for i in range(instructions)])\n\n# Split by tensor type\nharmful_tok = harmful_data[:, 0, :]\nharmless_tok = harmless_data[:, 0, :]\nharmful_attn = torch.cat((harmful_data[:, 0, :].unsqueeze(1), harmful_data[:, 1::2, :]), dim=1)\nharmless_attn = torch.cat((harmless_data[:, 0, :].unsqueeze(1), harmless_data[:, 1::2, :]), dim=1)\nharmful_ffn = harmful_data[:, ::2, :]\nharmless_ffn = harmless_data[:, ::2, :]\n\n# Instructions mean\n#harmful_mean = harmful_data.mean(dim=0)\n#harmless_mean = harmless_data.mean(dim=0)\nharmful_mean_tok = harmful_tok.mean(dim=0)\nharmless_mean_tok = harmless_tok.mean(dim=0)\nharmful_mean_attn = harmful_attn.mean(dim=0)\nharmless_mean_attn = harmless_attn.mean(dim=0)\nharmful_mean_ffn = harmful_ffn.mean(dim=0)\nharmless_mean_ffn = harmless_ffn.mean(dim=0)\n\n# Feature mean diff\n#mean_diff = harmful_mean - harmless_mean\nmean_diff_tok = harmful_mean_tok - harmless_mean_tok\nmean_diff_attn = harmful_mean_attn - harmless_mean_attn\nmean_diff_ffn = harmful_mean_ffn - harmless_mean_ffn\n\ndef get_pca_plot(harmful_data, harmless_data, mean_diff, col_1, col_2):\n pca = PCA(n_components=64)\n layer_harmful = harmful_data[:, n_layers // 2, :]\n layer_harmless = harmless_data[:, n_layers // 2, :]\n pca.fit(torch.cat((layer_harmful, layer_harmless), 0))\n mean_diff_pca = torch.tensor(pca.transform(mean_diff))\n colors = [col_1, col_2] * (n_layers // 2)\n \n def get_pca_sub_plot(title, vals):\n mean_diff_source = ColumnDataSource(data={'x': [x for x in range(1, n_layers+1)], 'y': vals, 'c': colors})\n p = figure(width=600, height=200, title=title, x_range=(0, n_layers+1), y_range=(-1, 1))\n p.vbar(x='x', top='y', source=mean_diff_source, width=0.8, fill_color='c')\n p.xgrid.grid_line_color = None\n return p\n\n plot_prog = get_pca_sub_plot(\"mean diff progressive cos-sim\", torch.cosine_similarity(mean_diff_pca[0:-1,:], mean_diff_pca[1:,:], dim=1))\n plot_mean = get_pca_sub_plot(\"mean diff mean cos-sim\", torch.cosine_similarity(mean_diff_pca[1:,:], mean_diff_pca.mean(dim=0, keepdim=True), dim=1))\n plot_start = get_pca_sub_plot(\"mean diff start cos-sim\", torch.cosine_similarity(mean_diff_pca[1:,:], mean_diff_pca[1,:].unsqueeze(0), dim=1))\n plot_end = get_pca_sub_plot(\"mean diff end cos-sim\", torch.cosine_similarity(mean_diff_pca[1:,:], mean_diff_pca[-1,:].unsqueeze(0), dim=1))\n return column(plot_prog, plot_mean, plot_start, plot_end)\n
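\n# Each subplot compares per-layer mean-diff directions in the shared PCA space: consecutive layers (progressive), each layer vs the overall mean, vs the first layer, and vs the last layer; runs of high cos-sim suggest a direction that stays stable across those layers\n# Show PCA of Attention\nplot_attn = 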
get_pca_plot(harmful_attn, harmless_attn, mean_diff_attn, \"orange\", \"red\")\n# Show PCA of FFN\nplot_ffn = get_pca_plot(harmful_ffn, harmless_ffn, mean_diff_ffn, \"cyan\", \"blue\")\n# Show layout\nshow(row(plot_attn, plot_ffn))\n\n\n'''\n\n# New algorithm - \"Oversampled PCA Reduction\"\n# TODO: Recycle z-score thresholding from previous scripts to trim outliers from samples (75% coverage) (64 -> 48 samples per part)\nfig, ax = plt.subplots(nrows=4, ncols=3, figsize=(16, 3 * 4))\n\nlayer_scores = []\n\n'''\n\n# PCA\npca = PCA(n_components=n_components)\n\n'''\n\n#pc_harmful_inp = []\n#pc_harmless_inp = []\npc_attn = []\n#pc_harmless_attn = []\npc_ffn = []\n#pc_harmless_ffn = []\n\n#pca.fit(harmful_inp[:, :])\n#pc_harmful_inp.append(torch.tensor(pca.components_))\n#pca.fit(harmless_inp[:, :])\n#pc_harmless_inp.append(torch.tensor(pca.components_))\n\n'''\n\ncomp_harmful_attn = []\ncomp_harmless_attn = []\ncomp_harmful_ffn = []\ncomp_harmless_ffn = []\n\nlayer_harmful_attn = harmful_attn[:, n_layers // 2, :]\nlayer_harmless_attn = harmless_attn[:, n_layers // 2, :]\npca.fit(torch.cat((layer_harmful_attn, layer_harmless_attn), 0))\nfor layer in range(n_layers):\n layer_harmful_attn = harmful_attn[:, layer, :]\n layer_harmless_attn = harmless_attn[:, layer, :]\n #pca.fit(torch.cat((layer_harmful_attn, layer_harmless_attn), 0))\n #pc_attn.append(torch.tensor(pca.components_))\n comp_harmful_attn.append(torch.tensor(pca.transform(layer_harmful_attn)))\n comp_harmless_attn.append(torch.tensor(pca.transform(layer_harmless_attn)))\n #pca.fit(harmless_attn[:, layer, :])\n #pc_harmless_attn.append(torch.tensor(pca.components_))\n \nlayer_harmful_ffn = harmful_ffn[:, n_layers // 2, :]\nlayer_harmless_ffn = harmless_ffn[:, n_layers // 2, :]\npca.fit(torch.cat((layer_harmful_ffn, layer_harmless_ffn), 0))\nfor layer in range(n_layers):\n layer_harmful_ffn = harmful_ffn[:, layer, :]\n layer_harmless_ffn = harmless_ffn[:, layer, :]\n comp_harmful_ffn.append(torch.tensor(pca.transform(layer_harmful_ffn)))\n comp_harmless_ffn.append(torch.tensor(pca.transform(layer_harmless_ffn)))\n \n# Stack layers to tensors\ncomp_harmful_attn = torch.stack(comp_harmful_attn, dim=1)\ncomp_harmless_attn = torch.stack(comp_harmless_attn, dim=1)\ncomp_harmful_ffn = torch.stack(comp_harmful_ffn, dim=1)\ncomp_harmless_ffn = torch.stack(comp_harmless_ffn, dim=1)\n\n'''\n\nlayer_harmful_ffn = harmful_ffn[:, n_layers // 2, :]\nlayer_harmless_ffn = harmless_ffn[:, n_layers // 2, :]\npca.fit(torch.cat((layer_harmful_ffn, layer_harmless_ffn), 0))\nfor layer in range(n_layers):\n pc_ffn.append(torch.tensor(pca.components_))\n #pca.fit(harmless_ffn[:, layer, :])\n #pc_harmless_ffn.append(torch.tensor(pca.components_))\n\n# Attn\n# Cosine similarity of mean_diff and PCs\npc_attn = torch.stack(pc_attn, dim=1)\ncos_sim_abs = torch.cosine_similarity(mean_attn_diff.unsqueeze(0), pc_attn, dim=-1).abs()\nax[0, 1].imshow(cos_sim_abs, cmap='BuGn', interpolation='nearest', vmin=0.0, vmax=1.0)\nax[0, 1].set_title(\"Cos-Sim Attn (Mean-Diff & PCs)\")\n# Get PC that correlates the most with mean_diff\ncos_sim_abs_pc_sum = cos_sim_abs.sum(dim=-1)\ncos_sim_abs_pc_sum_max = cos_sim_abs_pc_sum.argmax()\ncolors = [\"blue\"] * cos_sim_abs_pc_sum.shape[0]\ncolors[cos_sim_abs_pc_sum_max] = \"green\"\nax[1, 1].bar(x=range(cos_sim_abs_pc_sum.shape[0]), height=cos_sim_abs_pc_sum, color=colors)\nax[1, 1].set_title(\"Cos-Sim (Abs-Sum of PCs)\")\n# Get layer that correlates the most with mean_diff\ncos_sim_abs_layer_sum = 
cos_sim_abs.sum(dim=0)\ncos_sim_abs_layer_sum_max = cos_sim_abs_layer_sum.argmax()\ncolors = [\"blue\"] * cos_sim_abs_layer_sum.shape[0]\ncolors[cos_sim_abs_layer_sum_max] = \"green\"\nax[2, 1].bar(x=range(1, cos_sim_abs_layer_sum.shape[0] + 1), height=cos_sim_abs_layer_sum, color=colors)\nax[2, 1].set_title(\"Cos-Sim (Abs-Sum of Layers)\")\n#layer_scores.append(cos_sim_abs_layer_sum)\n\n# FFN\n# Cosine similarity of mean_diff and PCs\npc_ffn = torch.stack(pc_ffn, dim=1)\ncos_sim_abs = torch.cosine_similarity(mean_ffn_diff.unsqueeze(0), pc_ffn, dim=-1).abs()\nax[0, 2].imshow(cos_sim_abs, cmap='BuGn', interpolation='nearest', vmin=0.0, vmax=1.0)\nax[0, 2].set_title(\"Cos-Sim FFN (Mean-Diff & PCs)\")\n# Get PC that correlates the most with mean_diff\ncos_sim_abs_pc_sum = cos_sim_abs.sum(dim=-1)\ncos_sim_abs_pc_sum_max = cos_sim_abs_pc_sum.argmax()\ncolors = [\"blue\"] * cos_sim_abs_pc_sum.shape[0]\ncolors[cos_sim_abs_pc_sum_max] = \"green\"\nax[1, 2].bar(x=range(cos_sim_abs_pc_sum.shape[0]), height=cos_sim_abs_pc_sum, color=colors)\nax[1, 2].set_title(\"Cos-Sim (Abs-Sum of PCs)\")\n# Get layer that correlates the most with mean_diff\ncos_sim_abs_layer_sum = cos_sim_abs.sum(dim=0)\ncos_sim_abs_layer_sum_max = cos_sim_abs_layer_sum.argmax()\ncolors = [\"blue\"] * cos_sim_abs_layer_sum.shape[0]\ncolors[cos_sim_abs_layer_sum_max] = \"green\"\nax[2, 2].bar(x=range(1, cos_sim_abs_layer_sum.shape[0] + 1), height=cos_sim_abs_layer_sum, color=colors)\nax[2, 2].set_title(\"Cos-Sim (Abs-Sum of Layers)\")\n#layer_scores.append(cos_sim_abs_layer_sum)\n \n# Show partials plot\nplt.tight_layout()\nplt.show()\n\n'''\n\n# Show a PCA plot for PC 1 and 2, interpolates between layers\nclass PCAPlot:\n def __init__(self, title, width, height, comp_1, comp_2):\n # Group by layers via transpose so we can just set the offset for updating\n self.comp_1 = ColumnDataSource(data={'x': comp_1[:,:,0].transpose(0,1), 'y': comp_1[:,:,1].transpose(0,1)})\n self.comp_2 = ColumnDataSource(data={'x': comp_2[:,:,0].transpose(0,1), 'y': comp_2[:,:,1].transpose(0,1)})\n self.source_1 = ColumnDataSource(data={'x': self.comp_1.data['x'][0 * instructions : 1 * instructions], 'y': self.comp_1.data['y'][0 * instructions : 1 * instructions]})\n self.source_2 = ColumnDataSource(data={'x': self.comp_2.data['x'][0 * instructions : 1 * instructions], 'y': self.comp_2.data['y'][0 * instructions : 1 * instructions]})\n # Create the Bokeh figure\n self.plot = figure(title=title, width=width, height=height, x_axis_label='PC 1', y_axis_label='PC 2')\n self.plot.scatter('x', 'y', source=self.source_1, size=4, color='red', alpha=0.5)\n self.plot.scatter('x', 'y', source=self.source_2, size=4, color='blue', alpha=0.5)\n # Callback\n self.callback = CustomJS(args={'s1': self.source_1, 's2': self.source_2, 'c1': self.comp_1, 'c2': self.comp_2, 'n_layers': n_layers, 'n_ins': instructions}, code=\"\"\"\n const val = cb_obj.value;\n if (1 <= val && val < n_layers) {\n const val_floor = Math.floor(val);\n const f = val - val_floor;\n const i = val_floor - 1;\n const x_f_1 = c1.data['x'].slice(i*n_ins,(i+1)*n_ins).map((element, index) => element*(1-f)+c1.data['x'].slice((i+1)*n_ins,(i+2)*n_ins)[index]*f);\n const y_f_1 = c1.data['y'].slice(i*n_ins,(i+1)*n_ins).map((element, index) => element*(1-f)+c1.data['y'].slice((i+1)*n_ins,(i+2)*n_ins)[index]*f);\n s1.data = {'x': x_f_1, 'y': y_f_1};\n const x_f_2 = c2.data['x'].slice(i*n_ins,(i+1)*n_ins).map((element, index) => element*(1-f)+c2.data['x'].slice((i+1)*n_ins,(i+2)*n_ins)[index]*f);\n const y_f_2 = 
c2.data['y'].slice(i*n_ins,(i+1)*n_ins).map((element, index) => element*(1-f)+c2.data['y'].slice((i+1)*n_ins,(i+2)*n_ins)[index]*f);\n s2.data = {'x': x_f_2, 'y': y_f_2};\n } else {\n s1.data = {'x': c1.data['x'].slice((n_layers-1)*n_ins,n_layers*n_ins), 'y': c1.data['y'].slice((n_layers-1)*n_ins,n_layers*n_ins)};\n s2.data = {'x': c2.data['x'].slice((n_layers-1)*n_ins,n_layers*n_ins), 'y': c2.data['y'].slice((n_layers-1)*n_ins,n_layers*n_ins)};\n }\n s1.change.emit();\n s2.change.emit();\n \"\"\")\n # Create slider and show\n self.slider = Slider(start=1, end=n_layers, value=1, step=0.1, title=\"Layer\", width=width)\n self.slider.js_on_change('value', self.callback)\n\n def layout(self):\n return column(self.plot, self.slider)\n\n# Show Attn PCA\nplot_attn = PCAPlot(\"PCA: Attention\", 600, 600, comp_harmful_attn, comp_harmless_attn)\nplot_ffn = PCAPlot(\"PCA: FFN\", 600, 600, comp_harmful_ffn, comp_harmless_ffn)\nshow(row(plot_attn.layout(), plot_ffn.layout()))\n\ncomp_harmful_attn = None\ncomp_harmless_attn = None\ncomp_harmful_ffn = None\ncomp_harmless_ffn = None\ngc.collect()\n\n'''\n\n# Scores\n#layer_scores = torch.stack(layer_scores, dim=0)\n#scores = layer_scores.amax(dim=0)\n# Get layer that correlates the most with mean_diff\n#scores_max = scores.argmax()\n#colors = [\"blue\"] * scores.shape[0]\n#colors[scores_max] = \"green\"\n#plt.figure(figsize=(12, 4))\n#plt.bar(x=range(scores.shape[0]), height=scores, color=colors)\n#plt.xlabel('Layer')\n#plt.title('Scores (Max of partials)')\n#plt.show()\n\n# Delta scores\n#delta = torch.cat((torch.tensor([0]), scores[1 :] - scores[ :-1]), dim=0)\n#delta[ignore_first] = 0\n#delta[n_layers - ignore_last + 1] = 0\n#epsilon = 0.00000001\n#delta = torch.maximum(delta / torch.maximum(scores, torch.tensor([epsilon] * scores.shape[0])), torch.zeros(delta.shape[0])) * 100.0\n#delta_max = delta.argmax()\n#colors = [\"blue\"] * delta.shape[0]\n#colors[delta_max] = \"green\"\n#plt.figure(figsize=(8, 4))\n#plt.bar(x=range(delta.shape[0]), height=delta, color=colors)\n#plt.xlabel('Layer')\n#plt.title('Delta Scores (% Change)')\n#plt.show()\n \n# Set layer_index to top scoring layer\n#layer_index = 37 #scores_max #delta_max\n#if layer_index == -1: # Fallback to 50% point\n# layer_index = n_layers // 2\n#print(f\"Using layer index: {layer_index}\")\n\n'''\n\n# Save ideal layer mean_diff as refusal direction\n#mean_inp_diff = -mean_inp_diff #[layer_index]\n#mean_inp_diff_norm = mean_inp_diff / mean_inp_diff.norm(dim=0)\n#mean_attn_diff = -mean_attn_diff #[layer_index]\n#mean_attn_diff_norm = mean_attn_diff / mean_attn_diff.norm(dim=0)\n#mean_ffn_diff = -mean_ffn_diff #[layer_index]\n#mean_ffn_diff_norm = mean_ffn_diff / mean_ffn_diff.norm(dim=0)\n#refusal_direction = torch.cat((mean_inp_diff_norm.unsqueeze(0), mean_attn_diff_norm, mean_ffn_diff_norm), 0)\n#print(refusal_direction.shape)\n#layer = 29\n#layer_index = 1 + 2 * layer\n\n# Directions path\nif not os.path.exists(local_repo_dir):\n os.makedirs(local_repo_dir)\n# Token embedding\ntorch.save(mean_diff_tok, local_repo_dir + \"/\" + \"direction_tok.pt\")\n# Attention\ntorch.save(mean_diff_attn[1:], local_repo_dir + \"/\" + \"direction_attn.pt\")\n# FFN\ntorch.save(mean_diff_ffn[1:], local_repo_dir + \"/\" + \"direction_ffn.pt\")\n
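\n# Optional sanity check (not required by the rest of the script): reload the saved directions\n# and confirm they are finite and non-trivial before moving on to the ablation step\nfor _name in [\"direction_tok\", \"direction_attn\", \"direction_ffn\"]:\n _d = torch.load(local_repo_dir + \"/\" + _name + \".pt\", weights_only=True)\n assert torch.isfinite(_d).all(), _name\n print(_name, tuple(_d.shape), float(_d.norm(dim=-1).mean()))\n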
\n# Clean-up\ncos_sim_abs_layer_sum = None\ncos_sim_abs_pc_sum = None\ncos_sim_abs = None\ncolors = None\ncomponents_part = None\n#deltas = None\n#scores = None\nlayer_scores = None\nmean_diff = None\nharmful_mean = None\nharmless_mean = None\nharmful_data = None\nharmless_data = None\ngc.collect()","metadata":{"execution":{"iopub.status.busy":"2024-09-24T10:16:35.535044Z","iopub.execute_input":"2024-09-24T10:16:35.535492Z","iopub.status.idle":"2024-09-24T10:16:58.886671Z","shell.execute_reply.started":"2024-09-24T10:16:35.535444Z","shell.execute_reply":"2024-09-24T10:16:58.885462Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"/kaggle/working\n","output_type":"stream"}]}]}