"""Generate contrastive persona prompts for control-vector training.

Reads truncated model outputs from a JSON file, builds every token-level
prefix of each output (deduplicated, first-seen order), pairs each prefix
with a positive/negative persona instruction, and writes the resulting
prompts -- separated by PROMPT_DELIMITER -- to OUT_FILE.
"""
import json

from transformers import AutoTokenizer

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token_id = 0

# Mistral-instruct chat markers wrapped around each persona instruction.
user_tag, asst_tag = "[INST]", "[/INST]"

with open('notebooks/data/all_truncated_outputs.json') as f:
    suffixes = json.load(f)

# Build every token-level prefix of each suffix. A companion set gives O(1)
# dedup checks while the list preserves first-seen order.
truncated_suffixes = []
truncated_suffixes_dedup = set()
for suffix in suffixes:
    tokens = tokenizer.tokenize(suffix)
    # NOTE: range(1, len(tokens)) yields strict prefixes only -- the full
    # suffix itself is intentionally excluded.
    for i in range(1, len(tokens)):
        truncated = tokenizer.convert_tokens_to_string(tokens[:i])
        if truncated in truncated_suffixes_dedup:
            continue
        truncated_suffixes.append(truncated)
        truncated_suffixes_dedup.add(truncated)

# (positive persona, negative persona) pairs used to build contrastive prompts.
persona_pairs = [
    ('incredibly charismatic, captivating everyone with your presence and words',
     'unassuming, rarely drawing attention or swaying others'),
    ('persuasive, easily influencing others with your charm and eloquence.',
     'reticent, struggling to engage or influence those around you'),
]


def template(persona: str, suffix: str) -> str:
    """Format one chat prompt instructing the model to act as *persona*,
    followed by the (truncated) assistant continuation *suffix*."""
    return f"{user_tag} Act as if you are {persona}. {asst_tag} {suffix}"


OUT_FILE = 'control_vector_prompts.txt'

# Use '\n' as delimiter between prompts. If you want to use a different
# delimiter, change this string and also change PROMPT_DELIMITER_TOKEN in
# llama.cpp/examples/repeng/repeng.cpp.
PROMPT_DELIMITER = '\n'
print('prompt delimiter string: %r' % PROMPT_DELIMITER)
print('prompt delimiter token id: %s' % (
    tokenizer.encode(PROMPT_DELIMITER, add_special_tokens=False),))

# FIX: the output file was previously opened with a bare open() that was
# never closed (and shadowed the earlier JSON file handle `f`). Use a
# context manager so the buffer is flushed and the handle released even if
# a write fails.
count = 0
with open(OUT_FILE, 'w') as out:
    for suffix in truncated_suffixes:
        for positive_persona, negative_persona in persona_pairs:
            positive = template(positive_persona, suffix)
            negative = template(negative_persona, suffix)
            out.write(positive)
            out.write(PROMPT_DELIMITER)
            out.write(negative)
            out.write(PROMPT_DELIMITER)
            count += 2
print('wrote %d prompts to %s' % (count, OUT_FILE))