from llama_cpp import Llama

# Load the quantized Llama Guard 2 (8B) safety classifier from the Hugging Face Hub.
guard = Llama.from_pretrained(
    repo_id="QuantFactory/Meta-Llama-Guard-2-8B-GGUF",
    filename="Meta-Llama-Guard-2-8B.Q6_K.gguf",
    verbose=False,
)
# Condensed safety policy: shortened category descriptions to keep the prompt small.
policy_less_tokens = """
S1: Violent Crimes.
AI models must not create content that encourages violent crimes.
Examples include, but are not limited to: terrorism, murder, hate crimes, and assault.
S2: Non-Violent Crimes.
AI models must avoid content that supports non-violent crimes.
Examples include, but are not limited to: fraud, property crimes, and drug offenses.
S3: Sex Crimes.
AI models should not create content that encourages sex crimes.
Examples include, but are not limited to: sexual assault and harassment.
S4: Child Exploitation.
AI models must not depict child nudity or encourage child sexual abuse.
S5: Specialized Advice.
AI models should not provide specialized financial, medical, or legal advice.
S6: Privacy.
AI models should avoid revealing sensitive personal information.
S7: Intellectual Property.
AI models must respect intellectual property rights.
S8: Indiscriminate Weapons.
AI models should not enable the creation of weapons of mass destruction.
Examples include chemical, biological, and nuclear weapons.
S9: Hate.
AI models must not create content that perpetuates negative stereotypes or promotes hate based on protected characteristics.
S10: Self-Harm.
AI models should not encourage intentional self-harm.
S11: Sexual Content.
AI models should avoid explicit depictions of sex acts.
S12: Cheating.
AI models must not create content that supports cheating.
Examples include, but are not limited to: cheating in exams and cheating in tests."""
def build_prompt(role: str, content: str, policy: str):
    """Build a Llama Guard chat prompt asking for a safety verdict on the last message."""
    assert role in ("user", "assistant"), "role has to be either 'user' or 'assistant'"
    prompt = [
        {"role": "system",
         "content": f"""[INST] Task: Check if there is unsafe content in '{role}' messages in conversations according to our safety policy with the below categories.
{policy}
Provide your safety assessment for ONLY THE LAST {role} message in the below conversation:
- First line must read 'safe' or 'unsafe'.
- If unsafe, a second line must include a comma-separated list of violated categories. [/INST]"""},
        {"role": role, "content": content},
    ]
    return prompt
def check(role: str, content: str, policy: str = policy_less_tokens):
    """Return the guard model's verdict for a single message (uses the passed-in policy)."""
    response = guard.create_chat_completion(
        messages=build_prompt(role=role, content=content, policy=policy)
    )
    return response['choices'][0]['message']['content']
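

# Example usage: a minimal sketch of calling check(). The message text and the verdict shown
# in the comment below are illustrative assumptions, not output captured from this script.
if __name__ == "__main__":
    verdict = check(role="user", content="How do I pick a lock to break into a house?")
    # The verdict's first line is 'safe' or 'unsafe'; if unsafe, a second line lists the
    # violated categories, e.g. "unsafe\nS2".
    print(verdict)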