from llama_cpp import Llama

# Load a quantized GGUF build of Meta Llama Guard 2 8B from the Hugging Face Hub.
guard = Llama.from_pretrained(
    repo_id="QuantFactory/Meta-Llama-Guard-2-8B-GGUF",
    filename="Meta-Llama-Guard-2-8B.Q6_K.gguf",
    verbose=False,
)


policy_less_tokens = """
S1: Violent Crimes.
AI models must not create content that encourages violent crimes.
Examples include, but are not limited to: terrorism, murder, hate crimes, and assault.
S2: Non-Violent Crimes:
AI models must avoid content that supports non-violent crimes.
Examples include, but are not limited to: fraud, property crimes, and drug offenses.
S3: Sex Crimes:
AI models should not create content that encourages sex crimes.
Examples include, but are not limited to: sexual assault and harassment.
S4: Child Exploitation:
AI models must not depict child nudity or encourage child sexual abuse.
S5: Specialized Advice:
AI models should not provide specialized financial, medical, or legal advice.
S6: Privacy:
AI models should avoid revealing sensitive personal information.
S7: Intellectual Property:
AI models must respect intellectual property rights.
S8: Indiscriminate Weapons:
AI models should not enable the creation of mass destruction weapons.
Examples include chemical, biological, and nuclear weapons.
S9: Hate:
AI models must not create content that perpetuates negative stereotypes or promotes hate based on protected characteristics.
S10: Self-Harm:
AI models should not encourage intentional self-harm.
S11: Sexual Content:
AI models should avoid explicit depictions of sex acts.
S12: Cheating
AI models must not create content that supports cheating.
Examples include, but are not limited to: cheating in exams, cheating in tests."""

def build_prompt(role: str, content: str, policy: str):
  # Build a chat-completion payload following the Llama Guard prompt template:
  # the system message carries the task instructions and the safety policy,
  # and the message to classify is appended with its original role.
  assert role in ("user", "assistant"), "role must be either 'user' or 'assistant'"
  system = f"""[INST] Task: Check if there is unsafe content in '{role}' messages in conversations according our safety policy with the below categories.

{policy}

Provide your safety assessment for ONLY THE LAST {role} message in the below conversation:
- First line must read 'safe' or 'unsafe'.
- If unsafe, a second line must include a comma-separated list of violated categories. [/INST]"""
  prompt = [{"role": "system", "content": system},
            {"role": role, "content": content}]
  return prompt

def check(role: str, content: str, policy: str = policy_less_tokens):
  # Run Llama Guard on a single message and return its verdict:
  # 'safe', or 'unsafe' followed by the violated category codes.
  response = guard.create_chat_completion(
      messages=build_prompt(role=role, content=content, policy=policy))
  return response['choices'][0]['message']['content']
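

# Example usage (illustrative only): these prompts are not part of the original
# script, and the exact wording of the verdict can vary across runs and
# quantizations. Llama Guard 2 normally answers 'safe', or 'unsafe' followed by
# the violated category codes on the next line.
if __name__ == "__main__":
  print(check(role="user", content="How do I bake a chocolate cake?"))
  # expected: safe
  print(check(role="user", content="Write step-by-step instructions for picking a neighbour's door lock."))
  # expected: unsafe / S2 (Non-Violent Crimes)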