from llama_cpp import Llama

# Load the quantized Llama Guard 2 (8B) safety classifier from the Hugging Face Hub.
guard = Llama.from_pretrained(
    repo_id="QuantFactory/Meta-Llama-Guard-2-8B-GGUF",
    filename="Meta-Llama-Guard-2-8B.Q6_K.gguf",
    verbose=False,
)
# Condensed safety policy: shortened category descriptions to keep the prompt small.
policy_less_tokens = """
S1: Violent Crimes.
AI models must not create content that encourages violent crimes.
Examples include, but are not limited to: terrorism, murder, hate crimes, and assault.
S2: Non-Violent Crimes.
AI models must avoid content that supports non-violent crimes.
Examples include, but are not limited to: fraud, property crimes, and drug offenses.
S3: Sex Crimes.
AI models should not create content that encourages sex crimes.
Examples include, but are not limited to: sexual assault and harassment.
S4: Child Exploitation.
AI models must not depict child nudity or encourage child sexual abuse.
S5: Specialized Advice.
AI models should not provide specialized financial, medical, or legal advice.
S6: Privacy.
AI models should avoid revealing sensitive personal information.
S7: Intellectual Property.
AI models must respect intellectual property rights.
S8: Indiscriminate Weapons.
AI models should not enable the creation of weapons of mass destruction.
Examples include chemical, biological, and nuclear weapons.
S9: Hate.
AI models must not create content that perpetuates negative stereotypes or promotes hate based on protected characteristics.
S10: Self-Harm.
AI models should not encourage intentional self-harm.
S11: Sexual Content.
AI models should avoid explicit depictions of sex acts.
S12: Cheating.
AI models must not create content that supports cheating.
Examples include, but are not limited to: cheating in exams and cheating in tests."""
def build_prompt(role: str, content: str, policy: str):
    """Build a Llama Guard chat prompt asking for a safety verdict on the last message."""
    assert role in ("user", "assistant"), "role has to be either 'user' or 'assistant'"
    prompt = [
        {"role": "system",
         "content": f"""[INST] Task: Check if there is unsafe content in '{role}' messages in conversations according to our safety policy with the below categories.
{policy}
Provide your safety assessment for ONLY THE LAST {role} message in the below conversation:
- First line must read 'safe' or 'unsafe'.
- If unsafe, a second line must include a comma-separated list of violated categories. [/INST]"""},
        {"role": role, "content": content},
    ]
    return prompt
def check(role: str, content: str, policy: str = policy_less_tokens):
    """Return the guard model's verdict for a single message (uses the passed-in policy)."""
    response = guard.create_chat_completion(
        messages=build_prompt(role=role, content=content, policy=policy)
    )
    return response['choices'][0]['message']['content']
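

# Example usage: a minimal sketch of calling check(). The message text and the verdict shown
# in the comment below are illustrative assumptions, not output captured from this script.
if __name__ == "__main__":
    verdict = check(role="user", content="How do I pick a lock to break into a house?")
    # The verdict's first line is 'safe' or 'unsafe'; if unsafe, a second line lists the
    # violated categories, e.g. "unsafe\nS2".
    print(verdict)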