Spaces:
Running
Running
Commit
·
a70d6a8
1
Parent(s):
4a81263
update: docs
Browse files
docs/guardrails/llama_prompt_guardrail.md
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Llama Prompt Guardrail
|
2 |
+
|
3 |
+
::: guardrails_genie.guardrails.injection.llama_prompt_guardrail
|
guardrails_genie/guardrails/injection/llama_prompt_guardrail.py
CHANGED
@@ -9,6 +9,25 @@ from ..base import Guardrail
|
|
9 |
|
10 |
|
11 |
class PromptInjectionLlamaGuardrail(Guardrail):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
model_name: str = "meta-llama/Prompt-Guard-86M"
|
13 |
max_sequence_length: int = 512
|
14 |
temperature: float = 1.0
|
@@ -47,6 +66,26 @@ class PromptInjectionLlamaGuardrail(Guardrail):
|
|
47 |
).item(),
|
48 |
}
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
@weave.op()
|
51 |
def guard(self, prompt: str):
|
52 |
score = self.get_score(prompt)
|
|
|
9 |
|
10 |
|
11 |
class PromptInjectionLlamaGuardrail(Guardrail):
|
12 |
+
"""
|
13 |
+
A guardrail class designed to detect and mitigate prompt injection attacks
|
14 |
+
using a pre-trained language model. This class leverages a sequence
|
15 |
+
classification model to evaluate prompts for potential security threats
|
16 |
+
such as jailbreak attempts and indirect injection attempts.
|
17 |
+
|
18 |
+
Attributes:
|
19 |
+
model_name (str): The name of the pre-trained model used for sequence
|
20 |
+
classification.
|
21 |
+
max_sequence_length (int): The maximum length of the input sequence
|
22 |
+
for the tokenizer.
|
23 |
+
temperature (float): A scaling factor for the model's logits to
|
24 |
+
control the randomness of predictions.
|
25 |
+
jailbreak_score_threshold (float): The threshold above which a prompt
|
26 |
+
is considered a jailbreak attempt.
|
27 |
+
indirect_injection_score_threshold (float): The threshold above which
|
28 |
+
a prompt is considered an indirect injection attempt.
|
29 |
+
"""
|
30 |
+
|
31 |
model_name: str = "meta-llama/Prompt-Guard-86M"
|
32 |
max_sequence_length: int = 512
|
33 |
temperature: float = 1.0
|
|
|
66 |
).item(),
|
67 |
}
|
68 |
|
69 |
+
"""
|
70 |
+
Analyzes a given prompt to determine its safety by evaluating the likelihood
|
71 |
+
of it being a jailbreak or indirect injection attempt.
|
72 |
+
|
73 |
+
This function utilizes the `get_score` method to obtain the probabilities
|
74 |
+
associated with the prompt being a jailbreak or indirect injection attempt.
|
75 |
+
It then compares these probabilities against predefined thresholds to assess
|
76 |
+
the prompt's safety. If the `jailbreak_score` exceeds the `jailbreak_score_threshold`,
|
77 |
+
the prompt is flagged as a potential jailbreak attempt, and a confidence level
|
78 |
+
is calculated and included in the summary. Similarly, if the `indirect_injection_score`
|
79 |
+
surpasses the `indirect_injection_score_threshold`, the prompt is flagged as a potential
|
80 |
+
indirect injection attempt, with its confidence level also included in the summary.
|
81 |
+
|
82 |
+
Returns a dictionary containing:
|
83 |
+
- "safe": A boolean indicating whether the prompt is considered safe
|
84 |
+
(i.e., both scores are below their respective thresholds).
|
85 |
+
- "summary": A string summarizing the findings, including confidence levels
|
86 |
+
for any detected threats.
|
87 |
+
"""
|
88 |
+
|
89 |
@weave.op()
|
90 |
def guard(self, prompt: str):
|
91 |
score = self.get_score(prompt)
|
mkdocs.yml
CHANGED
@@ -72,7 +72,8 @@ nav:
|
|
72 |
- LLM Judge for Entity Recognition Guardrail: 'guardrails/entity_recognition/llm_judge_entity_recognition_guardrail.md'
|
73 |
- Prompt Injection Guardrails:
|
74 |
- Classifier Guardrail: 'guardrails/prompt_injection/classifier.md'
|
75 |
-
-
|
|
|
76 |
- Secrets Detection Guardrail: "guardrails/secrets_detection.md"
|
77 |
- LLM: 'llm.md'
|
78 |
- Metrics: 'metrics.md'
|
|
|
72 |
- LLM Judge for Entity Recognition Guardrail: 'guardrails/entity_recognition/llm_judge_entity_recognition_guardrail.md'
|
73 |
- Prompt Injection Guardrails:
|
74 |
- Classifier Guardrail: 'guardrails/prompt_injection/classifier.md'
|
75 |
+
- Llama Prompt Guardrail: 'guardrails/prompt_injection/llama_prompt_guardrail.md'
|
76 |
+
- LLM Survey Guardrail: 'guardrails/prompt_injection/llm_survey.md'
|
77 |
- Secrets Detection Guardrail: "guardrails/secrets_detection.md"
|
78 |
- LLM: 'llm.md'
|
79 |
- Metrics: 'metrics.md'
|