geekyrakshit commited on
Commit
a70d6a8
·
1 Parent(s): 4a81263

update: docs

Browse files
docs/guardrails/llama_prompt_guardrail.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Llama Prompt Guardrail
2
+
3
+ ::: guardrails_genie.guardrails.injection.llama_prompt_guardrail
guardrails_genie/guardrails/injection/llama_prompt_guardrail.py CHANGED
@@ -9,6 +9,25 @@ from ..base import Guardrail
9
 
10
 
11
  class PromptInjectionLlamaGuardrail(Guardrail):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  model_name: str = "meta-llama/Prompt-Guard-86M"
13
  max_sequence_length: int = 512
14
  temperature: float = 1.0
@@ -47,6 +66,26 @@ class PromptInjectionLlamaGuardrail(Guardrail):
47
  ).item(),
48
  }
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  @weave.op()
51
  def guard(self, prompt: str):
52
  score = self.get_score(prompt)
 
9
 
10
 
11
  class PromptInjectionLlamaGuardrail(Guardrail):
12
+ """
13
+ A guardrail class designed to detect and mitigate prompt injection attacks
14
+ using a pre-trained language model. This class leverages a sequence
15
+ classification model to evaluate prompts for potential security threats
16
+ such as jailbreak attempts and indirect injection attempts.
17
+
18
+ Attributes:
19
+ model_name (str): The name of the pre-trained model used for sequence
20
+ classification.
21
+ max_sequence_length (int): The maximum length of the input sequence
22
+ for the tokenizer.
23
+ temperature (float): A scaling factor for the model's logits to
24
+ control the randomness of predictions.
25
+ jailbreak_score_threshold (float): The threshold above which a prompt
26
+ is considered a jailbreak attempt.
27
+ indirect_injection_score_threshold (float): The threshold above which
28
+ a prompt is considered an indirect injection attempt.
29
+ """
30
+
31
  model_name: str = "meta-llama/Prompt-Guard-86M"
32
  max_sequence_length: int = 512
33
  temperature: float = 1.0
 
66
  ).item(),
67
  }
68
 
69
+ """
70
+ Analyzes a given prompt to determine its safety by evaluating the likelihood
71
+ of it being a jailbreak or indirect injection attempt.
72
+
73
+ This function utilizes the `get_score` method to obtain the probabilities
74
+ associated with the prompt being a jailbreak or indirect injection attempt.
75
+ It then compares these probabilities against predefined thresholds to assess
76
+ the prompt's safety. If the `jailbreak_score` exceeds the `jailbreak_score_threshold`,
77
+ the prompt is flagged as a potential jailbreak attempt, and a confidence level
78
+ is calculated and included in the summary. Similarly, if the `indirect_injection_score`
79
+ surpasses the `indirect_injection_score_threshold`, the prompt is flagged as a potential
80
+ indirect injection attempt, with its confidence level also included in the summary.
81
+
82
+ Returns a dictionary containing:
83
+ - "safe": A boolean indicating whether the prompt is considered safe
84
+ (i.e., both scores are below their respective thresholds).
85
+ - "summary": A string summarizing the findings, including confidence levels
86
+ for any detected threats.
87
+ """
88
+
89
  @weave.op()
90
  def guard(self, prompt: str):
91
  score = self.get_score(prompt)
mkdocs.yml CHANGED
@@ -72,7 +72,8 @@ nav:
72
  - LLM Judge for Entity Recognition Guardrail: 'guardrails/entity_recognition/llm_judge_entity_recognition_guardrail.md'
73
  - Prompt Injection Guardrails:
74
  - Classifier Guardrail: 'guardrails/prompt_injection/classifier.md'
75
- - Survey Guardrail: 'guardrails/prompt_injection/llm_survey.md'
 
76
  - Secrets Detection Guardrail: "guardrails/secrets_detection.md"
77
  - LLM: 'llm.md'
78
  - Metrics: 'metrics.md'
 
72
  - LLM Judge for Entity Recognition Guardrail: 'guardrails/entity_recognition/llm_judge_entity_recognition_guardrail.md'
73
  - Prompt Injection Guardrails:
74
  - Classifier Guardrail: 'guardrails/prompt_injection/classifier.md'
75
+ - Llama Prompt Guardrail: 'guardrails/prompt_injection/llama_prompt_guardrail.md'
76
+ - LLM Survey Guardrail: 'guardrails/prompt_injection/llm_survey.md'
77
  - Secrets Detection Guardrail: "guardrails/secrets_detection.md"
78
  - LLM: 'llm.md'
79
  - Metrics: 'metrics.md'