Spaces:

wandb
/

guardrails-genie

Running

App Files Files Community

geekyrakshit commited on Dec 4, 2024

Commit

a70d6a8

1 Parent(s): 4a81263

update: docs

Browse files

Files changed (3) hide show

docs/guardrails/llama_prompt_guardrail.md +3 -0
guardrails_genie/guardrails/injection/llama_prompt_guardrail.py +39 -0
mkdocs.yml +2 -1

docs/guardrails/llama_prompt_guardrail.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # Llama Prompt Guardrail
2	+
3	+ ::: guardrails_genie.guardrails.injection.llama_prompt_guardrail

guardrails_genie/guardrails/injection/llama_prompt_guardrail.py CHANGED Viewed

@@ -9,6 +9,25 @@ from ..base import Guardrail
 class PromptInjectionLlamaGuardrail(Guardrail):
     model_name: str = "meta-llama/Prompt-Guard-86M"
     max_sequence_length: int = 512
     temperature: float = 1.0
@@ -47,6 +66,26 @@ class PromptInjectionLlamaGuardrail(Guardrail):
             ).item(),
         }
     @weave.op()
     def guard(self, prompt: str):
         score = self.get_score(prompt)

 class PromptInjectionLlamaGuardrail(Guardrail):
+    """
+    A guardrail class designed to detect and mitigate prompt injection attacks
+    using a pre-trained language model. This class leverages a sequence
+    classification model to evaluate prompts for potential security threats
+    such as jailbreak attempts and indirect injection attempts.
+    Attributes:
+        model_name (str): The name of the pre-trained model used for sequence
+            classification.
+        max_sequence_length (int): The maximum length of the input sequence
+            for the tokenizer.
+        temperature (float): A scaling factor for the model's logits to
+            control the randomness of predictions.
+        jailbreak_score_threshold (float): The threshold above which a prompt
+            is considered a jailbreak attempt.
+        indirect_injection_score_threshold (float): The threshold above which
+            a prompt is considered an indirect injection attempt.
+    """
     model_name: str = "meta-llama/Prompt-Guard-86M"
     max_sequence_length: int = 512
     temperature: float = 1.0
             ).item(),
         }
+    """
+    Analyzes a given prompt to determine its safety by evaluating the likelihood
+    of it being a jailbreak or indirect injection attempt.
+    This function utilizes the `get_score` method to obtain the probabilities
+    associated with the prompt being a jailbreak or indirect injection attempt.
+    It then compares these probabilities against predefined thresholds to assess
+    the prompt's safety. If the `jailbreak_score` exceeds the `jailbreak_score_threshold`,
+    the prompt is flagged as a potential jailbreak attempt, and a confidence level
+    is calculated and included in the summary. Similarly, if the `indirect_injection_score`
+    surpasses the `indirect_injection_score_threshold`, the prompt is flagged as a potential
+    indirect injection attempt, with its confidence level also included in the summary.
+    Returns a dictionary containing:
+        - "safe": A boolean indicating whether the prompt is considered safe
+          (i.e., both scores are below their respective thresholds).
+        - "summary": A string summarizing the findings, including confidence levels
+          for any detected threats.
+    """
     @weave.op()
     def guard(self, prompt: str):
         score = self.get_score(prompt)

mkdocs.yml CHANGED Viewed

@@ -72,7 +72,8 @@ nav:
       - LLM Judge for Entity Recognition Guardrail: 'guardrails/entity_recognition/llm_judge_entity_recognition_guardrail.md'
     - Prompt Injection Guardrails:
       - Classifier Guardrail: 'guardrails/prompt_injection/classifier.md'
-      - Survey Guardrail: 'guardrails/prompt_injection/llm_survey.md'
     - Secrets Detection Guardrail: "guardrails/secrets_detection.md"
   - LLM: 'llm.md'
   - Metrics: 'metrics.md'

       - LLM Judge for Entity Recognition Guardrail: 'guardrails/entity_recognition/llm_judge_entity_recognition_guardrail.md'
     - Prompt Injection Guardrails:
       - Classifier Guardrail: 'guardrails/prompt_injection/classifier.md'
+      - Llama Prompt Guardrail: 'guardrails/prompt_injection/llama_prompt_guardrail.md'
+      - LLM Survey Guardrail: 'guardrails/prompt_injection/llm_survey.md'
     - Secrets Detection Guardrail: "guardrails/secrets_detection.md"
   - LLM: 'llm.md'
   - Metrics: 'metrics.md'