geekyrakshit committed on
Commit
4a81263
·
unverified ·
2 Parent(s): 4c92daa a6ca408

Merge pull request #11 from soumik12345/feat/llama-guard

Browse files
guardrails_genie/guardrails/__init__.py CHANGED
@@ -1,11 +1,15 @@
1
- from guardrails_genie.guardrails.entity_recognition import (
2
- PresidioEntityRecognitionGuardrail,
3
- RegexEntityRecognitionGuardrail,
4
- RestrictedTermsJudge,
5
- TransformersEntityRecognitionGuardrail,
6
- )
 
 
 
7
  from guardrails_genie.guardrails.injection import (
8
  PromptInjectionClassifierGuardrail,
 
9
  PromptInjectionSurveyGuardrail,
10
  )
11
  from guardrails_genie.guardrails.secrets_detection import SecretsDetectionGuardrail
@@ -13,6 +17,7 @@ from guardrails_genie.guardrails.secrets_detection import SecretsDetectionGuardr
13
  from .manager import GuardrailManager
14
 
15
  __all__ = [
 
16
  "PromptInjectionSurveyGuardrail",
17
  "PromptInjectionClassifierGuardrail",
18
  "PresidioEntityRecognitionGuardrail",
 
1
+ try:
2
+ from guardrails_genie.guardrails.entity_recognition import (
3
+ PresidioEntityRecognitionGuardrail,
4
+ RegexEntityRecognitionGuardrail,
5
+ RestrictedTermsJudge,
6
+ TransformersEntityRecognitionGuardrail,
7
+ )
8
+ except ImportError:
9
+ pass
10
  from guardrails_genie.guardrails.injection import (
11
  PromptInjectionClassifierGuardrail,
12
+ PromptInjectionLlamaGuardrail,
13
  PromptInjectionSurveyGuardrail,
14
  )
15
  from guardrails_genie.guardrails.secrets_detection import SecretsDetectionGuardrail
 
17
  from .manager import GuardrailManager
18
 
19
  __all__ = [
20
+ "PromptInjectionLlamaGuardrail",
21
  "PromptInjectionSurveyGuardrail",
22
  "PromptInjectionClassifierGuardrail",
23
  "PresidioEntityRecognitionGuardrail",
guardrails_genie/guardrails/entity_recognition/__init__.py CHANGED
@@ -1,5 +1,16 @@
 
 
1
  from .llm_judge_entity_recognition_guardrail import RestrictedTermsJudge
2
- from .presidio_entity_recognition_guardrail import PresidioEntityRecognitionGuardrail
 
 
 
 
 
 
 
 
 
3
  from .regex_entity_recognition_guardrail import RegexEntityRecognitionGuardrail
4
  from .transformers_entity_recognition_guardrail import (
5
  TransformersEntityRecognitionGuardrail,
 
1
+ import warnings
2
+
3
  from .llm_judge_entity_recognition_guardrail import RestrictedTermsJudge
4
+
5
+ try:
6
+ from .presidio_entity_recognition_guardrail import (
7
+ PresidioEntityRecognitionGuardrail,
8
+ )
9
+ except ImportError:
10
+ warnings.warn(
11
+ "Presidio is not installed. If you want to use `PresidioEntityRecognitionGuardrail`, you can install the required packages using `pip install -e .[presidio]`"
12
+ )
13
+
14
  from .regex_entity_recognition_guardrail import RegexEntityRecognitionGuardrail
15
  from .transformers_entity_recognition_guardrail import (
16
  TransformersEntityRecognitionGuardrail,
guardrails_genie/guardrails/injection/__init__.py CHANGED
@@ -1,4 +1,9 @@
1
  from .classifier_guardrail import PromptInjectionClassifierGuardrail
 
2
  from .survey_guardrail import PromptInjectionSurveyGuardrail
3
 
4
- __all__ = ["PromptInjectionSurveyGuardrail", "PromptInjectionClassifierGuardrail"]
 
 
 
 
 
1
  from .classifier_guardrail import PromptInjectionClassifierGuardrail
2
+ from .llama_prompt_guardrail import PromptInjectionLlamaGuardrail
3
  from .survey_guardrail import PromptInjectionSurveyGuardrail
4
 
5
+ __all__ = [
6
+ "PromptInjectionLlamaGuardrail",
7
+ "PromptInjectionSurveyGuardrail",
8
+ "PromptInjectionClassifierGuardrail",
9
+ ]
guardrails_genie/guardrails/injection/llama_prompt_guardrail.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import weave
6
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
7
+
8
+ from ..base import Guardrail
9
+
10
+
11
class PromptInjectionLlamaGuardrail(Guardrail):
    """Guardrail that detects prompt-injection attempts using Meta's
    Prompt-Guard-86M sequence classifier.

    The classifier emits three class probabilities. Following Meta's
    published usage example, index 2 is treated as the "jailbreak" class
    and the sum of indices 1 and 2 as the "indirect injection" score.
    NOTE(review): the label-index semantics come from the model card —
    confirm against the model's ``config.json`` if ``model_name`` changes.

    Attributes:
        model_name: Hugging Face model id to load.
        max_sequence_length: Token-truncation limit passed to the tokenizer.
        temperature: Softmax temperature applied to the logits.
        jailbreak_score_threshold: Scores above this flag a jailbreak.
        indirect_injection_score_threshold: Scores above this flag an
            indirect injection.
    """

    model_name: str = "meta-llama/Prompt-Guard-86M"
    max_sequence_length: int = 512
    temperature: float = 1.0
    jailbreak_score_threshold: float = 0.5
    indirect_injection_score_threshold: float = 0.5
    # Private (underscore) attributes so the heavy HF objects stay out of
    # the pydantic-validated, serializable public fields.
    _tokenizer: Optional[AutoTokenizer] = None
    _model: Optional[AutoModelForSequenceClassification] = None

    def model_post_init(self, __context):
        """Load tokenizer and model after pydantic field validation."""
        self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self._model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name
        )

    def get_class_probabilities(self, prompt: str) -> torch.Tensor:
        """Return a ``(1, num_classes)`` tensor of temperature-scaled
        softmax probabilities for ``prompt``."""
        inputs = self._tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_sequence_length,
        )
        # Inference only — no autograd graph needed.
        with torch.no_grad():
            logits = self._model(**inputs).logits
        scaled_logits = logits / self.temperature
        return F.softmax(scaled_logits, dim=-1)

    @weave.op()
    def get_score(self, prompt: str) -> dict[str, float]:
        """Return jailbreak and indirect-injection scores in ``[0, 1]``."""
        probabilities = self.get_class_probabilities(prompt)
        return {
            "jailbreak_score": probabilities[0, 2].item(),
            # Per Meta's reference usage, the indirect-injection score
            # aggregates both the injection and jailbreak classes, so it
            # is always >= the jailbreak score.
            "indirect_injection_score": (
                probabilities[0, 1] + probabilities[0, 2]
            ).item(),
        }

    @weave.op()
    def guard(self, prompt: str) -> dict:
        """Score ``prompt`` and return ``{"safe": bool, "summary": str}``.

        Fix: ``safe`` is now the exact negation of the flagging
        conditions. The original flagged attacks with ``>`` but computed
        ``safe`` with ``<``, so a score exactly equal to a threshold
        produced ``safe=False`` with an empty, unexplained summary.
        """
        score = self.get_score(prompt)
        jailbreak_detected = (
            score["jailbreak_score"] > self.jailbreak_score_threshold
        )
        indirect_detected = (
            score["indirect_injection_score"]
            > self.indirect_injection_score_threshold
        )
        summary_parts = []
        if jailbreak_detected:
            confidence = round(score["jailbreak_score"] * 100, 2)
            summary_parts.append(
                f"Prompt is deemed to be a jailbreak attempt with {confidence}% confidence."
            )
        if indirect_detected:
            confidence = round(score["indirect_injection_score"] * 100, 2)
            summary_parts.append(
                f"Prompt is deemed to be an indirect injection attempt with {confidence}% confidence."
            )
        return {
            "safe": not (jailbreak_detected or indirect_detected),
            "summary": " ".join(summary_parts),
        }

    @weave.op()
    def predict(self, prompt: str) -> dict:
        """Alias for :meth:`guard` so the class exposes the standard
        ``predict`` entry point used by weave evaluations."""
        return self.guard(prompt)
guardrails_genie/utils.py CHANGED
@@ -1,18 +1,5 @@
1
- import os
2
-
3
  import pandas as pd
4
- import pymupdf4llm
5
  import weave
6
- import weave.trace
7
- from firerequests import FireRequests
8
-
9
-
10
- @weave.op()
11
- def get_markdown_from_pdf_url(url: str) -> str:
12
- FireRequests().download(url, "temp.pdf", show_progress=False)
13
- markdown = pymupdf4llm.to_markdown("temp.pdf", show_progress=False)
14
- os.remove("temp.pdf")
15
- return markdown
16
 
17
 
18
  class EvaluationCallManager:
 
 
 
1
  import pandas as pd
 
2
  import weave
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  class EvaluationCallManager:
pyproject.toml CHANGED
@@ -9,25 +9,27 @@ dependencies = [
9
  "evaluate>=0.4.3",
10
  "google-generativeai>=0.8.3",
11
  "openai>=1.52.2",
12
- "isort>=5.13.2",
13
- "black>=24.10.0",
14
- "ruff>=0.6.9",
15
- "pip>=24.2",
16
- "uv>=0.4.20",
17
  "weave @ git+https://github.com/wandb/weave@feat/eval-progressbar",
18
  "streamlit>=1.40.1",
19
  "python-dotenv>=1.0.1",
20
  "watchdog>=6.0.0",
21
- "firerequests>=0.1.1",
22
- "pymupdf4llm>=0.0.17",
23
  "transformers>=4.46.3",
24
  "torch>=2.5.1",
25
- "presidio-analyzer>=2.2.355",
26
- "presidio-anonymizer>=2.2.355",
27
  "instructor>=1.7.0",
28
  ]
29
 
30
  [project.optional-dependencies]
 
 
 
 
 
 
 
 
 
 
 
31
  docs = [
32
  "mkdocs>=1.6.1",
33
  "mkdocstrings>=0.26.1",
 
9
  "evaluate>=0.4.3",
10
  "google-generativeai>=0.8.3",
11
  "openai>=1.52.2",
 
 
 
 
 
12
  "weave @ git+https://github.com/wandb/weave@feat/eval-progressbar",
13
  "streamlit>=1.40.1",
14
  "python-dotenv>=1.0.1",
15
  "watchdog>=6.0.0",
 
 
16
  "transformers>=4.46.3",
17
  "torch>=2.5.1",
 
 
18
  "instructor>=1.7.0",
19
  ]
20
 
21
  [project.optional-dependencies]
22
+ presidio = [
23
+ "presidio-analyzer>=2.2.355",
24
+ "presidio-anonymizer>=2.2.355",
25
+ ]
26
+ dev = [
27
+ "isort>=5.13.2",
28
+ "black>=24.10.0",
29
+ "ruff>=0.6.9",
30
+ "pip>=24.2",
31
+ "uv>=0.4.20",
32
+ ]
33
  docs = [
34
  "mkdocs>=1.6.1",
35
  "mkdocstrings>=0.26.1",