hackerbyhobby committed on
Commit
63a0483
·
unverified ·
1 Parent(s): 2baadab

updated app to have user choose text or OCR

Browse files
Files changed (2) hide show
  1. app.py +44 -14
  2. app.py.jan27 → app.py.working_ocr +8 -88
app.py CHANGED
@@ -21,6 +21,7 @@ model_name = "joeddav/xlm-roberta-large-xnli"
21
  classifier = pipeline("zero-shot-classification", model=model_name)
22
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
23
 
 
24
  def get_keywords_by_language(text: str):
25
  """
26
  Detect language using `langdetect` and translate keywords if needed.
@@ -42,6 +43,7 @@ def get_keywords_by_language(text: str):
42
  else:
43
  return SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en"
44
 
 
45
  def boost_probabilities(probabilities: dict, text: str):
46
  """
47
  Boost probabilities based on keyword matches and presence of URLs.
@@ -52,11 +54,13 @@ def boost_probabilities(probabilities: dict, text: str):
52
  smishing_count = sum(1 for kw in smishing_keywords if kw in lower_text)
53
  other_scam_count = sum(1 for kw in other_scam_keywords if kw in lower_text)
54
 
 
55
  smishing_boost = 0.30 * smishing_count
56
  other_scam_boost = 0.30 * other_scam_count
57
 
58
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
59
  if found_urls:
 
60
  smishing_boost += 0.35
61
 
62
  p_smishing = probabilities.get("SMiShing", 0.0)
@@ -67,10 +71,12 @@ def boost_probabilities(probabilities: dict, text: str):
67
  p_other_scam += other_scam_boost
68
  p_legit -= (smishing_boost + other_scam_boost)
69
 
 
70
  p_smishing = max(p_smishing, 0.0)
71
  p_other_scam = max(p_other_scam, 0.0)
72
  p_legit = max(p_legit, 0.0)
73
 
 
74
  total = p_smishing + p_other_scam + p_legit
75
  if total > 0:
76
  p_smishing /= total
@@ -86,15 +92,23 @@ def boost_probabilities(probabilities: dict, text: str):
86
  "detected_lang": detected_lang
87
  }
88
 
89
- def smishing_detector(text, image):
 
90
  """
91
- Main detection function combining text and OCR.
 
 
92
  """
93
- combined_text = text or ""
94
- if image is not None:
95
- ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
96
- combined_text += " " + ocr_text
97
- combined_text = combined_text.strip()
 
 
 
 
 
98
 
99
  if not combined_text:
100
  return {
@@ -105,19 +119,26 @@ def smishing_detector(text, image):
105
  "urls_found": []
106
  }
107
 
 
108
  result = classifier(
109
  sequences=combined_text,
110
  candidate_labels=CANDIDATE_LABELS,
111
  hypothesis_template="This message is {}."
112
  )
113
  original_probs = {k: float(v) for k, v in zip(result["labels"], result["scores"])}
 
 
114
  boosted = boost_probabilities(original_probs, combined_text)
115
 
 
116
  boosted = {k: float(v) for k, v in boosted.items() if isinstance(v, (int, float))}
117
  detected_lang = boosted.pop("detected_lang", "en")
 
 
118
  final_label = max(boosted, key=boosted.get)
119
  final_confidence = round(boosted[final_label], 3)
120
 
 
121
  lower_text = combined_text.lower()
122
  smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
123
 
@@ -137,26 +158,35 @@ def smishing_detector(text, image):
137
  "urls_found": found_urls,
138
  }
139
 
 
 
140
  demo = gr.Interface(
141
  fn=smishing_detector,
142
  inputs=[
 
 
 
 
 
 
143
  gr.Textbox(
144
  lines=3,
145
- label="Paste Suspicious SMS Text (English/Spanish)",
146
  placeholder="Type or paste the message here..."
147
  ),
148
  gr.Image(
149
  type="pil",
150
- label="Or Upload a Screenshot (Optional)"
 
151
  )
152
  ],
153
  outputs="json",
154
- title="SMiShing & Scam Detector (Language Detection + Keyword Translation)",
155
  description="""
156
- This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
157
- (joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
158
- If Spanish, it translates the English-based keyword lists to Spanish before boosting the scores.
159
- Any URL found further boosts SMiShing specifically.
160
  """,
161
  allow_flagging="never"
162
  )
 
21
  classifier = pipeline("zero-shot-classification", model=model_name)
22
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
23
 
24
+
25
  def get_keywords_by_language(text: str):
26
  """
27
  Detect language using `langdetect` and translate keywords if needed.
 
43
  else:
44
  return SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en"
45
 
46
+
47
  def boost_probabilities(probabilities: dict, text: str):
48
  """
49
  Boost probabilities based on keyword matches and presence of URLs.
 
54
  smishing_count = sum(1 for kw in smishing_keywords if kw in lower_text)
55
  other_scam_count = sum(1 for kw in other_scam_keywords if kw in lower_text)
56
 
57
+ # Example: 30% per found keyword
58
  smishing_boost = 0.30 * smishing_count
59
  other_scam_boost = 0.30 * other_scam_count
60
 
61
  found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
62
  if found_urls:
63
+ # 35% boost for Smishing if there's a URL
64
  smishing_boost += 0.35
65
 
66
  p_smishing = probabilities.get("SMiShing", 0.0)
 
71
  p_other_scam += other_scam_boost
72
  p_legit -= (smishing_boost + other_scam_boost)
73
 
74
+ # Clamp to 0
75
  p_smishing = max(p_smishing, 0.0)
76
  p_other_scam = max(p_other_scam, 0.0)
77
  p_legit = max(p_legit, 0.0)
78
 
79
+ # Re-normalize
80
  total = p_smishing + p_other_scam + p_legit
81
  if total > 0:
82
  p_smishing /= total
 
92
  "detected_lang": detected_lang
93
  }
94
 
95
+
96
+ def smishing_detector(input_type, text, image):
97
  """
98
+ Main detection function:
99
+ - If input_type == "Text": use `text` as the message
100
+ - If input_type == "Screenshot": use OCR on `image` to get text
101
  """
102
+ if input_type == "Text":
103
+ # Use the pasted text
104
+ combined_text = text.strip() if text else ""
105
+ else:
106
+ # input_type == "Screenshot"
107
+ if image is not None:
108
+ ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
109
+ combined_text = ocr_text.strip()
110
+ else:
111
+ combined_text = ""
112
 
113
  if not combined_text:
114
  return {
 
119
  "urls_found": []
120
  }
121
 
122
+ # Zero-shot classification
123
  result = classifier(
124
  sequences=combined_text,
125
  candidate_labels=CANDIDATE_LABELS,
126
  hypothesis_template="This message is {}."
127
  )
128
  original_probs = {k: float(v) for k, v in zip(result["labels"], result["scores"])}
129
+
130
+ # Boost logic
131
  boosted = boost_probabilities(original_probs, combined_text)
132
 
133
+ # Convert to float
134
  boosted = {k: float(v) for k, v in boosted.items() if isinstance(v, (int, float))}
135
  detected_lang = boosted.pop("detected_lang", "en")
136
+
137
+ # Final classification
138
  final_label = max(boosted, key=boosted.get)
139
  final_confidence = round(boosted[final_label], 3)
140
 
141
+ # For display
142
  lower_text = combined_text.lower()
143
  smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
144
 
 
158
  "urls_found": found_urls,
159
  }
160
 
161
+
162
+ # Create a Radio for user choice + text input + image input
163
  demo = gr.Interface(
164
  fn=smishing_detector,
165
  inputs=[
166
+ gr.Radio(
167
+ choices=["Text", "Screenshot"],
168
+ label="Choose input type",
169
+ value="Text", # default
170
+ info="Select 'Text' to paste a message, or 'Screenshot' to upload an image."
171
+ ),
172
  gr.Textbox(
173
  lines=3,
174
+ label="Paste Suspicious SMS Text",
175
  placeholder="Type or paste the message here..."
176
  ),
177
  gr.Image(
178
  type="pil",
179
+ label="Upload a Screenshot",
180
+ tool="editor"
181
  )
182
  ],
183
  outputs="json",
184
+ title="SMiShing & Scam Detector",
185
  description="""
186
+ Select "Text" or "Screenshot" above.
187
+ - If "Text", only use the textbox.
188
+ - If "Screenshot", only upload an image.
189
+ The app will classify the message as SMiShing, Other Scam, or Legitimate.
190
  """,
191
  allow_flagging="never"
192
  )
app.py.jan27 → app.py.working_ocr RENAMED
@@ -5,22 +5,6 @@ from transformers import pipeline
5
  import re
6
  from langdetect import detect
7
  from deep_translator import GoogleTranslator
8
- import shap
9
- import requests
10
- import json
11
- import os
12
- import numpy as np
13
- from shap.maskers import Text
14
-
15
- # Patch SHAP to replace np.bool with np.bool_ dynamically
16
- if hasattr(shap.maskers._text.Text, "invariants"):
17
- original_invariants = shap.maskers._text.Text.invariants
18
-
19
- def patched_invariants(self, *args):
20
- # Use np.bool_ instead of the deprecated np.bool
21
- return np.zeros(len(self._tokenized_s), dtype=np.bool_)
22
-
23
- shap.maskers._text.Text.invariants = patched_invariants
24
 
25
  # Translator instance
26
  translator = GoogleTranslator(source="auto", target="es")
@@ -37,49 +21,6 @@ model_name = "joeddav/xlm-roberta-large-xnli"
37
  classifier = pipeline("zero-shot-classification", model=model_name)
38
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
39
 
40
- # 3. SHAP Explainer Setup
41
- explainer = shap.Explainer(classifier, masker=Text(tokenizer=classifier.tokenizer))
42
-
43
- # Retrieve the Google Safe Browsing API key from the environment
44
- SAFE_BROWSING_API_KEY = os.getenv("SAFE_BROWSING_API_KEY")
45
-
46
- if not SAFE_BROWSING_API_KEY:
47
- raise ValueError("Google Safe Browsing API key not found. Please set it as an environment variable in your Hugging Face Space.")
48
-
49
- SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
50
-
51
- def check_url_with_google_safebrowsing(url):
52
- """
53
- Check a URL against Google's Safe Browsing API.
54
- """
55
- payload = {
56
- "client": {
57
- "clientId": "your-client-id",
58
- "clientVersion": "1.0"
59
- },
60
- "threatInfo": {
61
- "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE", "POTENTIALLY_HARMFUL_APPLICATION"],
62
- "platformTypes": ["ANY_PLATFORM"],
63
- "threatEntryTypes": ["URL"],
64
- "threatEntries": [
65
- {"url": url}
66
- ]
67
- }
68
- }
69
- try:
70
- response = requests.post(
71
- SAFE_BROWSING_URL,
72
- params={"key": SAFE_BROWSING_API_KEY},
73
- json=payload
74
- )
75
- response_data = response.json()
76
- if "matches" in response_data:
77
- return True # URL is flagged as malicious
78
- return False # URL is safe
79
- except Exception as e:
80
- print(f"Error checking URL with Safe Browsing API: {e}")
81
- return False
82
-
83
  def get_keywords_by_language(text: str):
84
  """
85
  Detect language using `langdetect` and translate keywords if needed.
@@ -142,21 +83,9 @@ def boost_probabilities(probabilities: dict, text: str):
142
  "SMiShing": p_smishing,
143
  "Other Scam": p_other_scam,
144
  "Legitimate": p_legit,
145
- "detected_lang": detected_lang,
146
  }
147
 
148
- def explain_classification(text):
149
- """
150
- Generate SHAP explanations for the classification.
151
- """
152
- if not text.strip():
153
- raise ValueError("Cannot generate SHAP explanations for empty text.")
154
-
155
- shap_values = explainer([text])
156
- shap.force_plot(
157
- explainer.expected_value[0], shap_values[0].values[0], shap_values[0].data
158
- )
159
-
160
  def smishing_detector(text, image):
161
  """
162
  Main detection function combining text and OCR.
@@ -173,8 +102,7 @@ def smishing_detector(text, image):
173
  "label": "No text provided",
174
  "confidence": 0.0,
175
  "keywords_found": [],
176
- "urls_found": [],
177
- "threat_analysis": "No URLs to analyze",
178
  }
179
 
180
  result = classifier(
@@ -197,14 +125,6 @@ def smishing_detector(text, image):
197
  found_smishing = [kw for kw in smishing_keys if kw in lower_text]
198
  found_other_scam = [kw for kw in scam_keys if kw in lower_text]
199
 
200
- # Analyze URLs using Google's Safe Browsing API
201
- threat_analysis = {
202
- url: check_url_with_google_safebrowsing(url) for url in found_urls
203
- }
204
-
205
- # SHAP Explanation (optional for user insights)
206
- explain_classification(combined_text)
207
-
208
  return {
209
  "detected_language": detected_lang,
210
  "text_used_for_classification": combined_text,
@@ -215,7 +135,6 @@ def smishing_detector(text, image):
215
  "smishing_keywords_found": found_smishing,
216
  "other_scam_keywords_found": found_other_scam,
217
  "urls_found": found_urls,
218
- "threat_analysis": threat_analysis,
219
  }
220
 
221
  demo = gr.Interface(
@@ -232,14 +151,15 @@ demo = gr.Interface(
232
  )
233
  ],
234
  outputs="json",
235
- title="SMiShing & Scam Detector with Safe Browsing",
236
  description="""
237
  This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
238
  (joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
239
- It uses SHAP for explainability and checks URLs against Google's Safe Browsing API for enhanced analysis.
240
- """,
241
- flagging_mode="never"
 
242
  )
243
 
244
  if __name__ == "__main__":
245
- demo.launch()
 
5
  import re
6
  from langdetect import detect
7
  from deep_translator import GoogleTranslator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # Translator instance
10
  translator = GoogleTranslator(source="auto", target="es")
 
21
  classifier = pipeline("zero-shot-classification", model=model_name)
22
  CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def get_keywords_by_language(text: str):
25
  """
26
  Detect language using `langdetect` and translate keywords if needed.
 
83
  "SMiShing": p_smishing,
84
  "Other Scam": p_other_scam,
85
  "Legitimate": p_legit,
86
+ "detected_lang": detected_lang
87
  }
88
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def smishing_detector(text, image):
90
  """
91
  Main detection function combining text and OCR.
 
102
  "label": "No text provided",
103
  "confidence": 0.0,
104
  "keywords_found": [],
105
+ "urls_found": []
 
106
  }
107
 
108
  result = classifier(
 
125
  found_smishing = [kw for kw in smishing_keys if kw in lower_text]
126
  found_other_scam = [kw for kw in scam_keys if kw in lower_text]
127
 
 
 
 
 
 
 
 
 
128
  return {
129
  "detected_language": detected_lang,
130
  "text_used_for_classification": combined_text,
 
135
  "smishing_keywords_found": found_smishing,
136
  "other_scam_keywords_found": found_other_scam,
137
  "urls_found": found_urls,
 
138
  }
139
 
140
  demo = gr.Interface(
 
151
  )
152
  ],
153
  outputs="json",
154
+ title="SMiShing & Scam Detector (Language Detection + Keyword Translation)",
155
  description="""
156
  This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
157
  (joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
158
+ If Spanish, it translates the English-based keyword lists to Spanish before boosting the scores.
159
+ Any URL found further boosts SMiShing specifically.
160
+ """,
161
+ allow_flagging="never"
162
  )
163
 
164
  if __name__ == "__main__":
165
+ demo.launch()