Spaces:

hackerbyhobby
/

SMS_scam_detection

Running

App Files Files Community

hackerbyhobby committed 14 days ago

Commit

63a0483

unverified ·

1 Parent(s): 2baadab

updated app to have user choose text or OCR

Browse files

Files changed (2) hide show

app.py +44 -14
app.py.jan27 → app.py.working_ocr +8 -88

app.py CHANGED Viewed

@@ -21,6 +21,7 @@ model_name = "joeddav/xlm-roberta-large-xnli"
 classifier = pipeline("zero-shot-classification", model=model_name)
 CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
 def get_keywords_by_language(text: str):
     """
     Detect language using `langdetect` and translate keywords if needed.
@@ -42,6 +43,7 @@ def get_keywords_by_language(text: str):
     else:
         return SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en"
 def boost_probabilities(probabilities: dict, text: str):
     """
     Boost probabilities based on keyword matches and presence of URLs.
@@ -52,11 +54,13 @@ def boost_probabilities(probabilities: dict, text: str):
     smishing_count = sum(1 for kw in smishing_keywords if kw in lower_text)
     other_scam_count = sum(1 for kw in other_scam_keywords if kw in lower_text)
     smishing_boost = 0.30 * smishing_count
     other_scam_boost = 0.30 * other_scam_count
     found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
     if found_urls:
         smishing_boost += 0.35
     p_smishing = probabilities.get("SMiShing", 0.0)
@@ -67,10 +71,12 @@ def boost_probabilities(probabilities: dict, text: str):
     p_other_scam += other_scam_boost
     p_legit -= (smishing_boost + other_scam_boost)
     p_smishing = max(p_smishing, 0.0)
     p_other_scam = max(p_other_scam, 0.0)
     p_legit = max(p_legit, 0.0)
     total = p_smishing + p_other_scam + p_legit
     if total > 0:
         p_smishing /= total
@@ -86,15 +92,23 @@ def boost_probabilities(probabilities: dict, text: str):
         "detected_lang": detected_lang
     }
-def smishing_detector(text, image):
     """
-    Main detection function combining text and OCR.
     """
-    combined_text = text or ""
-    if image is not None:
-        ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
-        combined_text += " " + ocr_text
-    combined_text = combined_text.strip()
     if not combined_text:
         return {
@@ -105,19 +119,26 @@ def smishing_detector(text, image):
             "urls_found": []
         }
     result = classifier(
         sequences=combined_text,
         candidate_labels=CANDIDATE_LABELS,
         hypothesis_template="This message is {}."
     )
     original_probs = {k: float(v) for k, v in zip(result["labels"], result["scores"])}
     boosted = boost_probabilities(original_probs, combined_text)
     boosted = {k: float(v) for k, v in boosted.items() if isinstance(v, (int, float))}
     detected_lang = boosted.pop("detected_lang", "en")
     final_label = max(boosted, key=boosted.get)
     final_confidence = round(boosted[final_label], 3)
     lower_text = combined_text.lower()
     smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
@@ -137,26 +158,35 @@ def smishing_detector(text, image):
         "urls_found": found_urls,
     }
 demo = gr.Interface(
     fn=smishing_detector,
     inputs=[
         gr.Textbox(
             lines=3,
-            label="Paste Suspicious SMS Text (English/Spanish)",
             placeholder="Type or paste the message here..."
         ),
         gr.Image(
             type="pil",
-            label="Or Upload a Screenshot (Optional)"
         )
     ],
     outputs="json",
-    title="SMiShing & Scam Detector (Language Detection + Keyword Translation)",
     description="""
-This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
-(joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
-If Spanish, it translates the English-based keyword lists to Spanish before boosting the scores.
-Any URL found further boosts SMiShing specifically.
 """,
     allow_flagging="never"
 )

 classifier = pipeline("zero-shot-classification", model=model_name)
 CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
 def get_keywords_by_language(text: str):
     """
     Detect language using `langdetect` and translate keywords if needed.
     else:
         return SMISHING_KEYWORDS, OTHER_SCAM_KEYWORDS, "en"
 def boost_probabilities(probabilities: dict, text: str):
     """
     Boost probabilities based on keyword matches and presence of URLs.
     smishing_count = sum(1 for kw in smishing_keywords if kw in lower_text)
     other_scam_count = sum(1 for kw in other_scam_keywords if kw in lower_text)
+    # Example: 30% per found keyword
     smishing_boost = 0.30 * smishing_count
     other_scam_boost = 0.30 * other_scam_count
     found_urls = re.findall(r"(https?://[^\s]+)", lower_text)
     if found_urls:
+        # 35% boost for Smishing if there's a URL
         smishing_boost += 0.35
     p_smishing = probabilities.get("SMiShing", 0.0)
     p_other_scam += other_scam_boost
     p_legit -= (smishing_boost + other_scam_boost)
+    # Clamp to 0
     p_smishing = max(p_smishing, 0.0)
     p_other_scam = max(p_other_scam, 0.0)
     p_legit = max(p_legit, 0.0)
+    # Re-normalize
     total = p_smishing + p_other_scam + p_legit
     if total > 0:
         p_smishing /= total
         "detected_lang": detected_lang
     }
+def smishing_detector(input_type, text, image):
     """
+    Main detection function:
+      - If input_type == "Text": use `text` as the message
+      - If input_type == "Screenshot": use OCR on `image` to get text
     """
+    if input_type == "Text":
+        # Use the pasted text
+        combined_text = text.strip() if text else ""
+    else:
+        # input_type == "Screenshot"
+        if image is not None:
+            ocr_text = pytesseract.image_to_string(image, lang="spa+eng")
+            combined_text = ocr_text.strip()
+        else:
+            combined_text = ""
     if not combined_text:
         return {
             "urls_found": []
         }
+    # Zero-shot classification
     result = classifier(
         sequences=combined_text,
         candidate_labels=CANDIDATE_LABELS,
         hypothesis_template="This message is {}."
     )
     original_probs = {k: float(v) for k, v in zip(result["labels"], result["scores"])}
+    # Boost logic
     boosted = boost_probabilities(original_probs, combined_text)
+    # Convert to float
     boosted = {k: float(v) for k, v in boosted.items() if isinstance(v, (int, float))}
     detected_lang = boosted.pop("detected_lang", "en")
+    # Final classification
     final_label = max(boosted, key=boosted.get)
     final_confidence = round(boosted[final_label], 3)
+    # For display
     lower_text = combined_text.lower()
     smishing_keys, scam_keys, _ = get_keywords_by_language(combined_text)
         "urls_found": found_urls,
     }
+# Create a Radio for user choice + text input + image input
 demo = gr.Interface(
     fn=smishing_detector,
     inputs=[
+        gr.Radio(
+            choices=["Text", "Screenshot"],
+            label="Choose input type",
+            value="Text",  # default
+            info="Select 'Text' to paste a message, or 'Screenshot' to upload an image."
+        ),
         gr.Textbox(
             lines=3,
+            label="Paste Suspicious SMS Text",
             placeholder="Type or paste the message here..."
         ),
         gr.Image(
             type="pil",
+            label="Upload a Screenshot",
+            tool="editor"
         )
     ],
     outputs="json",
+    title="SMiShing & Scam Detector",
     description="""
+Select "Text" or "Screenshot" above.
+- If "Text", only use the textbox.
+- If "Screenshot", only upload an image.
+The app will classify the message as SMiShing, Other Scam, or Legitimate.
 """,
     allow_flagging="never"
 )

app.py.jan27 → app.py.working_ocr RENAMED Viewed

@@ -5,22 +5,6 @@ from transformers import pipeline
 import re
 from langdetect import detect
 from deep_translator import GoogleTranslator
-import shap
-import requests
-import json
-import os
-import numpy as np
-from shap.maskers import Text
-# Patch SHAP to replace np.bool with np.bool_ dynamically
-if hasattr(shap.maskers._text.Text, "invariants"):
-    original_invariants = shap.maskers._text.Text.invariants
-    def patched_invariants(self, *args):
-        # Use np.bool_ instead of the deprecated np.bool
-        return np.zeros(len(self._tokenized_s), dtype=np.bool_)
-    shap.maskers._text.Text.invariants = patched_invariants
 # Translator instance
 translator = GoogleTranslator(source="auto", target="es")
@@ -37,49 +21,6 @@ model_name = "joeddav/xlm-roberta-large-xnli"
 classifier = pipeline("zero-shot-classification", model=model_name)
 CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
-# 3. SHAP Explainer Setup
-explainer = shap.Explainer(classifier, masker=Text(tokenizer=classifier.tokenizer))
-# Retrieve the Google Safe Browsing API key from the environment
-SAFE_BROWSING_API_KEY = os.getenv("SAFE_BROWSING_API_KEY")
-if not SAFE_BROWSING_API_KEY:
-    raise ValueError("Google Safe Browsing API key not found. Please set it as an environment variable in your Hugging Face Space.")
-SAFE_BROWSING_URL = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
-def check_url_with_google_safebrowsing(url):
-    """
-    Check a URL against Google's Safe Browsing API.
-    """
-    payload = {
-        "client": {
-            "clientId": "your-client-id",
-            "clientVersion": "1.0"
-        },
-        "threatInfo": {
-            "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE", "POTENTIALLY_HARMFUL_APPLICATION"],
-            "platformTypes": ["ANY_PLATFORM"],
-            "threatEntryTypes": ["URL"],
-            "threatEntries": [
-                {"url": url}
-            ]
-        }
-    }
-    try:
-        response = requests.post(
-            SAFE_BROWSING_URL,
-            params={"key": SAFE_BROWSING_API_KEY},
-            json=payload
-        )
-        response_data = response.json()
-        if "matches" in response_data:
-            return True  # URL is flagged as malicious
-        return False  # URL is safe
-    except Exception as e:
-        print(f"Error checking URL with Safe Browsing API: {e}")
-        return False
 def get_keywords_by_language(text: str):
     """
     Detect language using `langdetect` and translate keywords if needed.
@@ -142,21 +83,9 @@ def boost_probabilities(probabilities: dict, text: str):
         "SMiShing": p_smishing,
         "Other Scam": p_other_scam,
         "Legitimate": p_legit,
-        "detected_lang": detected_lang,
     }
-def explain_classification(text):
-    """
-    Generate SHAP explanations for the classification.
-    """
-    if not text.strip():
-        raise ValueError("Cannot generate SHAP explanations for empty text.")
-    shap_values = explainer([text])
-    shap.force_plot(
-        explainer.expected_value[0], shap_values[0].values[0], shap_values[0].data
-    )
 def smishing_detector(text, image):
     """
     Main detection function combining text and OCR.
@@ -173,8 +102,7 @@ def smishing_detector(text, image):
             "label": "No text provided",
             "confidence": 0.0,
             "keywords_found": [],
-            "urls_found": [],
-            "threat_analysis": "No URLs to analyze",
         }
     result = classifier(
@@ -197,14 +125,6 @@ def smishing_detector(text, image):
     found_smishing = [kw for kw in smishing_keys if kw in lower_text]
     found_other_scam = [kw for kw in scam_keys if kw in lower_text]
-    # Analyze URLs using Google's Safe Browsing API
-    threat_analysis = {
-        url: check_url_with_google_safebrowsing(url) for url in found_urls
-    }
-    # SHAP Explanation (optional for user insights)
-    explain_classification(combined_text)
     return {
         "detected_language": detected_lang,
         "text_used_for_classification": combined_text,
@@ -215,7 +135,6 @@ def smishing_detector(text, image):
         "smishing_keywords_found": found_smishing,
         "other_scam_keywords_found": found_other_scam,
         "urls_found": found_urls,
-        "threat_analysis": threat_analysis,
     }
 demo = gr.Interface(
@@ -232,14 +151,15 @@ demo = gr.Interface(
         )
     ],
     outputs="json",
-    title="SMiShing & Scam Detector with Safe Browsing",
     description="""
 This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
 (joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
-It uses SHAP for explainability and checks URLs against Google's Safe Browsing API for enhanced analysis.
-    """,
-    flagging_mode="never"
 )
 if __name__ == "__main__":
-    demo.launch()

 import re
 from langdetect import detect
 from deep_translator import GoogleTranslator
 # Translator instance
 translator = GoogleTranslator(source="auto", target="es")
 classifier = pipeline("zero-shot-classification", model=model_name)
 CANDIDATE_LABELS = ["SMiShing", "Other Scam", "Legitimate"]
 def get_keywords_by_language(text: str):
     """
     Detect language using `langdetect` and translate keywords if needed.
         "SMiShing": p_smishing,
         "Other Scam": p_other_scam,
         "Legitimate": p_legit,
+        "detected_lang": detected_lang
     }
 def smishing_detector(text, image):
     """
     Main detection function combining text and OCR.
             "label": "No text provided",
             "confidence": 0.0,
             "keywords_found": [],
+            "urls_found": []
         }
     result = classifier(
     found_smishing = [kw for kw in smishing_keys if kw in lower_text]
     found_other_scam = [kw for kw in scam_keys if kw in lower_text]
     return {
         "detected_language": detected_lang,
         "text_used_for_classification": combined_text,
         "smishing_keywords_found": found_smishing,
         "other_scam_keywords_found": found_other_scam,
         "urls_found": found_urls,
     }
 demo = gr.Interface(
         )
     ],
     outputs="json",
+    title="SMiShing & Scam Detector (Language Detection + Keyword Translation)",
     description="""
 This tool classifies messages as SMiShing, Other Scam, or Legitimate using a zero-shot model
 (joeddav/xlm-roberta-large-xnli). It automatically detects if the text is Spanish or English.
+If Spanish, it translates the English-based keyword lists to Spanish before boosting the scores.
+Any URL found further boosts SMiShing specifically.
+""",
+    allow_flagging="never"
 )
 if __name__ == "__main__":
+    demo.launch()