langdonholmes committed on
Commit
c30df3e
β€’
2 Parent(s): d6c1f97 287a33f

Merge branch 'refactor'

Browse files
__pycache__/spacy_analyzer.cpython-310.pyc ADDED
Binary file (4.17 kB). View file
 
anonymize.py DELETED
@@ -1,44 +0,0 @@
1
- from spacy_recognizer import CustomSpacyRecognizer
2
- from presidio_analyzer.nlp_engine import NlpEngineProvider
3
- from presidio_anonymizer import AnonymizerEngine
4
- from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
5
- from presidio_anonymizer.entities import OperatorConfig
6
- import pandas as pd
7
- from json import JSONEncoder
8
- import json
9
- import warnings
10
- import os
11
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
- warnings.filterwarnings('ignore')
13
-
14
def prepare_analyzer(configuration):
    """Build a Presidio AnalyzerEngine wired up with the custom spaCy recognizer.

    :param configuration: nlp_configuration dict for NlpEngineProvider
        (e.g. {"nlp_engine_name": "spacy", "models": [...]}).
    :return: AnalyzerEngine configured for English.
    """
    spacy_recognizer = CustomSpacyRecognizer()

    # Create NLP engine based on configuration.
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    # Start from the predefined rule-based recognizers, then register
    # our custom student-name recognizer on top.
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    registry.add_recognizer(spacy_recognizer)

    # Remove the stock spaCy recognizer so the custom label mappings win.
    registry.remove_recognizer("SpacyRecognizer")

    # BUG FIX: removed leftover debug statement print('Hallej').
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                              registry=registry,
                              supported_languages=["en"])

    return analyzer
38
-
39
def generate_surrogate(name):
    """Return an appropriate surrogate name for a detected name string."""
    # Substring match: anything containing "John" maps to "Jill";
    # every other name gets a generic placeholder.
    return "Jill" if "John" in name else "SURROGATE_NAME"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
anonymizer.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from presidio_anonymizer import AnonymizerEngine
2
+ from presidio_anonymizer.entities import OperatorConfig
3
+ from presidio_analyzer import RecognizerResult
4
+
5
def retrieve_name_records():
    """Read in a table of names with gender and country code fields."""
    # Placeholder: data loading is not implemented yet; returns None,
    # exactly as the bare `pass` stub did.
    return None
8
+
9
def generate_surrogate(name):
    """Return appropriate surrogate name from a text string."""
    # A name containing "John" is swapped for "Jill"; all others get
    # a fixed placeholder value.
    if "John" not in name:
        return "SURROGATE_NAME"
    return "Jill"
15
+
16
def anonymize(
    anonymizer: AnonymizerEngine,
    text: str,
    analyze_results: list[RecognizerResult]
):
    """Anonymize identified input using Presidio Anonymizer.

    :param anonymizer: engine that performs the replacements
    :param text: raw text to scrub
    :param analyze_results: entity spans found by the analyzer
    :return: anonymized text, or None when *text* is empty
    """
    # Nothing to anonymize in empty input; implicitly return None.
    if not text:
        return

    # Per-entity replacement policy: student names get a generated
    # surrogate, contact details are swapped for fixed placeholders.
    replacement_operators = {
        "STUDENT": OperatorConfig("custom", {"lambda": generate_surrogate}),
        "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "[email protected]"}),
        "PHONE_NUMBER": OperatorConfig("replace", {"new_value": "888-888-8888"}),
        "URL": OperatorConfig("replace", {"new_value": "aol.com"}),
    }

    result = anonymizer.anonymize(
        text,
        analyze_results,
        operators=replacement_operators,
    )

    return result.text
app.py CHANGED
@@ -1,9 +1,9 @@
1
 
2
  """Streamlit app for Student Name Detection models."""
3
 
4
- from anonymize import prepare_analyzer, generate_surrogate
 
5
  from presidio_anonymizer import AnonymizerEngine
6
- from presidio_anonymizer.entities import OperatorConfig
7
  import pandas as pd
8
  from annotated_text import annotated_text
9
  from json import JSONEncoder
@@ -17,7 +17,7 @@ warnings.filterwarnings('ignore')
17
  # Helper methods
18
  @st.cache(allow_output_mutation=True)
19
  def analyzer_engine():
20
- """Return AnalyzerEngine."""
21
 
22
  configuration = {
23
  "nlp_engine_name": "spacy",
@@ -34,27 +34,6 @@ def anonymizer_engine():
34
  """Return AnonymizerEngine."""
35
  return AnonymizerEngine()
36
 
37
- def get_supported_entities():
38
- """Return supported entities from the Analyzer Engine."""
39
- return analyzer_engine().get_supported_entities()
40
-
41
- def analyze(**kwargs):
42
- """Analyze input using Analyzer engine and input arguments (kwargs)."""
43
- if "entities" not in kwargs or "All" in kwargs["entities"]:
44
- kwargs["entities"] = None
45
- return analyzer_engine().analyze(**kwargs)
46
-
47
- def anonymize(text, analyze_results):
48
- """Anonymize identified input using Presidio Anonymizer."""
49
- if not text:
50
- return
51
- res = anonymizer_engine().anonymize(
52
- text,
53
- analyze_results,
54
- operators={"STUDENT": OperatorConfig("custom", {"lambda": generate_surrogate})}
55
- )
56
- return res.text
57
-
58
  def annotate(text, st_analyze_results, st_entities):
59
  tokens = []
60
  # sort by start index
@@ -85,8 +64,8 @@ st.sidebar.markdown(
85
 
86
  st_entities = st.sidebar.multiselect(
87
  label="Which entities to look for?",
88
- options=get_supported_entities(),
89
- default=list(get_supported_entities()),
90
  )
91
 
92
  st_threshold = st.sidebar.slider(
@@ -122,7 +101,7 @@ if 'first_load' not in st.session_state:
122
  st.subheader("Analyzed")
123
  with st.spinner("Analyzing..."):
124
  if button or st.session_state.first_load:
125
- st_analyze_results = analyze(
126
  text=st_text,
127
  entities=st_entities,
128
  language="en",
@@ -140,9 +119,11 @@ st.subheader("Anonymized")
140
 
141
  with st.spinner("Anonymizing..."):
142
  if button or st.session_state.first_load:
143
- st_anonymize_results = anonymize(st_text, st_analyze_results)
 
 
144
  st_anonymize_results
145
-
146
  # table result
147
  st.subheader("Detailed Findings")
148
  if st_analyze_results:
 
1
 
2
  """Streamlit app for Student Name Detection models."""
3
 
4
+ from spacy_analyzer import prepare_analyzer
5
+ from anonymizer import anonymize
6
  from presidio_anonymizer import AnonymizerEngine
 
7
  import pandas as pd
8
  from annotated_text import annotated_text
9
  from json import JSONEncoder
 
17
  # Helper methods
18
  @st.cache(allow_output_mutation=True)
19
  def analyzer_engine():
20
+ """Return AnalyzerEngine and cache with Streamlit."""
21
 
22
  configuration = {
23
  "nlp_engine_name": "spacy",
 
34
  """Return AnonymizerEngine."""
35
  return AnonymizerEngine()
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def annotate(text, st_analyze_results, st_entities):
38
  tokens = []
39
  # sort by start index
 
64
 
65
  st_entities = st.sidebar.multiselect(
66
  label="Which entities to look for?",
67
+ options=analyzer_engine().get_supported_entities(),
68
+ default=list(analyzer_engine().get_supported_entities()),
69
  )
70
 
71
  st_threshold = st.sidebar.slider(
 
101
  st.subheader("Analyzed")
102
  with st.spinner("Analyzing..."):
103
  if button or st.session_state.first_load:
104
+ st_analyze_results = analyzer_engine().analyze(
105
  text=st_text,
106
  entities=st_entities,
107
  language="en",
 
119
 
120
  with st.spinner("Anonymizing..."):
121
  if button or st.session_state.first_load:
122
+ st_anonymize_results = anonymize(anonymizer_engine(),
123
+ st_text,
124
+ st_analyze_results)
125
  st_anonymize_results
126
+
127
  # table result
128
  st.subheader("Detailed Findings")
129
  if st_analyze_results:
match_replace.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def replace_name_old(country_code, gender, f_l, original_name, fb_df):
    """
    Receiving country, gender, first_last name, and the original name.
    Match with a name that matches gender and country, randomly retrieved
    from the facebook dataset, falling back to gender-only, country-only,
    or fully random matches when narrower filters find nothing.
    Compare the surrogate name with the original name to make sure they
    are different.
    :param f_l: column to sample from, 'first' or 'last' -> str
    :return: the surrogate name as rendered by Series.to_string()
    """
    surrogate_name = original_name
    # Keep sampling until the surrogate differs from the original name.
    while surrogate_name == original_name:
        # BUG FIX: was `if not gender:`, which sent a known gender into the
        # gender-filter branch compared against None (matching nothing) and
        # skipped the gender filter whenever gender WAS provided. The code
        # comments make the intent clear: this branch is for a matchable
        # (truthy) gender.
        if gender:
            gender_df = fb_df[fb_df["gender"] == gender]
            gender_c_df = gender_df[gender_df["country"] == country_code]
            if gender_c_df.shape[0] > 0:
                # Gender and country both match.
                surrogate_name = gender_c_df[f_l].sample(n=1).to_string()
            else:
                # Gender matches, country doesn't: sample from gender only.
                surrogate_name = gender_df[f_l].sample(n=1).to_string()
        else:
            # Gender unavailable: try to match on country alone.
            country_df = fb_df[fb_df["country"] == country_code]
            if country_df.shape[0] > 0:
                surrogate_name = country_df[f_l].sample(n=1).to_string()
            else:
                # Neither gender nor country matched: sample from all rows.
                surrogate_name = fb_df[f_l].sample(n=1).to_string()

    return surrogate_name
52
+
53
def match_entity(original_info, entity):
    """Map a detected entity to its surrogate replacement value.

    Student names are matched against the name table; contact details
    get fixed placeholder values; unknown entity labels yield None.
    """
    # TODO: need refinement for each kind of entity
    if entity == 'STUDENT':
        # TODO: here, switch between match_name (1) and match_name_2 (2)
        return match_name_2(original_info)
    if entity == 'EMAIL_ADDRESS':
        return '[email protected]'
    if entity == 'PHONE_NUMBER':
        # TODO: specific form of number will be returned for consistency
        return '000-000-0000'
    if entity == 'URL':
        return 'google.com'
    # Unrecognized entity: nothing to substitute.
    return None
67
+
68
def match_name(original_name):
    """Surrogate *original_name* by matching its first name in the
    Facebook name table; fall back to 'Jane Doe' for unknown names.

    Only the first token of the name drives the match.
    """
    # FIXME: the parquet file is re-read on every call (large df used
    # multiple times) — slow; how to improve?
    # FIXME: replacement is random, so the same original name can map to
    # different surrogates across calls.
    global fb_df
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')

    first_name = original_name.split()[0]
    candidates = fb_df[fb_df['first'] == first_name]
    if candidates.empty:
        return 'Jane Doe'

    # Sample one matching record, then draw the surrogate from rows that
    # do NOT share the original first name (avoids echoing it back).
    name_df = candidates.sample(n=1)
    remaining = fb_df[fb_df['first'] != first_name]
    return replace_name(name_df, remaining)
87
+
88
def replace_name(name_df, new_df):
    """
    :param name_df: single-row frame matching the original first name
    :param new_df: frame with the original first name excluded
    :return: 'first last' string matching the record's country & gender
    """
    target_gender = name_df['gender'].to_string(index=False)
    target_country = name_df['country'].to_string(index=False)

    # Narrow to rows sharing both country and gender with the original;
    # country first, then gender.
    pool = new_df[new_df['country'] == target_country]
    pool = pool[pool['gender'] == target_gender]

    # First and last names are sampled independently of each other.
    first = pool['first'].sample(n=1).to_string(index=False)
    last = pool['last'].sample(n=1).to_string(index=False)
    return first + ' ' + last
104
+
105
+
106
+
107
def match_name_2(original_name):
    """
    Surrogate a full name by matching gender from the first name and
    country from the last name, then sampling a replacement.
    :param original_name: full name string, e.g. 'Nora Wang'
    :return: surrogate 'first last' string
    """
    global fb_df
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
    # FIXME: works when given a full name; may need a branch for
    # first-only or last-only inputs.
    # BUG FIX: removed leftover debug print(original_name.split()[1]),
    # which also raised IndexError for single-token names even though the
    # real logic safely uses [-1] for the last name.
    tokens = original_name.split()
    gender = name_match_gender(tokens[0])
    country = name_match_country(tokens[-1])
    return replace_name_2(gender, country)
120
+
121
+
122
def name_match_country(last_name):
    """Return a country code sampled from records sharing *last_name*;
    default to 'US' when the last name is not in the table."""
    # NOTE: relies on the module-level global fb_df being loaded.
    matches = fb_df[fb_df['last'] == last_name]
    if matches.empty:
        return 'US'
    return matches['country'].sample(n=1).to_string(index=False)
129
+
130
def name_match_gender(first_name):
    """Return a gender code sampled from records sharing *first_name*.

    NOTE(review): unlike name_match_country there is no fallback here —
    an unknown first name leaves an empty frame and sample() will raise;
    confirm whether callers guarantee known names.
    """
    # Relies on the module-level global fb_df being loaded.
    matches = fb_df[fb_df['first'] == first_name]
    return matches['gender'].sample(n=1).to_string(index=False)
134
+
135
def replace_name_2(gender, country):
    """Sample a surrogate 'first last' name matching *gender* and *country*."""
    # TODO: prevent returning a name identical to the original.
    # Narrow the global table by country, then by gender.
    pool = fb_df[fb_df['country'] == country]
    pool = pool[pool['gender'] == gender]

    # First and last name are drawn independently.
    first = pool['first'].sample(n=1).to_string(index=False)
    last = pool['last'].sample(n=1).to_string(index=False)
    return first + ' ' + last
144
+
145
def replace_text(str_list):
    """Reassemble annotated text, substituting surrogates for entities.

    :param str_list: mix of plain strings and (text, entity_label) tuples
    :return: concatenation with every tuple replaced by its surrogate
    """
    pieces = []
    for item in str_list:
        if isinstance(item, tuple):
            # (original_text, entity_label) -> surrogate value
            item = match_entity(item[0], item[1])
        pieces.append(item)
    return ''.join(pieces)
152
+
153
if __name__ == "__main__":
    # Manual smoke test: load the name table into the module-level global
    # used by the match_* helpers, then surrogate one sample student name.
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
    # print(matching("PH", 'female', 'first', 'Momo', fb_df))
    print(match_entity('Nora Wang', 'STUDENT'))
spacy_recognizer.py β†’ spacy_analyzer.py RENAMED
@@ -1,18 +1,19 @@
1
- import logging
2
- from typing import Optional, List, Tuple, Set
3
 
4
  from presidio_analyzer import (
 
5
  RecognizerResult,
 
6
  LocalRecognizer,
7
  AnalysisExplanation,
8
  )
9
- from presidio_analyzer.nlp_engine import NlpArtifacts
10
- from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
11
 
 
 
 
 
12
  logger = logging.getLogger("presidio-analyzer")
13
 
14
  class CustomSpacyRecognizer(LocalRecognizer):
15
-
16
  ENTITIES = [
17
  "STUDENT",
18
  ]
@@ -30,9 +31,8 @@ class CustomSpacyRecognizer(LocalRecognizer):
30
  def __init__(
31
  self,
32
  supported_language: str = "en",
33
- supported_entities: Optional[List[str]] = None,
34
- check_label_groups: Optional[Tuple[Set, Set]] = None,
35
- context: Optional[List[str]] = None,
36
  ner_strength: float = 0.85,
37
  ):
38
  self.ner_strength = ner_strength
@@ -49,7 +49,7 @@ class CustomSpacyRecognizer(LocalRecognizer):
49
  """Load the model, not used. Model is loaded during initialization."""
50
  pass
51
 
52
- def get_supported_entities(self) -> List[str]:
53
  """
54
  Return supported entities by this model.
55
  :return: List of the supported entities.
@@ -72,8 +72,17 @@ class CustomSpacyRecognizer(LocalRecognizer):
72
  )
73
  return explanation
74
 
75
- def analyze(self, text, entities, nlp_artifacts=None): # noqa D102
 
 
 
 
 
 
 
 
76
  results = []
 
77
  if not nlp_artifacts:
78
  logger.warning("Skipping SpaCy, nlp artifacts not provided...")
79
  return results
@@ -107,8 +116,31 @@ class CustomSpacyRecognizer(LocalRecognizer):
107
 
108
  @staticmethod
109
  def __check_label(
110
- entity: str, label: str, check_label_groups: Tuple[Set, Set]
111
  ) -> bool:
112
  return any(
113
  [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
114
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  from presidio_analyzer import (
3
+ AnalyzerEngine,
4
  RecognizerResult,
5
+ RecognizerRegistry,
6
  LocalRecognizer,
7
  AnalysisExplanation,
8
  )
 
 
9
 
10
+ from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpArtifacts
11
+ from typing import Optional
12
+
13
+ import logging
14
  logger = logging.getLogger("presidio-analyzer")
15
 
16
  class CustomSpacyRecognizer(LocalRecognizer):
 
17
  ENTITIES = [
18
  "STUDENT",
19
  ]
 
31
  def __init__(
32
  self,
33
  supported_language: str = "en",
34
+ supported_entities: Optional[list[str]] = None,
35
+ check_label_groups: Optional[tuple[set, set]] = None,
 
36
  ner_strength: float = 0.85,
37
  ):
38
  self.ner_strength = ner_strength
 
49
  """Load the model, not used. Model is loaded during initialization."""
50
  pass
51
 
52
+ def get_supported_entities(self) -> list[str]:
53
  """
54
  Return supported entities by this model.
55
  :return: List of the supported entities.
 
72
  )
73
  return explanation
74
 
75
+ def analyze(self,
76
+ text: str,
77
+ entities: list[str] = None,
78
+ nlp_artifacts: NlpArtifacts = None):
79
+ """Analyze input using Analyzer engine and input arguments (kwargs)."""
80
+
81
+ if not entities or "All" in entities:
82
+ entities = None
83
+
84
  results = []
85
+
86
  if not nlp_artifacts:
87
  logger.warning("Skipping SpaCy, nlp artifacts not provided...")
88
  return results
 
116
 
117
  @staticmethod
118
  def __check_label(
119
+ entity: str, label: str, check_label_groups: tuple[set, set]
120
  ) -> bool:
121
  return any(
122
  [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
123
+ )
124
+
125
def prepare_analyzer(configuration):
    """Handle Preparation of Analyzer Engine for Presidio.

    :param configuration: nlp_configuration dict for NlpEngineProvider
    :return: AnalyzerEngine configured for English
    """
    # Build the NLP engine from the supplied configuration.
    engine_provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = engine_provider.create_engine()

    # Load the predefined rule-based recognizers, then register our
    # custom student-name recognizer on top of them.
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    registry.add_recognizer(CustomSpacyRecognizer())

    # Drop the stock spaCy recognizer so the custom label mappings apply.
    registry.remove_recognizer("SpacyRecognizer")

    return AnalyzerEngine(nlp_engine=nlp_engine,
                          registry=registry,
                          supported_languages=["en"])