Spaces:
Sleeping
Sleeping
langdonholmes
committed on
Commit
β’
5c59636
1
Parent(s):
3fde3db
refactored but still needs stress testing
Browse files- spacy_analyzer.py β analyzer.py +23 -28
- anonymizer.py +54 -16
- app.py +34 -34
- data/{ascii_fb_names_small.parquet β ascii_names.parquet} +0 -0
- match_replace.py +0 -117
- names_database.py +21 -7
spacy_analyzer.py β analyzer.py
RENAMED
@@ -1,36 +1,31 @@
|
|
1 |
-
|
2 |
-
from presidio_analyzer import (
|
3 |
-
AnalyzerEngine,
|
4 |
-
RecognizerResult,
|
5 |
-
RecognizerRegistry,
|
6 |
-
LocalRecognizer,
|
7 |
-
AnalysisExplanation,
|
8 |
-
)
|
9 |
-
|
10 |
-
from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpArtifacts
|
11 |
from typing import Optional
|
12 |
|
13 |
-
import
|
14 |
-
|
|
|
|
|
|
|
|
|
15 |
|
16 |
class CustomSpacyRecognizer(LocalRecognizer):
|
17 |
ENTITIES = [
|
18 |
-
|
19 |
]
|
20 |
|
21 |
-
DEFAULT_EXPLANATION =
|
22 |
|
23 |
CHECK_LABEL_GROUPS = [
|
24 |
-
({
|
25 |
]
|
26 |
|
27 |
MODEL_LANGUAGES = {
|
28 |
-
|
29 |
}
|
30 |
|
31 |
def __init__(
|
32 |
self,
|
33 |
-
supported_language: str =
|
34 |
supported_entities: Optional[list[str]] = None,
|
35 |
check_label_groups: Optional[tuple[set, set]] = None,
|
36 |
ner_strength: float = 0.85,
|
@@ -46,25 +41,25 @@ class CustomSpacyRecognizer(LocalRecognizer):
|
|
46 |
)
|
47 |
|
48 |
def load(self) -> None:
|
49 |
-
|
50 |
pass
|
51 |
|
52 |
def get_supported_entities(self) -> list[str]:
|
53 |
-
|
54 |
Return supported entities by this model.
|
55 |
:return: List of the supported entities.
|
56 |
-
|
57 |
return self.supported_entities
|
58 |
|
59 |
def build_spacy_explanation(
|
60 |
self, original_score: float, explanation: str
|
61 |
) -> AnalysisExplanation:
|
62 |
-
|
63 |
Create explanation for why this result was detected.
|
64 |
:param original_score: Score given by this recognizer
|
65 |
:param explanation: Explanation string
|
66 |
:return:
|
67 |
-
|
68 |
explanation = AnalysisExplanation(
|
69 |
recognizer=self.__class__.__name__,
|
70 |
original_score=original_score,
|
@@ -76,15 +71,15 @@ class CustomSpacyRecognizer(LocalRecognizer):
|
|
76 |
text: str,
|
77 |
entities: list[str] = None,
|
78 |
nlp_artifacts: NlpArtifacts = None):
|
79 |
-
|
80 |
|
81 |
-
if not entities or
|
82 |
entities = None
|
83 |
|
84 |
results = []
|
85 |
|
86 |
if not nlp_artifacts:
|
87 |
-
logger.warning(
|
88 |
return results
|
89 |
|
90 |
ner_entities = nlp_artifacts.entities
|
@@ -123,7 +118,7 @@ class CustomSpacyRecognizer(LocalRecognizer):
|
|
123 |
)
|
124 |
|
125 |
def prepare_analyzer(configuration):
|
126 |
-
|
127 |
|
128 |
spacy_recognizer = CustomSpacyRecognizer()
|
129 |
|
@@ -137,10 +132,10 @@ def prepare_analyzer(configuration):
|
|
137 |
registry.add_recognizer(spacy_recognizer)
|
138 |
|
139 |
# remove the nlp engine we passed, to use custom label mappings
|
140 |
-
registry.remove_recognizer(
|
141 |
|
142 |
analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
|
143 |
registry=registry,
|
144 |
-
supported_languages=[
|
145 |
|
146 |
return analyzer
|
|
|
1 |
+
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from typing import Optional
|
3 |
|
4 |
+
from presidio_analyzer import (AnalysisExplanation, AnalyzerEngine,
|
5 |
+
LocalRecognizer, RecognizerRegistry,
|
6 |
+
RecognizerResult)
|
7 |
+
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
|
8 |
+
|
9 |
+
logger = logging.getLogger('presidio-analyzer')
|
10 |
|
11 |
class CustomSpacyRecognizer(LocalRecognizer):
|
12 |
ENTITIES = [
|
13 |
+
'STUDENT',
|
14 |
]
|
15 |
|
16 |
+
DEFAULT_EXPLANATION = 'Identified as {} by a Student Name Detection Model'
|
17 |
|
18 |
CHECK_LABEL_GROUPS = [
|
19 |
+
({'STUDENT'}, {'STUDENT'}),
|
20 |
]
|
21 |
|
22 |
MODEL_LANGUAGES = {
|
23 |
+
'en': 'langdonholmes/en_student_name_detector',
|
24 |
}
|
25 |
|
26 |
def __init__(
|
27 |
self,
|
28 |
+
supported_language: str = 'en',
|
29 |
supported_entities: Optional[list[str]] = None,
|
30 |
check_label_groups: Optional[tuple[set, set]] = None,
|
31 |
ner_strength: float = 0.85,
|
|
|
41 |
)
|
42 |
|
43 |
def load(self) -> None:
    '''No-op: the detection model is loaded during initialization.'''
    return None
|
46 |
|
47 |
def get_supported_entities(self) -> list[str]:
    '''Return the list of entity labels this recognizer supports.

    :return: List of the supported entities.
    '''
    return self.supported_entities
|
53 |
|
54 |
def build_spacy_explanation(
|
55 |
self, original_score: float, explanation: str
|
56 |
) -> AnalysisExplanation:
|
57 |
+
'''
|
58 |
Create explanation for why this result was detected.
|
59 |
:param original_score: Score given by this recognizer
|
60 |
:param explanation: Explanation string
|
61 |
:return:
|
62 |
+
'''
|
63 |
explanation = AnalysisExplanation(
|
64 |
recognizer=self.__class__.__name__,
|
65 |
original_score=original_score,
|
|
|
71 |
text: str,
|
72 |
entities: list[str] = None,
|
73 |
nlp_artifacts: NlpArtifacts = None):
|
74 |
+
'''Analyze input using Analyzer engine and input arguments (kwargs).'''
|
75 |
|
76 |
+
if not entities or 'All' in entities:
|
77 |
entities = None
|
78 |
|
79 |
results = []
|
80 |
|
81 |
if not nlp_artifacts:
|
82 |
+
logger.warning('Skipping SpaCy, nlp artifacts not provided...')
|
83 |
return results
|
84 |
|
85 |
ner_entities = nlp_artifacts.entities
|
|
|
118 |
)
|
119 |
|
120 |
def prepare_analyzer(configuration):
|
121 |
+
'''Handle Preparation of Analyzer Engine for Presidio.'''
|
122 |
|
123 |
spacy_recognizer = CustomSpacyRecognizer()
|
124 |
|
|
|
132 |
registry.add_recognizer(spacy_recognizer)
|
133 |
|
134 |
# remove the nlp engine we passed, to use custom label mappings
|
135 |
+
registry.remove_recognizer('SpacyRecognizer')
|
136 |
|
137 |
analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
|
138 |
registry=registry,
|
139 |
+
supported_languages=['en'])
|
140 |
|
141 |
return analyzer
|
anonymizer.py
CHANGED
@@ -1,24 +1,55 @@
|
|
|
|
1 |
from presidio_anonymizer import AnonymizerEngine
|
2 |
from presidio_anonymizer.entities import OperatorConfig
|
3 |
-
from presidio_analyzer import RecognizerResult
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
|
9 |
-
def
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
def anonymize(
|
17 |
anonymizer: AnonymizerEngine,
|
18 |
text: str,
|
19 |
analyze_results: list[RecognizerResult]
|
20 |
):
|
21 |
-
|
22 |
|
23 |
if not text:
|
24 |
return
|
@@ -27,11 +58,18 @@ def anonymize(
|
|
27 |
text,
|
28 |
analyze_results,
|
29 |
operators={
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
34 |
}
|
35 |
)
|
36 |
|
37 |
-
return res.text
|
|
|
|
|
|
|
|
1 |
+
from presidio_analyzer import RecognizerResult
|
2 |
from presidio_anonymizer import AnonymizerEngine
|
3 |
from presidio_anonymizer.entities import OperatorConfig
|
|
|
4 |
|
5 |
+
from names_database import NameDatabase
|
6 |
+
|
7 |
+
names_db = NameDatabase()
|
8 |
|
9 |
+
def split_name(original_name: str):
    '''Split a full name into (first, last) name parts.

    One token is treated as a first name only; two tokens as first and
    last; three tokens as one first and two last names; four tokens as
    two first and two last names. Any other token count yields
    (None, None).
    '''
    tokens = original_name.split()
    count = len(tokens)
    if count == 1:
        return tokens[0], None
    if count == 2:
        return tokens[0], tokens[1]
    if count == 3:
        return tokens[0], ' '.join(tokens[1:])
    if count == 4:
        return ' '.join(tokens[:2]), ' '.join(tokens[2:])
    return None, None
|
26 |
+
|
27 |
+
def generate_surrogate(original_name: str):
    '''Generate a surrogate (replacement) name for *original_name*.

    Gender is inferred from the first name(s) and country from the last
    name(s); random replacement rows matching both are drawn from the
    names database. A surrogate last name is appended only when the
    original name had a last-name part.
    '''
    first_names, last_names = split_name(original_name)

    name_candidates = names_db.get_random_name(
        gender=names_db.get_gender(first_names) if first_names else None,
        country=names_db.get_country(last_names) if last_names else None)

    # Row 0 supplies the first name; row 1 supplies the last name so the
    # two parts are drawn independently.
    parts = [name_candidates.iloc[0]['first']]
    if last_names:
        parts.append(name_candidates.iloc[1]['last'])
    return ' '.join(parts)
|
46 |
|
47 |
def anonymize(
|
48 |
anonymizer: AnonymizerEngine,
|
49 |
text: str,
|
50 |
analyze_results: list[RecognizerResult]
|
51 |
):
|
52 |
+
'''Anonymize identified input using Presidio Anonymizer.'''
|
53 |
|
54 |
if not text:
|
55 |
return
|
|
|
58 |
text,
|
59 |
analyze_results,
|
60 |
operators={
|
61 |
+
'STUDENT': OperatorConfig('custom',
|
62 |
+
{'lambda': generate_surrogate}),
|
63 |
+
'EMAIL_ADDRESS': OperatorConfig('replace',
|
64 |
+
{'new_value': 'janedoe@aol.com'}),
|
65 |
+
'PHONE_NUMBER': OperatorConfig('replace',
|
66 |
+
{'new_value': '888-888-8888'}),
|
67 |
+
'URL': OperatorConfig('replace',
|
68 |
+
{'new_value': 'aol.com'}),
|
69 |
}
|
70 |
)
|
71 |
|
72 |
+
return res.text
|
73 |
+
|
74 |
+
if __name__ == '__main__':
|
75 |
+
print(generate_surrogate('Nora Wang'))
|
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
|
2 |
-
|
3 |
|
4 |
-
from
|
5 |
from anonymizer import anonymize
|
6 |
from presidio_anonymizer import AnonymizerEngine
|
7 |
import pandas as pd
|
@@ -11,18 +11,18 @@ import json
|
|
11 |
import warnings
|
12 |
import streamlit as st
|
13 |
import os
|
14 |
-
os.environ[
|
15 |
warnings.filterwarnings('ignore')
|
16 |
|
17 |
# Helper methods
|
18 |
@st.cache(allow_output_mutation=True)
|
19 |
def analyzer_engine():
|
20 |
-
|
21 |
|
22 |
configuration = {
|
23 |
-
|
24 |
-
|
25 |
-
{
|
26 |
}
|
27 |
|
28 |
analyzer = prepare_analyzer(configuration)
|
@@ -31,7 +31,7 @@ def analyzer_engine():
|
|
31 |
|
32 |
@st.cache(allow_output_mutation=True)
|
33 |
def anonymizer_engine():
|
34 |
-
|
35 |
return AnonymizerEngine()
|
36 |
|
37 |
def annotate(text, st_analyze_results, st_entities):
|
@@ -54,57 +54,57 @@ def annotate(text, st_analyze_results, st_entities):
|
|
54 |
return tokens
|
55 |
|
56 |
|
57 |
-
st.set_page_config(page_title=
|
58 |
|
59 |
# Side bar
|
60 |
st.sidebar.markdown(
|
61 |
-
|
62 |
-
|
63 |
)
|
64 |
|
65 |
st_entities = st.sidebar.multiselect(
|
66 |
-
label=
|
67 |
options=analyzer_engine().get_supported_entities(),
|
68 |
default=list(analyzer_engine().get_supported_entities()),
|
69 |
)
|
70 |
|
71 |
st_threshold = st.sidebar.slider(
|
72 |
-
label=
|
73 |
)
|
74 |
|
75 |
st_return_decision_process = st.sidebar.checkbox(
|
76 |
-
|
77 |
|
78 |
st.sidebar.info(
|
79 |
-
|
80 |
)
|
81 |
|
82 |
# Main panel
|
83 |
analyzer_load_state = st.info(
|
84 |
-
|
85 |
engine = analyzer_engine()
|
86 |
analyzer_load_state.empty()
|
87 |
|
88 |
|
89 |
st_text = st.text_area(
|
90 |
-
label=
|
91 |
-
value=
|
92 |
height=200,
|
93 |
)
|
94 |
|
95 |
-
button = st.button(
|
96 |
|
97 |
if 'first_load' not in st.session_state:
|
98 |
st.session_state['first_load'] = True
|
99 |
|
100 |
# After
|
101 |
-
st.subheader(
|
102 |
-
with st.spinner(
|
103 |
if button or st.session_state.first_load:
|
104 |
st_analyze_results = analyzer_engine().analyze(
|
105 |
text=st_text,
|
106 |
entities=st_entities,
|
107 |
-
language=
|
108 |
score_threshold=st_threshold,
|
109 |
return_decision_process=st_return_decision_process,
|
110 |
)
|
@@ -113,11 +113,11 @@ with st.spinner("Analyzing..."):
|
|
113 |
annotated_text(*annotated_tokens)
|
114 |
|
115 |
# vertical space
|
116 |
-
st.text(
|
117 |
|
118 |
-
st.subheader(
|
119 |
|
120 |
-
with st.spinner(
|
121 |
if button or st.session_state.first_load:
|
122 |
st_anonymize_results = anonymize(anonymizer_engine(),
|
123 |
st_text,
|
@@ -125,34 +125,34 @@ with st.spinner("Anonymizing..."):
|
|
125 |
st_anonymize_results
|
126 |
|
127 |
# table result
|
128 |
-
st.subheader(
|
129 |
if st_analyze_results:
|
130 |
res_dicts = [r.to_dict() for r in st_analyze_results]
|
131 |
for d in res_dicts:
|
132 |
d['Value'] = st_text[d['start']:d['end']]
|
133 |
df = pd.DataFrame.from_records(res_dicts)
|
134 |
-
df = df[[
|
135 |
{
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
},
|
141 |
axis=1,
|
142 |
)
|
143 |
|
144 |
st.dataframe(df, width=1000)
|
145 |
else:
|
146 |
-
st.text(
|
147 |
|
148 |
st.session_state['first_load'] = True
|
149 |
|
150 |
# json result
|
151 |
class ToDictListEncoder(JSONEncoder):
|
152 |
-
|
153 |
|
154 |
def default(self, o):
|
155 |
-
|
156 |
if o:
|
157 |
return o.to_dict()
|
158 |
return []
|
|
|
1 |
|
2 |
+
'''Streamlit app for Student Name Detection models.'''
|
3 |
|
4 |
+
from analyzer import prepare_analyzer
|
5 |
from anonymizer import anonymize
|
6 |
from presidio_anonymizer import AnonymizerEngine
|
7 |
import pandas as pd
|
|
|
11 |
import warnings
|
12 |
import streamlit as st
|
13 |
import os
|
14 |
+
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
15 |
warnings.filterwarnings('ignore')
|
16 |
|
17 |
# Helper methods
|
18 |
@st.cache(allow_output_mutation=True)
|
19 |
def analyzer_engine():
|
20 |
+
'''Return AnalyzerEngine and cache with Streamlit.'''
|
21 |
|
22 |
configuration = {
|
23 |
+
'nlp_engine_name': 'spacy',
|
24 |
+
'models': [
|
25 |
+
{'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
|
26 |
}
|
27 |
|
28 |
analyzer = prepare_analyzer(configuration)
|
|
|
31 |
|
32 |
@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    '''Return the Presidio AnonymizerEngine, cached by Streamlit.'''
    engine = AnonymizerEngine()
    return engine
|
36 |
|
37 |
def annotate(text, st_analyze_results, st_entities):
|
|
|
54 |
return tokens
|
55 |
|
56 |
|
57 |
+
st.set_page_config(page_title='Student Name Detector (English)', layout='wide')
|
58 |
|
59 |
# Side bar
|
60 |
st.sidebar.markdown(
|
61 |
+
'''Detect and anonymize PII in text using an [NLP model](https://huggingface.co/langdonholmes/en_student_name_detector) [trained](https://github.com/aialoe/deidentification-pipeline) on student-generated text collected by Coursera.
|
62 |
+
'''
|
63 |
)
|
64 |
|
65 |
st_entities = st.sidebar.multiselect(
|
66 |
+
label='Which entities to look for?',
|
67 |
options=analyzer_engine().get_supported_entities(),
|
68 |
default=list(analyzer_engine().get_supported_entities()),
|
69 |
)
|
70 |
|
71 |
st_threshold = st.sidebar.slider(
|
72 |
+
label='Acceptance threshold', min_value=0.0, max_value=1.0, value=0.35
|
73 |
)
|
74 |
|
75 |
st_return_decision_process = st.sidebar.checkbox(
|
76 |
+
'Add analysis explanations in json')
|
77 |
|
78 |
st.sidebar.info(
|
79 |
+
'This is part of a deidentification project for student-generated text.'
|
80 |
)
|
81 |
|
82 |
# Main panel
|
83 |
analyzer_load_state = st.info(
|
84 |
+
'Starting Presidio analyzer and loading Longformer-based model...')
|
85 |
engine = analyzer_engine()
|
86 |
analyzer_load_state.empty()
|
87 |
|
88 |
|
89 |
st_text = st.text_area(
|
90 |
+
label='Type in some text',
|
91 |
+
value='Learning Reflection\n\nWritten by John Williams and Samantha Morales\n\nIn this course I learned many things. As Liedtke (2004) said, \"Students grow when they learn\" (Erickson et al. 1998).\n\nBy John H. Williams -- (714) 328-9989 -- [email protected]',
|
92 |
height=200,
|
93 |
)
|
94 |
|
95 |
+
button = st.button('Detect PII')
|
96 |
|
97 |
if 'first_load' not in st.session_state:
|
98 |
st.session_state['first_load'] = True
|
99 |
|
100 |
# After
|
101 |
+
st.subheader('Analyzed')
|
102 |
+
with st.spinner('Analyzing...'):
|
103 |
if button or st.session_state.first_load:
|
104 |
st_analyze_results = analyzer_engine().analyze(
|
105 |
text=st_text,
|
106 |
entities=st_entities,
|
107 |
+
language='en',
|
108 |
score_threshold=st_threshold,
|
109 |
return_decision_process=st_return_decision_process,
|
110 |
)
|
|
|
113 |
annotated_text(*annotated_tokens)
|
114 |
|
115 |
# vertical space
|
116 |
+
st.text('')
|
117 |
|
118 |
+
st.subheader('Anonymized')
|
119 |
|
120 |
+
with st.spinner('Anonymizing...'):
|
121 |
if button or st.session_state.first_load:
|
122 |
st_anonymize_results = anonymize(anonymizer_engine(),
|
123 |
st_text,
|
|
|
125 |
st_anonymize_results
|
126 |
|
127 |
# table result
|
128 |
+
st.subheader('Detailed Findings')
|
129 |
if st_analyze_results:
|
130 |
res_dicts = [r.to_dict() for r in st_analyze_results]
|
131 |
for d in res_dicts:
|
132 |
d['Value'] = st_text[d['start']:d['end']]
|
133 |
df = pd.DataFrame.from_records(res_dicts)
|
134 |
+
df = df[['entity_type', 'Value', 'score', 'start', 'end']].rename(
|
135 |
{
|
136 |
+
'entity_type': 'Entity type',
|
137 |
+
'start': 'Start',
|
138 |
+
'end': 'End',
|
139 |
+
'score': 'Confidence',
|
140 |
},
|
141 |
axis=1,
|
142 |
)
|
143 |
|
144 |
st.dataframe(df, width=1000)
|
145 |
else:
|
146 |
+
st.text('No findings')
|
147 |
|
148 |
st.session_state['first_load'] = True
|
149 |
|
150 |
# json result
|
151 |
class ToDictListEncoder(JSONEncoder):
    '''JSON encoder that serializes objects via their to_dict() method.'''

    def default(self, o):
        '''Serialize *o* using to_dict(); falsy values encode as [].'''
        return o.to_dict() if o else []
|
data/{ascii_fb_names_small.parquet β ascii_names.parquet}
RENAMED
File without changes
|
match_replace.py
DELETED
@@ -1,117 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
|
3 |
-
from names_database import NameDatabase
|
4 |
-
|
5 |
-
names_db = NameDatabase
|
6 |
-
|
7 |
-
def describe_name(first_names, last_names):
|
8 |
-
gender = names_db.get_gender() if first_names else None
|
9 |
-
country = names_db.get_country() if last_names else None
|
10 |
-
return gender, country
|
11 |
-
|
12 |
-
def split_name(all_names):
|
13 |
-
'''Splits name into parts.
|
14 |
-
If one token, assume it is a first name.
|
15 |
-
If two tokens, first and last name.
|
16 |
-
If three tokens, one first name and two last names.
|
17 |
-
If four tokens, two first names and two last names.'''
|
18 |
-
match all_names.split():
|
19 |
-
case [first]:
|
20 |
-
return first, None
|
21 |
-
case [first, last]:
|
22 |
-
return first, last
|
23 |
-
case [first, last_1, last_2]:
|
24 |
-
return first, ' '.join((last_1, last_2))
|
25 |
-
case [first_1, first_2, last_1, last_2]:
|
26 |
-
return ' '.join((first_1, first_2)), ' '.join((last_1, last_2))
|
27 |
-
case _:
|
28 |
-
return None, None
|
29 |
-
|
30 |
-
def match_name(original_name):
|
31 |
-
# FIXME: take too LONG time to run (large df used multi-times), how to improve
|
32 |
-
# FIXME: here we only keep the first name for now
|
33 |
-
# TODO: how to match both first and last? -- first name match gender, last name match country?
|
34 |
-
# gender is not applied to last name
|
35 |
-
# the name distinguished by first and last?
|
36 |
-
# FIXME: since it is completely random, the same original name may be diff after replacing. How to know whether the two names is the same person?
|
37 |
-
first_name = original_name.split()[0]
|
38 |
-
global fb_df
|
39 |
-
names = fb_df[fb_df['first']==first_name]
|
40 |
-
if not names.empty:
|
41 |
-
name_df = names.sample(n=1)
|
42 |
-
# prevent for same name - deleting same name from df
|
43 |
-
new_df = fb_df[fb_df['first'] != first_name]
|
44 |
-
new_name = replace_name(name_df, new_df)
|
45 |
-
return new_name
|
46 |
-
else:
|
47 |
-
return 'Jane Doe'
|
48 |
-
|
49 |
-
def replace_name(name_df, new_df):
|
50 |
-
"""
|
51 |
-
:param name_df: df that match the original first name -> data frame
|
52 |
-
:param new_df: df that does not repeat with original name
|
53 |
-
:return: whole name: that match country & gender -> str
|
54 |
-
"""
|
55 |
-
gender = name_df['gender'].to_string(index=False)
|
56 |
-
country = name_df['country'].to_string(index=False)
|
57 |
-
|
58 |
-
# match country, then match gender
|
59 |
-
country_df = new_df[new_df['country'] == country]
|
60 |
-
country_g_df = country_df[country_df['gender'] == gender]
|
61 |
-
|
62 |
-
first = country_g_df['first'].sample(n=1).to_string(index=False)
|
63 |
-
last = country_g_df['last'].sample(n=1).to_string(index=False)
|
64 |
-
return first+' '+last
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
def match_name_2(original_name):
|
69 |
-
"""
|
70 |
-
Work by match gender from first name, match country from the last name
|
71 |
-
:param original_name:
|
72 |
-
:return:
|
73 |
-
"""
|
74 |
-
global fb_df
|
75 |
-
fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
|
76 |
-
# FIXME: work when get a full name, may need branch to only first or last name....
|
77 |
-
gender = name_match_gender(original_name.split()[0])
|
78 |
-
print(original_name.split()[1])
|
79 |
-
country = name_match_country(original_name.split()[-1])
|
80 |
-
return replace_name_2(gender, country)
|
81 |
-
|
82 |
-
|
83 |
-
def name_match_country(last_name):
|
84 |
-
names = fb_df[fb_df['last'] == last_name]
|
85 |
-
if not names.empty:
|
86 |
-
country = names['country'].sample(n=1).to_string(index=False)
|
87 |
-
return country
|
88 |
-
else:
|
89 |
-
return 'US'
|
90 |
-
|
91 |
-
def name_match_gender(first_name):
|
92 |
-
names = fb_df[fb_df['first'] == first_name]
|
93 |
-
gender = names['gender'].sample(n=1).to_string(index=False)
|
94 |
-
return gender
|
95 |
-
|
96 |
-
def replace_name_2(gender, country):
|
97 |
-
# TODO: prevent same name
|
98 |
-
country_df = fb_df[fb_df['country'] == country]
|
99 |
-
country_g_df = country_df[country_df['gender'] == gender]
|
100 |
-
|
101 |
-
first = country_g_df['first'].sample(n=1).to_string(index=False)
|
102 |
-
last = country_g_df['last'].sample(n=1).to_string(index=False)
|
103 |
-
full_name = first +' ' + last
|
104 |
-
return full_name
|
105 |
-
|
106 |
-
def replace_text(str_list):
|
107 |
-
surrogate_text = ''
|
108 |
-
for i in str_list:
|
109 |
-
if isinstance(i, tuple):
|
110 |
-
i = match_entity(i[0], i[1])
|
111 |
-
surrogate_text += i
|
112 |
-
return surrogate_text
|
113 |
-
|
114 |
-
if __name__ == "__main__":
|
115 |
-
fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
|
116 |
-
# print(matching("PH", 'female', 'first', 'Momo', fb_df))
|
117 |
-
print(match_entity('Nora Wang', 'STUDENT'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
names_database.py
CHANGED
@@ -1,10 +1,15 @@
|
|
1 |
-
from
|
2 |
from typing import Optional
|
3 |
|
|
|
|
|
|
|
|
|
|
|
4 |
class NameDatabase(NameDataset):
|
5 |
def __init__(self) -> None:
|
6 |
super().__init__()
|
7 |
-
self.names = pd.read_parquet(
|
8 |
|
9 |
def get_random_name(
|
10 |
self,
|
@@ -12,17 +17,26 @@ class NameDatabase(NameDataset):
|
|
12 |
gender: Optional[str] = None
|
13 |
):
|
14 |
'''country: ISO country code in 'alpha 2' format
|
15 |
-
gender:
|
|
|
16 |
'''
|
17 |
names_view = self.names
|
18 |
if country:
|
19 |
names_view = names_view[names_view['country'] == country]
|
20 |
if gender:
|
21 |
names_view = names_view[names_view['gender'] == gender]
|
22 |
-
|
|
|
|
|
23 |
|
24 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
return NameWrapper(self.search(first_names)).gender
|
26 |
-
|
27 |
-
def get_country(last_names: str):
|
28 |
return NameWrapper(self.search(last_names)).country
|
|
|
1 |
+
from pathlib import Path
|
2 |
from typing import Optional
|
3 |
|
4 |
+
import pandas as pd
|
5 |
+
from names_dataset import NameDataset, NameWrapper
|
6 |
+
|
7 |
+
name_table = Path('data', 'ascii_names.parquet')
|
8 |
+
|
9 |
class NameDatabase(NameDataset):
|
10 |
def __init__(self) -> None:
|
11 |
super().__init__()
|
12 |
+
self.names = pd.read_parquet(name_table)
|
13 |
|
14 |
def get_random_name(
|
15 |
self,
|
|
|
17 |
gender: Optional[str] = None
|
18 |
):
|
19 |
'''country: ISO country code in 'alpha 2' format
|
20 |
+
gender: 'M' or 'F'
|
21 |
+
returns two rows of the names dataframe
|
22 |
'''
|
23 |
names_view = self.names
|
24 |
if country:
|
25 |
names_view = names_view[names_view['country'] == country]
|
26 |
if gender:
|
27 |
names_view = names_view[names_view['gender'] == gender]
|
28 |
+
if names_view.size < 25:
|
29 |
+
return self.names.sample(n=2, weights=self.names['count'])
|
30 |
+
return names_view.sample(n=2, weights=names_view['count'])
|
31 |
|
32 |
+
def search(self, name: str):
    '''Look up *name* in the first- and last-name tables.

    The key is normalized (stripped and title-cased) before lookup.
    Returns a dict with 'first_name' and 'last_name' entries, each the
    matching record or None when absent (or when the table is None).
    '''
    key = name.strip().title()
    first = self.first_names.get(key) if self.first_names is not None else None
    last = self.last_names.get(key) if self.last_names is not None else None
    return {'first_name': first, 'last_name': last}
|
37 |
+
|
38 |
+
def get_gender(self, first_names: str):
    '''Infer the most likely gender for the given first name(s).'''
    record = self.search(first_names)
    return NameWrapper(record).gender
|
40 |
+
|
41 |
+
def get_country(self, last_names: str):
    '''Infer the most likely country for the given last name(s).'''
    record = self.search(last_names)
    return NameWrapper(record).country
|