langdonholmes commited on
Commit
5806da1
β€’
1 Parent(s): b97a311

chasing streamlit bug

Browse files
__pycache__/spacy_analyzer.cpython-310.pyc ADDED
Binary file (4.17 kB). View file
 
anonymize.py CHANGED
@@ -1,44 +1,32 @@
1
- from spacy_recognizer import CustomSpacyRecognizer
2
- from presidio_analyzer.nlp_engine import NlpEngineProvider
3
  from presidio_anonymizer import AnonymizerEngine
4
- from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
5
  from presidio_anonymizer.entities import OperatorConfig
6
- import pandas as pd
7
- from json import JSONEncoder
8
- import json
9
- import warnings
10
- import os
11
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
- warnings.filterwarnings('ignore')
13
 
14
- def prepare_analyzer(configuration):
15
- """Return AnalyzerEngine."""
16
-
17
- spacy_recognizer = CustomSpacyRecognizer()
18
-
19
- print('Hallej')
20
-
21
- # Create NLP engine based on configuration
22
- provider = NlpEngineProvider(nlp_configuration=configuration)
23
- nlp_engine = provider.create_engine()
24
-
25
- # add rule-based recognizers
26
- registry = RecognizerRegistry()
27
- registry.load_predefined_recognizers(nlp_engine=nlp_engine)
28
- registry.add_recognizer(spacy_recognizer)
29
-
30
- # remove the nlp engine we passed, to use custom label mappings
31
- registry.remove_recognizer("SpacyRecognizer")
32
-
33
- analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
34
- registry=registry,
35
- supported_languages=["en"])
36
-
37
- return analyzer
38
 
39
  def generate_surrogate(name):
40
  """Return appropriate surrogate name from text string"""
41
  if "John" in name:
42
  return "Jill"
43
  else:
44
- return "SURROGATE_NAME"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from presidio_anonymizer import AnonymizerEngine
 
2
  from presidio_anonymizer.entities import OperatorConfig
3
+ from presidio_analyzer import RecognizerResult
 
 
 
 
 
 
4
 
5
def retrieve_name_records():
    """Read in a table of names with gender and country code fields."""
    # TODO(review): stub — no table is loaded yet; implement or remove.
    pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
def generate_surrogate(name):
    """Return appropriate surrogate name from text string"""
    # Placeholder logic: any name containing "John" becomes "Jill";
    # everything else gets a generic surrogate token.
    return "Jill" if "John" in name else "SURROGATE_NAME"
15
+
16
def anonymize(
    anonymizer: AnonymizerEngine,
    text: str,
    analyze_results: list[RecognizerResult]
):
    """Anonymize identified input using Presidio Anonymizer.

    Replaces each STUDENT span via the custom generate_surrogate operator
    and returns the anonymized text, or None for empty input.
    """
    # Nothing to anonymize.
    if not text:
        return

    operators = {
        "STUDENT": OperatorConfig("custom", {"lambda": generate_surrogate})
    }
    result = anonymizer.anonymize(text, analyze_results, operators=operators)
    return result.text
app.py CHANGED
@@ -1,9 +1,8 @@
1
 
2
  """Streamlit app for Student Name Detection models."""
3
 
4
- from anonymize import prepare_analyzer, generate_surrogate
5
  from presidio_anonymizer import AnonymizerEngine
6
- from presidio_anonymizer.entities import OperatorConfig
7
  import pandas as pd
8
  from annotated_text import annotated_text
9
  from json import JSONEncoder
@@ -17,7 +16,7 @@ warnings.filterwarnings('ignore')
17
  # Helper methods
18
  @st.cache(allow_output_mutation=True)
19
  def analyzer_engine():
20
- """Return AnalyzerEngine."""
21
 
22
  configuration = {
23
  "nlp_engine_name": "spacy",
@@ -34,28 +33,7 @@ def anonymizer_engine():
34
  """Return AnonymizerEngine."""
35
  return AnonymizerEngine()
36
 
37
- def get_supported_entities():
38
- """Return supported entities from the Analyzer Engine."""
39
- return analyzer_engine().get_supported_entities()
40
-
41
- def analyze(**kwargs):
42
- """Analyze input using Analyzer engine and input arguments (kwargs)."""
43
- if "entities" not in kwargs or "All" in kwargs["entities"]:
44
- kwargs["entities"] = None
45
- return analyzer_engine().analyze(**kwargs)
46
-
47
- def anonymize(text, analyze_results):
48
- """Anonymize identified input using Presidio Anonymizer."""
49
- if not text:
50
- return
51
- res = anonymizer_engine().anonymize(
52
- text,
53
- analyze_results,
54
- operators={"STUDENT": OperatorConfig("custom", {"lambda": generate_surrogate})}
55
- )
56
- return res.text
57
-
58
- def annotate(text, st_analyze_results, st_entities):
59
  tokens = []
60
  # sort by start index
61
  results = sorted(st_analyze_results, key=lambda x: x.start)
 
1
 
2
  """Streamlit app for Student Name Detection models."""
3
 
4
+ from spacy_analyzer import prepare_analyzer
5
  from presidio_anonymizer import AnonymizerEngine
 
6
  import pandas as pd
7
  from annotated_text import annotated_text
8
  from json import JSONEncoder
 
16
  # Helper methods
17
  @st.cache(allow_output_mutation=True)
18
  def analyzer_engine():
19
+ """Return AnalyzerEngine and cache with Streamlit."""
20
 
21
  configuration = {
22
  "nlp_engine_name": "spacy",
 
33
  """Return AnonymizerEngine."""
34
  return AnonymizerEngine()
35
 
36
+ def annotate(text, st_analyze_results):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  tokens = []
38
  # sort by start index
39
  results = sorted(st_analyze_results, key=lambda x: x.start)
match_replace.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def replace_name_old(country_code, gender, f_l, original_name, fb_df):
    """
    Receive country, gender, first/last selector, and the original name.
    Match a name with the same gender and country, randomly retrieved from
    the facebook dataset, and different from the original name.

    f_l: name column to sample, 'first' or 'last' -> str
    Returns the surrogate name as a string (pandas ``Series.to_string`` form).
    """
    surrogate_name = original_name
    # Re-sample until the surrogate differs from the original name.
    # NOTE(review): this loops forever if every candidate equals the original.
    while surrogate_name == original_name:
        if gender:
            # FIX: was `if not gender:` — the condition was inverted relative
            # to the intent (the gender-matching branch only ran when no
            # gender was supplied).
            gender_df = fb_df[fb_df["gender"] == gender]
            gender_c_df = gender_df[gender_df["country"] == country_code]
            if gender_c_df.shape[0] > 0:
                # Gender and country both match.
                surrogate_name = gender_c_df[f_l].sample(n=1).to_string()
            else:
                # Gender matches but country does not: sample by gender only.
                surrogate_name = gender_df[f_l].sample(n=1).to_string()
        else:
            # No gender supplied: fall back to country, then the whole table.
            country_df = fb_df[fb_df["country"] == country_code]
            if country_df.shape[0] > 0:
                surrogate_name = country_df[f_l].sample(n=1).to_string()
            else:
                surrogate_name = fb_df[f_l].sample(n=1).to_string()

    return surrogate_name
52
+
53
def match_entity(original_info, entity):
    """Return a surrogate value for the detected *entity* kind.

    STUDENT entities get a matched surrogate name; other supported kinds
    get a fixed placeholder; unknown kinds yield None.
    """
    # TODO: need refinement for each kind of entity
    if entity == 'STUDENT':
        # TODO: here, change between 1 and 2
        return match_name_2(original_info)
    fixed_surrogates = {
        'EMAIL_ADDRESS': '[email protected]',
        # TODO: specific form of number will be returned for consistency
        'PHONE_NUMBER': '000-000-0000',
        'URL': 'google.com',
    }
    return fixed_surrogates.get(entity)
67
+
68
def match_name(original_name):
    """Replace *original_name* with a surrogate matched on gender and country.

    Falls back to 'Jane Doe' when the first name is not in the table.
    """
    # FIXME: takes too LONG to run (large df used multiple times) — improve?
    # FIXME: only the first name is used for matching at the moment.
    # TODO: match both first and last? first name -> gender, last -> country?
    # FIXME: replacement is random, so the same original name may map to
    # different surrogates across calls — identity is not preserved.
    first_name = original_name.split()[0]
    global fb_df
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
    candidates = fb_df[fb_df['first'] == first_name]
    if candidates.empty:
        return 'Jane Doe'
    picked = candidates.sample(n=1)
    # Exclude rows with the same first name so the surrogate differs.
    remaining = fb_df[fb_df['first'] != first_name]
    return replace_name(picked, remaining)
87
+
88
def replace_name(name_df, new_df):
    """
    :param name_df: single-row df matching the original first name -> data frame
    :param new_df: df that does not repeat the original name
    :return: whole name matching country & gender -> str
    """
    target_gender = name_df['gender'].to_string(index=False)
    target_country = name_df['country'].to_string(index=False)

    # Narrow by country first, then by gender within that country.
    pool = new_df[new_df['country'] == target_country]
    pool = pool[pool['gender'] == target_gender]

    first = pool['first'].sample(n=1).to_string(index=False)
    last = pool['last'].sample(n=1).to_string(index=False)
    return first + ' ' + last
104
+
105
+
106
+
107
def match_name_2(original_name):
    """
    Match gender from the first name and country from the last name,
    then build a surrogate full name from that (gender, country) pair.
    :param original_name: full name to replace -> str
    :return: surrogate full name -> str
    """
    global fb_df
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
    # FIXME: assumes a full name; may need a branch for first- or last-only input.
    parts = original_name.split()
    # FIX: removed leftover debug `print(original_name.split()[1])`, which
    # raised IndexError for single-word names.
    gender = name_match_gender(parts[0])
    country = name_match_country(parts[-1])
    return replace_name_2(gender, country)
120
+
121
+
122
def name_match_country(last_name):
    """Sample a country code from rows with this last name; 'US' if none."""
    matches = fb_df[fb_df['last'] == last_name]
    if matches.empty:
        # Last name not in the table: default country.
        return 'US'
    return matches['country'].sample(n=1).to_string(index=False)
129
+
130
def name_match_gender(first_name):
    """Sample a gender code from rows with this first name.

    Falls back to a gender drawn from the whole table when the first name
    is unknown, mirroring name_match_country's fallback behavior.
    """
    matches = fb_df[fb_df['first'] == first_name]
    if matches.empty:
        # FIX: `.sample(n=1)` on an empty frame raises ValueError; fall back
        # to a gender sampled over the whole dataset instead of crashing.
        return fb_df['gender'].sample(n=1).to_string(index=False)
    return matches['gender'].sample(n=1).to_string(index=False)
134
+
135
def replace_name_2(gender, country):
    """Build a surrogate full name sampled from rows matching gender+country."""
    # TODO: prevent returning the same name as the original
    pool = fb_df[fb_df['country'] == country]
    pool = pool[pool['gender'] == gender]

    first = pool['first'].sample(n=1).to_string(index=False)
    last = pool['last'].sample(n=1).to_string(index=False)
    return first + ' ' + last
144
+
145
def replace_text(str_list):
    """Rebuild text from annotated pieces, replacing entity tuples.

    :param str_list: mix of plain strings and (text, entity) tuples
    :return: reassembled text with entities replaced -> str
    """
    # Use str.join instead of repeated `+=` (quadratic string concatenation).
    return ''.join(
        match_entity(piece[0], piece[1]) if isinstance(piece, tuple) else piece
        for piece in str_list
    )
152
+
153
if __name__ == "__main__":
    # Smoke test: requires ascii_fb_names_small.parquet in the working directory.
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
    # print(matching("PH", 'female', 'first', 'Momo', fb_df))
    print(match_entity('Nora Wang', 'STUDENT'))
spacy_recognizer.py β†’ spacy_analyzer.py RENAMED
@@ -1,18 +1,18 @@
1
- import logging
2
- from typing import Optional, List, Tuple, Set
3
 
4
  from presidio_analyzer import (
5
- RecognizerResult,
 
6
  LocalRecognizer,
7
  AnalysisExplanation,
8
  )
9
- from presidio_analyzer.nlp_engine import NlpArtifacts
10
- from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
11
 
 
 
 
 
12
  logger = logging.getLogger("presidio-analyzer")
13
 
14
  class CustomSpacyRecognizer(LocalRecognizer):
15
-
16
  ENTITIES = [
17
  "STUDENT",
18
  ]
@@ -30,9 +30,8 @@ class CustomSpacyRecognizer(LocalRecognizer):
30
  def __init__(
31
  self,
32
  supported_language: str = "en",
33
- supported_entities: Optional[List[str]] = None,
34
- check_label_groups: Optional[Tuple[Set, Set]] = None,
35
- context: Optional[List[str]] = None,
36
  ner_strength: float = 0.85,
37
  ):
38
  self.ner_strength = ner_strength
@@ -49,7 +48,7 @@ class CustomSpacyRecognizer(LocalRecognizer):
49
  """Load the model, not used. Model is loaded during initialization."""
50
  pass
51
 
52
- def get_supported_entities(self) -> List[str]:
53
  """
54
  Return supported entities by this model.
55
  :return: List of the supported entities.
@@ -72,8 +71,16 @@ class CustomSpacyRecognizer(LocalRecognizer):
72
  )
73
  return explanation
74
 
75
- def analyze(self, text, entities, nlp_artifacts=None): # noqa D102
 
 
 
 
 
 
 
76
  results = []
 
77
  if not nlp_artifacts:
78
  logger.warning("Skipping SpaCy, nlp artifacts not provided...")
79
  return results
@@ -107,8 +114,31 @@ class CustomSpacyRecognizer(LocalRecognizer):
107
 
108
  @staticmethod
109
  def __check_label(
110
- entity: str, label: str, check_label_groups: Tuple[Set, Set]
111
  ) -> bool:
112
  return any(
113
  [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
114
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  from presidio_analyzer import (
3
+ AnalyzerEngine,
4
+ RecognizerRegistry,
5
  LocalRecognizer,
6
  AnalysisExplanation,
7
  )
 
 
8
 
9
+ from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpArtifacts
10
+ from typing import Optional
11
+
12
+ import logging
13
  logger = logging.getLogger("presidio-analyzer")
14
 
15
  class CustomSpacyRecognizer(LocalRecognizer):
 
16
  ENTITIES = [
17
  "STUDENT",
18
  ]
 
30
  def __init__(
31
  self,
32
  supported_language: str = "en",
33
+ supported_entities: Optional[list[str]] = None,
34
+ check_label_groups: Optional[tuple[set, set]] = None,
 
35
  ner_strength: float = 0.85,
36
  ):
37
  self.ner_strength = ner_strength
 
48
  """Load the model, not used. Model is loaded during initialization."""
49
  pass
50
 
51
+ def get_supported_entities(self) -> list[str]:
52
  """
53
  Return supported entities by this model.
54
  :return: List of the supported entities.
 
71
  )
72
  return explanation
73
 
74
+ def analyze(self,
75
+ entities: list[str] = None,
76
+ nlp_artifacts: NlpArtifacts = None):
77
+ """Analyze input using Analyzer engine and input arguments (kwargs)."""
78
+
79
+ if not entities or "All" in entities:
80
+ entities = None
81
+
82
  results = []
83
+
84
  if not nlp_artifacts:
85
  logger.warning("Skipping SpaCy, nlp artifacts not provided...")
86
  return results
 
114
 
115
  @staticmethod
116
  def __check_label(
117
+ entity: str, label: str, check_label_groups: tuple[set, set]
118
  ) -> bool:
119
  return any(
120
  [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
121
+ )
122
+
123
def prepare_analyzer(configuration):
    """Handle Preparation of Analyzer Engine for Presidio."""
    # Build the NLP engine from the supplied spaCy configuration.
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    # Registry: predefined recognizers plus our custom student-name recognizer.
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    registry.add_recognizer(CustomSpacyRecognizer())

    # Drop the stock spaCy recognizer so our custom label mappings are used.
    registry.remove_recognizer("SpacyRecognizer")

    return AnalyzerEngine(
        nlp_engine=nlp_engine,
        registry=registry,
        supported_languages=["en"],
    )