Spaces:
Sleeping
Sleeping
import re | |
from dataclasses import dataclass | |
from string import punctuation | |
import pandas as pd | |
all_punctuation = punctuation + "‘’·—»" | |
# keep in dollar signs | |
all_punctuation = all_punctuation.replace("$", "") | |
# "regex separator" | |
# captures the following: 1+ spaces OR 1+ non-word characters (ex: "/", "-"), OR 1 word boundary | |
# apply the this variable using an `fr` string in the regex substituion (ex: `fr"\bw{sep}force\b"`) | |
sep = "(?: +|\W+|\b)" | |
class RegexRemoval: | |
description: str | |
regex_str: str # usually raw string: r"your string" | |
def __post_init__(self): | |
self.regex = re.compile(self.regex_str, re.IGNORECASE) | |
class RegexSubstitution: | |
description: str | |
regex_str: str # usually raw string: r"your string" | |
replacement: str | |
priority: int = 10 # higher values → run later (eg: 1 runs before 20) | |
def __post_init__(self): | |
self.regex = re.compile(self.regex_str, re.IGNORECASE) | |
removals = [ | |
RegexRemoval("OBSCIS", r"(OBSCIS)"), | |
RegexRemoval( | |
"MO Suffix", | |
r"\b\w\s\w\s\w\w?\s\w\s\d{2}(?: |\W)\d{2}(?: |\W)\d{4}", | |
), | |
RegexRemoval( | |
"Statute Prefix", r"\S{1,2}\s\d\S{0,3}\.\d\S{0,3}\.\d\S{0,3}(?:\.\d?\S{0,3}?)?" | |
), | |
] | |
substitutions = [ | |
# LESS THAN / GREATER THAN terms ========= | |
RegexSubstitution("Less Than", fr"\b(?:<|lt)\b", " less than "), | |
RegexSubstitution("Less Than 2", fr"\blt(?=\d+)\b", "less than "), | |
RegexSubstitution("Less Than 3", fr"\<", " less than "), | |
RegexSubstitution("Greater Than", fr"\b(?:>|gt|\>)\b", " greater than "), | |
RegexSubstitution("Greater Than 2", fr"\bgt(?=\d+)\b", "greater than "), | |
RegexSubstitution("Greater Than 3", fr"\>", " greater than "), | |
# WITH terms =========== | |
RegexSubstitution("With Out", fr"\bw{sep}(?:o|out)\b", "without"), | |
RegexSubstitution("With Out 2", fr"\bwo\b", "without"), | |
RegexSubstitution("Within", fr"\bw{sep}(?:i|in)\b", "within", priority=5), | |
RegexSubstitution( | |
"With Intent", | |
fr"\bw{sep}\s?in?t?e?n?t?\b", | |
"with intent", | |
), | |
RegexSubstitution( | |
"with a", | |
fr"\bw{sep}a\b", | |
"with a", | |
), | |
RegexSubstitution( | |
"with health", | |
fr"\bw{sep}health\b", | |
"with health", | |
), | |
RegexSubstitution( | |
"with own", | |
fr"\bw{sep}own\b", | |
"with own", | |
), | |
RegexSubstitution( | |
"with report", | |
fr"\bw{sep}report\b", | |
"with report", | |
), | |
RegexSubstitution( | |
"with license", | |
fr"\bw{sep}license\b", | |
"with license", | |
), | |
RegexSubstitution( | |
"with murder", | |
fr"\bw{sep}murder\b", | |
"with murder", | |
), | |
RegexSubstitution( | |
"with injury", | |
fr"\bw{sep}(?:injury|inj|injry)\b", | |
"with injury", | |
), | |
RegexSubstitution( | |
"with turned", | |
fr"\bw{sep}turned\b", | |
"with turned", | |
), | |
RegexSubstitution( | |
"with altered", | |
fr"\bw{sep}alt\b", | |
"with altered", | |
), | |
RegexSubstitution( | |
"with deadly", | |
fr"\bw{sep}deadly\b", | |
"with deadly", | |
), | |
RegexSubstitution( | |
"with dangerous weapon", | |
fr"\b(?:with|w){sep}(?:dangerous|d){sep}(?:weapon|wpn|weapn|weap)\b", | |
"with dangerous weapon", | |
priority=5, | |
), | |
RegexSubstitution( | |
"with child", | |
fr"\b(?:with|w){sep}(?:child|chi|chld)\b", | |
"with child", | |
), | |
RegexSubstitution( | |
"with minor", | |
fr"\bw{sep}minor\b", | |
"with minor", | |
), | |
RegexSubstitution( | |
"with kidnapping", | |
fr"\bw{sep}kidnapping\b", | |
"with kidnapping", | |
), | |
RegexSubstitution( | |
"with agency", | |
fr"\bw{sep}agency\b", | |
"with agency", | |
), | |
RegexSubstitution( | |
"with firearm", | |
fr"\bw{sep}firearm\b", | |
"with firearm", | |
), | |
RegexSubstitution( | |
"with weapon", | |
fr"\bw{sep}(?:weapon|wpn|weapn|weap)\b", | |
"with weapon", | |
), | |
RegexSubstitution( | |
"with knife", | |
fr"\bw{sep}knife\b", | |
"with knife", | |
), | |
RegexSubstitution( | |
"with force", | |
fr"\bw{sep}force\b", | |
"with force", | |
), | |
RegexSubstitution( | |
"with extenuating circumstances", | |
fr"\bw{sep}ext{sep}circumstances\b", | |
"with extenuating circumstances", | |
), | |
RegexSubstitution( | |
"with prior", | |
fr"\bw{sep}prior\b", | |
"with prior", | |
), | |
RegexSubstitution( | |
"with previous", | |
fr"\bw{sep}previous\b", | |
"with previous", | |
), | |
RegexSubstitution( | |
"with domestic violence", | |
fr"\bw{sep}dv\b", | |
"with domestic violence", | |
), | |
RegexSubstitution( | |
"with suspended", | |
fr"\bw{sep}suspended\b", | |
"with suspended", | |
), | |
RegexSubstitution( # doublecheck this | |
"vehicle with", | |
fr"\bvehicle{sep}w{sep}", | |
"vehicle with", | |
), | |
RegexSubstitution( # TODO: is this "possession with" or "possession weapon"? see concealed weapon as example | |
"possession with", | |
fr"\b(?:possession|possess|poss){sep}w{sep}", | |
"possession with", | |
), | |
RegexSubstitution( | |
"possession with intent", | |
fr"\bp{sep}with{sep}intent", | |
"possession with intent", | |
priority=30, | |
), | |
RegexSubstitution( | |
"neglect with", | |
fr"\bneglect{sep}w{sep}", | |
"neglect with", | |
), | |
RegexSubstitution( | |
"cooperate with", | |
fr"\bcooperate{sep}w{sep}", | |
"cooperate with", | |
), | |
RegexSubstitution( | |
"interfere with", | |
fr"\b(?:inter|interfere){sep}w{sep}", | |
"interfere with", | |
), | |
RegexSubstitution( # TODO consolidate tamper/tampering? | |
"tamper with", | |
fr"\btamper{sep}w{sep}", | |
"tamper with", | |
), | |
RegexSubstitution( | |
"tampering with", | |
fr"\btampering{sep}w{sep}", | |
"tampering with", | |
), | |
RegexSubstitution( | |
"assault with", | |
fr"\bassault{sep}w{sep}", | |
"assault with", | |
), | |
# FIREARM TERMS | |
RegexSubstitution( | |
"firearm with altered identification numbers", | |
fr"\bfirearm{sep}(?:with|w){sep}alter\b", | |
"firearm with altered identification numbers", | |
), | |
RegexSubstitution( | |
"firearm", | |
fr"\bf{sep}a\b", | |
"firearm", | |
), | |
RegexSubstitution( | |
"intimidation", | |
fr"\b(?:intim|intimid)\b", | |
"intimidation", | |
), | |
# DOMESTIC VIOLENCE TERMS / PROTECTION / RESTRAINING ORDERS | |
RegexSubstitution( | |
"protective order", | |
fr"\b(?:protective|protection|prot){sep}(?:order|ord|or)\b", | |
"protective order", | |
), | |
RegexSubstitution( | |
"domestic violence protective order", | |
r"\bdvpo\b", | |
"domestic violence protective order", | |
), | |
RegexSubstitution("domestic", r"\bdom\b", "domestic", priority=20), | |
RegexSubstitution( | |
"domestic violence", | |
r"\bdv\b", | |
"domestic violence", | |
), | |
RegexSubstitution( | |
"domestic violence 2", | |
fr"\bd{sep}v\b", | |
"domestic violence", | |
), | |
RegexSubstitution( | |
"witness testimony", | |
fr"\bwit{sep}tes\b", | |
"witness testimony", | |
), | |
# CONVICTION TERMS == | |
RegexSubstitution( | |
"misdemeanor conviction", | |
fr"\b(?:misdemeanor|misd){sep}(?:convic|conv)\b", | |
"misdemeanor conviction", | |
), | |
RegexSubstitution( | |
"prior conviction", | |
fr"\b(?:prior|pr|pri){sep}(?:convic|conv)\b", | |
"prior conviction", | |
), | |
# ==== GENERAL TERMS ===== | |
RegexSubstitution( # NOTE: added a negative lookbehind for 'mentally' so we won't override 'mentally ill' cases | |
"illegal", | |
fr"\b(?<!mentally )(?:ill|illeg|illgl)\b", | |
"illegal", | |
), | |
RegexSubstitution("commercial fish", fr"\bcomm{sep}fish\b", "commercial fish"), | |
RegexSubstitution("vessel", fr"\bvess\b", "vessel"), | |
RegexSubstitution( | |
"traffic control device", | |
fr"\btraff{sep}control{sep}dev\b", | |
"traffic control device", | |
), | |
RegexSubstitution("non-culpable", fr"\bnonculp\b", "non-culpable"), | |
RegexSubstitution("prohibited", fr"\bprohib\b", "prohibited"), | |
RegexSubstitution("nuisance", fr"\bnuis\b", "nuisance"), | |
RegexSubstitution("obstruction", fr"\bobstr\b", "obstruction"), | |
RegexSubstitution("pedestrian", fr"\bped\b", "pedestrian"), | |
RegexSubstitution("conduct", fr"\bcond\b", "conduct", priority=20), | |
RegexSubstitution( | |
"subsequent", | |
fr"\bsubsq\b", | |
"subsequent", | |
), | |
RegexSubstitution( | |
"disturbing the peace", | |
fr"\bdist{sep}peace\b", | |
"disturbing the peace", | |
), | |
RegexSubstitution( | |
"offender accountability act", | |
fr"\boaa\b", | |
"offender accountability act", | |
), | |
RegexSubstitution( | |
"against", | |
fr"\b(?:agnst|agin)\b", | |
"against", | |
), | |
RegexSubstitution( | |
"child", | |
fr"\b(?:chil|chld)\b", | |
"child", | |
), | |
RegexSubstitution( | |
"school", | |
fr"\bschl\b", | |
"school", | |
), | |
RegexSubstitution( | |
"multiple", | |
fr"\bmult\b", | |
"multiple", | |
), | |
RegexSubstitution( | |
"assailant", | |
fr"\bassail\b", | |
"assailant", | |
), | |
RegexSubstitution( | |
"public disturbance", | |
fr"\b(?:public|pub|publ){sep}(?:disturbance|disturb|dist)\b", | |
"public disturbance", | |
), | |
RegexSubstitution( | |
"interfere", | |
fr"\b(?:interf|interfer)\b", | |
"interfere", | |
), | |
RegexSubstitution( # TODO should we leave obstructing/obstruction separate terms or lump into obstruct? | |
"obstructing", | |
fr"\bob\b", | |
"obstructing", | |
), | |
RegexSubstitution( | |
"law enforcement officer", | |
fr"\bleo\b", | |
"law enforcement officer", | |
), | |
RegexSubstitution( | |
"officer", | |
fr"\b(?:offcr|ofcr)\b", | |
"officer", | |
), | |
RegexSubstitution( | |
"minor", | |
fr"\b(?:min|minr|mnr)\b", | |
"minor", | |
), | |
RegexSubstitution( | |
"distance within 300 feet of park", | |
fr"\bdist{sep}300{sep}park\b", | |
"distance within 300 feet of park", | |
priority=5, | |
), | |
RegexSubstitution( | |
"distance within 300", | |
fr"{sep}dist{sep}w{sep}i{sep}300\b", | |
"distance within 300", | |
priority=5, | |
), | |
RegexSubstitution( | |
"major", | |
fr"\bmajr\b", | |
"major", | |
), | |
RegexSubstitution( | |
"willful", | |
fr"\b(?:wilfl|wlfl)\b", | |
"willful", | |
), | |
RegexSubstitution( | |
"issue worthless checks", | |
fr"\b(?:issue|iss){sep}(?:worthless|wrthlss|wrtls){sep}(?:checks|cks)\b", | |
"worthless", | |
), | |
RegexSubstitution( | |
"issue multiple worthless checks", | |
fr"\b(?:issue|iss){sep}(?:multiple|mltpl){sep}(?:worthless|wrthlss|wrtls){sep}(?:checks|cks)\b", | |
"worthless", | |
), | |
RegexSubstitution( | |
"unauthorized", | |
fr"\b(?:unauth|unau|unauthd)\b", | |
"unauthorized", | |
), | |
RegexSubstitution( | |
"child support", | |
fr"\b(?:child|chld|chi){sep}(?:support|supp|sup)\b", | |
"child support", | |
), | |
RegexSubstitution( | |
"unlawful", | |
r"\b(?:unlawfully|unlaw|unlawfl|unlawf|unlwfl|unl)\b", | |
"unlawful", | |
), | |
RegexSubstitution( | |
"Possession", | |
r"\b(?:possess|poss?)\b", | |
"possession", | |
), | |
RegexSubstitution( | |
"Abetting", | |
r"\b(?:abett|abetted)\b", | |
"Abetting", | |
), | |
RegexSubstitution("emergency", r"\b(?:emerg|emer)\b", "emergency", priority=20), | |
RegexSubstitution( | |
"Attempted", | |
r"\b(?:att|atmpt)\b", | |
"attempted", | |
), | |
RegexSubstitution( # NOTE: added negative look ahead so we don't remap "at risk" to "attempted risk" | |
"Attempted 2", | |
r"\bat(?! risk)\b", | |
"attempted", | |
), | |
RegexSubstitution( | |
"Battery", | |
r"\bbatt\b", | |
"battery", | |
), | |
RegexSubstitution( | |
"Violation of Probation", | |
r"\bvop\b", | |
"violation of probation", | |
), | |
RegexSubstitution( # NOTE: removed 'con' because shows up in some DV-related text, may not be a one-size fits all regex / 'consp' to conspiracy or conspire? | |
"Conspiracy", | |
r"\b(?:consp|conspi|conspira|conspirc|consprc|consprcy|cnsprcy|conspr)\b", | |
"conspiracy", | |
), | |
RegexSubstitution( | |
"Property", | |
r"\bprop\b", | |
"property", | |
), | |
RegexSubstitution( | |
"public disturbance", | |
fr"\b(?:public|pub|publ){sep}(?:disturbance|dist)\b", | |
"public disturbance", | |
), | |
RegexSubstitution( | |
"Criminal", | |
r"\bcrim\b", | |
"criminal", | |
), | |
RegexSubstitution( | |
"License", | |
r"\blic\b", | |
"license", | |
), | |
RegexSubstitution( | |
"Credit Card", | |
r"\bcc\b", | |
"credit card", | |
), | |
RegexSubstitution( | |
"Credit Card 2", | |
r"\bcred{sep}crd\b", | |
"credit card", | |
), | |
RegexSubstitution( | |
"exchange", | |
r"\bexch\b", | |
"exchange", | |
), | |
RegexSubstitution( | |
"electric power", | |
fr"\belec{sep}pwr\b", | |
"electric power", | |
), | |
RegexSubstitution( | |
"commit false", fr"\bcom?{sep}false\b", "commit false", priority=5 | |
), | |
# VEHICLE terms =========== | |
RegexSubstitution( | |
"Vehicle", | |
r"\b(?:veh|vehi)\b", | |
"vehicle", | |
), | |
RegexSubstitution( | |
"Vehicles", | |
r"\bvehs\b", | |
"vehicles", | |
), | |
RegexSubstitution( | |
"commercial motor vehicle", | |
r"\bcmv\b", | |
"commercial motor vehicle", | |
), | |
RegexSubstitution( | |
"motor vehicle", | |
fr"\b(?:mtr|mot){sep}(?:vehicle|veh)\b", | |
"motor vehicle", | |
), | |
RegexSubstitution( | |
"motor vehicle 2", | |
fr"\bm{sep}v\b", | |
"motor vehicle", | |
), | |
RegexSubstitution( | |
"motor vehicle 3", | |
fr"\b(?:mtv|mv)\b", | |
"motor vehicle", | |
), | |
RegexSubstitution("odometer", fr"\bodom\b", "odometer"), | |
RegexSubstitution( | |
"red light", | |
fr"\bred{sep}light\b", | |
"red light", | |
), | |
RegexSubstitution( | |
"vehicle sound system", | |
fr"\bveh{sep}snd{sep}sys\b", | |
"vehicle sound system", | |
priority=20, | |
), | |
# ===== | |
RegexSubstitution( | |
"Assault", | |
r"\bass?lt\b", | |
"assault", | |
), | |
RegexSubstitution( | |
"Assault 2", | |
r"\bass\b", | |
"assault", | |
), | |
RegexSubstitution( | |
"Mentally", | |
r"\bment\b", | |
"mentally", | |
), | |
RegexSubstitution( | |
"mentally ill", | |
r"\bmnt{sep}ill\b", | |
"mentally ill", | |
), | |
RegexSubstitution( | |
"Unknown", | |
r"\bunk\b", | |
"unknown", | |
), | |
RegexSubstitution( | |
"cohabitation", | |
r"\b(?:coh|cohbt)\b", | |
"cohabitation", | |
), | |
RegexSubstitution( | |
"Statement", | |
r"\bstmt\b", | |
"statement", | |
), | |
RegexSubstitution( | |
"Degree", | |
r"\bdegr?e?\b", | |
"degree", | |
), | |
RegexSubstitution( | |
"Felony", | |
r"\b(?:fe|fel|felo|felny|fl|flny)\b", | |
"felony", | |
), | |
RegexSubstitution( | |
"misdemeanor", | |
r"\bmisd\b", | |
"misdemeanor", | |
), | |
# AGE | |
RegexSubstitution( | |
"years of age", | |
r"\byoa\b", | |
"years of age", | |
), | |
RegexSubstitution( | |
"year", | |
r"\byr\b", | |
"year", | |
), | |
RegexSubstitution( | |
"year 2", | |
r"(?!\d+)yr\b", | |
" year", | |
), | |
RegexSubstitution( | |
"elderly", | |
r"\beldrly\b", | |
"elderly", | |
), | |
RegexSubstitution( | |
"under", | |
r"\b(?:und|undr)\b", | |
"under", | |
), | |
# AGE / FEMALE | |
RegexSubstitution( | |
"female", | |
fr"\bfem\b", | |
"female", | |
), | |
RegexSubstitution( | |
"age female", | |
fr"\bage{sep}f\b", | |
"age female", | |
), | |
RegexSubstitution( | |
"old female", | |
fr"\bold{sep}f\b", | |
"old female", | |
), | |
RegexSubstitution( | |
"older female", | |
fr"\bolder{sep}f\b", | |
"older female", | |
), | |
RegexSubstitution( | |
"13 female", | |
fr"\b13{sep}f\b", | |
"13 female", | |
), | |
RegexSubstitution( | |
"15 female", | |
fr"\b15{sep}f\b", | |
"15 female", | |
), | |
RegexSubstitution( | |
"17 female", | |
fr"\b17{sep}f\b", | |
"17 female", | |
), | |
# AGE / MALE | |
RegexSubstitution( | |
"age male", | |
fr"\bage{sep}m\b", | |
"age male", | |
), | |
RegexSubstitution( | |
"old male", | |
fr"\bold{sep}m\b", | |
"old male", | |
), | |
RegexSubstitution( | |
"older male", | |
fr"\bolder{sep}m\b", | |
"older male", | |
), | |
RegexSubstitution( | |
"13 male", | |
fr"\b13{sep}m\b", | |
"13 male", | |
), | |
RegexSubstitution( | |
"15 male", | |
fr"\b15{sep}m\b", | |
"15 male", | |
), | |
RegexSubstitution( | |
"17 male", | |
fr"\b17{sep}m\b", | |
"17 male", | |
), | |
# ====== | |
RegexSubstitution( | |
"Robbery", | |
r"\brobb\b", | |
"robbery", | |
), | |
RegexSubstitution( | |
"Attempted Robbery", | |
fr"\battempted{sep}(?:rob|robb)\b", | |
"attempted robbery", | |
), | |
RegexSubstitution( | |
"Detainer Robbery", | |
fr"\bdetainer{sep}(?:rob|robb)\b", | |
"detainer robbery", | |
), | |
RegexSubstitution( | |
"Aggravated", | |
r"\b(?:agg|aggrav|aggr|aggravted)\b", | |
"aggravated", | |
), | |
RegexSubstitution( | |
"Forced", | |
r"\bfrc\b", | |
"forced", | |
), | |
RegexSubstitution( | |
"Danger", | |
r"\bdng\b", | |
"danger", | |
), | |
RegexSubstitution( | |
"Abetting", | |
r"\babet\b", | |
"abetting", | |
), | |
RegexSubstitution( | |
"Acquaintance", | |
r"\b(?:acquant|acq|acquaint|acquain)\b", | |
"acquaintance", | |
), | |
RegexSubstitution( | |
"Breaking and Entering", | |
r"\bB ?& ?E\b", | |
"breaking and entering", | |
), | |
RegexSubstitution("Building", r"\bbldg\b", "building"), | |
RegexSubstitution( | |
"Adult", | |
r"\badlt\b", | |
"adult", | |
), | |
RegexSubstitution( | |
"Deliver", | |
r"\bdel\b", | |
"deliver", | |
), | |
RegexSubstitution( | |
"Family", | |
r"\bfam\b", | |
"family", | |
), | |
RegexSubstitution( | |
"Burglary", | |
r"\bburg\b", | |
"burglary", | |
), | |
RegexSubstitution( | |
"Murder", | |
r"\bmur\b", | |
"murder", | |
), | |
RegexSubstitution( | |
"conspiracy to commit", | |
fr"\bconsp{sep}comm\b", | |
"conspiracy to commit", | |
), | |
RegexSubstitution( | |
"Representation", | |
r"\brep\b", | |
"representation", | |
), | |
RegexSubstitution( | |
"Previous", | |
r"\bprev\b", | |
"previous", | |
), | |
RegexSubstitution( # TODO revisit this - 'com' can also be 'commit' | |
"Common", | |
r"\bcom\b", | |
"common", | |
), | |
RegexSubstitution( | |
"of a", | |
r"\bofa\b", | |
"of a", | |
), | |
RegexSubstitution( # TODO revisit this - 'viol' relates to 'violation' too | |
"violent", | |
r"\bviol\b", | |
"violent", | |
), | |
RegexSubstitution( | |
"perform", | |
r"\bperf\b", | |
"perform", | |
), | |
RegexSubstitution( | |
"household", | |
r"\b(?:hh|hsehld|hhld)\b", | |
"household", | |
), | |
RegexSubstitution( | |
"Other", | |
r"\both\b", | |
"other", | |
), | |
# WEAPON TERMS ========= | |
RegexSubstitution( | |
"Weapon", r"\b(?:wea|wpn|weapn|weap|weapo)\b", "weapon", priority=20 | |
), | |
RegexSubstitution( | |
"Weapons", r"\b(?:wea|wpn|weapn|weap|weapo)s\b", "weapons", priority=20 | |
), | |
RegexSubstitution("dangerous weapon", r"\b(?:dwpn|dw)\b", "dangerous weapon"), | |
RegexSubstitution( | |
"dangerous weapon 2", fr"\bd{sep}(?:w|wpn)\b", "dangerous weapon" | |
), | |
RegexSubstitution( | |
"concealed weapon", fr"\bconcealed{sep}(?:w|wpn)\b", "concealed weapon" | |
), | |
# HARM terms ======= | |
RegexSubstitution( | |
"Bodily Harm", | |
fr"\b(?:bod{sep}ha?rm|bh)\b", | |
"bodily harm", | |
), | |
RegexSubstitution( | |
"physical", | |
fr"\bphy\b", | |
"physical", | |
), | |
RegexSubstitution( | |
"harmful", | |
fr"\bharmfl\b", | |
"harmful", | |
), | |
RegexSubstitution( | |
"Great Bodily", | |
fr"\b(?:gr|grt){sep}bodily\b", | |
"great bodily", | |
), | |
RegexSubstitution( | |
"Great Bodily Injury", | |
fr"\bgbi\b", | |
"great bodily injury", | |
), | |
RegexSubstitution( | |
"Substantial Bodily Harm", | |
r"\bsbh\b", | |
"substantial bodily harm", | |
), | |
RegexSubstitution( | |
"injury", | |
r"\b(?:injry|inj)\b", | |
"injury", | |
), | |
RegexSubstitution( | |
"inflict", | |
r"\binflt\b", | |
"inflict", | |
), | |
RegexSubstitution( | |
"Great Bodily Harm", | |
fr"\bgr{sep}bod{sep}harm\b", | |
"great bodily harm", | |
), | |
RegexSubstitution( | |
"Great Bodily Harm 2", | |
fr"\bgbh\b", | |
"great bodily harm", | |
), | |
# ==== | |
RegexSubstitution( # TODO: revisit PERS can be person too | |
"Personal", | |
r"\bpers\b", | |
"personal", | |
), | |
RegexSubstitution( | |
"persons", | |
r"\bprsns\b", | |
"persons", | |
), | |
RegexSubstitution( | |
"person", | |
r"\b(?:prsn|per|perso)\b", | |
"person", | |
), | |
RegexSubstitution("election day", fr"\belec{sep}day\b", "election day"), | |
RegexSubstitution( | |
"temporary", | |
r"\btemp\b", | |
"temporary", | |
), | |
RegexSubstitution( | |
"improper", | |
r"\bimprop\b", | |
"improper", | |
), | |
RegexSubstitution( | |
"false", | |
r"\bfls\b", | |
"false", | |
), | |
RegexSubstitution( | |
"responsibility", | |
r"\bresp\b", | |
"responsibility", | |
), | |
RegexSubstitution( | |
"advertise", | |
r"\bad\b", | |
"advertise", | |
), | |
RegexSubstitution( | |
"imprisonment", | |
r"\b(?:imprison|impris|imprsn)\b", | |
"imprisonment", | |
), | |
RegexSubstitution( | |
"prohibited", | |
r"\bproh\b", | |
"prohibited", | |
), | |
RegexSubstitution( | |
"under influence", | |
fr"\bunder{sep}(?:infl|influ)\b", | |
"under influence", | |
priority=5, | |
), | |
RegexSubstitution( | |
"stolen", | |
r"\bstln\b", | |
"stolen", | |
), | |
RegexSubstitution( | |
"years", | |
r"\byrs\b", | |
"years", | |
), | |
RegexSubstitution( | |
"intent", | |
r"\bint\b", | |
"intent", | |
), | |
RegexSubstitution( | |
"passage", | |
r"\bpassg\b", | |
"passage", | |
), | |
RegexSubstitution( | |
"withdraw", | |
r"\bwit\b", | |
"withdraw", | |
), | |
RegexSubstitution( | |
"manufacturing or delivering", | |
r"\bman\Wdel\b", | |
"manufacturing delivering", | |
), | |
RegexSubstitution( # Revisit this | |
"minimum mandatory", | |
r"\bmin\Wman\b", | |
"minimum mandatory", | |
), | |
RegexSubstitution( | |
"stranger", | |
r"\bstr(?:ngr)?\b", | |
"stranger", | |
), | |
RegexSubstitution( | |
"personal use", | |
r"\bpers use\b", | |
"personal use", | |
), | |
RegexSubstitution( | |
"force", | |
r"\bfo?rc\b", | |
"force", | |
), | |
RegexSubstitution( | |
"operate", | |
r"\b(?:oper|op|opr)\b", | |
"operate", | |
), | |
RegexSubstitution( | |
"occupied", | |
r"\bocc\b", | |
"occupied", | |
), | |
RegexSubstitution( | |
"health care facility", | |
r"\bhealth{sep}care{sep}fac\b", | |
"health care facility", | |
priority=5, | |
), | |
RegexSubstitution( | |
"residence", | |
r"\bres\b", | |
"residence", | |
), | |
RegexSubstitution( | |
"terrorism threats", | |
fr"\bterr{sep}(?:thre|thrts)\b", | |
"terrorism threats", | |
), | |
RegexSubstitution( | |
"false report", | |
fr"\bfals{sep}rprt\b", | |
"false report", | |
), | |
RegexSubstitution( | |
"government", | |
r"\bgovt\b", | |
"government", | |
), | |
RegexSubstitution( | |
"advocating", | |
r"\badvoc\b", | |
"advocating", | |
), | |
RegexSubstitution( | |
"government property", | |
r"\bgov{sep}property\b", | |
"government property", | |
), | |
RegexSubstitution( | |
"general assembly", | |
r"\bgen{sep}assembly\b", | |
"general assembly", | |
), | |
RegexSubstitution( # NOTE: added negative lookahead because was seeing "by off" when updating statutory rape terms & "by offense" is not correct | |
"offense", | |
fr"\b(?<!by )(?:offense|offen|off|offe)\b", | |
"offense", | |
), | |
RegexSubstitution( | |
"information", | |
fr"\b(?:info|infor)\b", | |
"information", | |
), | |
# LEWD charge cat | |
RegexSubstitution( | |
"pornography", | |
fr"\b(?:porn|porno)\b", | |
"pornography", | |
), | |
RegexSubstitution( | |
"compelling", | |
fr"\bcompel\b", | |
"compelling", | |
), | |
RegexSubstitution( | |
"prostitution", | |
fr"\bprostit\b", | |
"prostitution", | |
), | |
RegexSubstitution( | |
"computer", | |
fr"\bcomputr\b", | |
"computer", | |
), | |
RegexSubstitution( | |
"incapable", | |
fr"\bincap\b", | |
"incapable", | |
), | |
RegexSubstitution( | |
"juvenile", | |
fr"\b(?:juv|juven)\b", | |
"juvenile", | |
), | |
RegexSubstitution( | |
"involving", | |
fr"\b(?:involv|invlv)\b", | |
"involving", | |
), | |
RegexSubstitution( | |
"equipment", | |
fr"\bequip\b", | |
"equipment", | |
), | |
RegexSubstitution( | |
"hazardous", | |
fr"\bhaz\b", | |
"hazardous", | |
), | |
RegexSubstitution( # NOTE: assault and battery unless A,B is followed by C | |
"assault and battery", | |
fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?!c)\b", | |
"assault and battery", | |
), | |
RegexSubstitution( # NOTE: assault and battery unless A,B is followed by C | |
"assault and battery 2", | |
fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?!\Wc)\b", | |
"assault and battery", | |
), | |
RegexSubstitution( # NOTE: assault and battery unless A,B is followed by C | |
"assault and battery 2", | |
fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?! c)\b", | |
"assault and battery", | |
), | |
RegexSubstitution( | |
"promote distribution", | |
fr"\bpromote{sep}distrb\b", | |
"promote distribution", | |
), | |
RegexSubstitution( | |
"child molestation first degree", | |
fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}1\b", | |
"child molestation first degree", | |
), | |
RegexSubstitution( | |
"child molestation second degree", | |
fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}2\b", | |
"child molestation second degree", | |
), | |
RegexSubstitution( | |
"child molestation third degree", | |
fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}3\b", | |
"child molestation third degree", | |
), | |
RegexSubstitution( | |
"child molestation", | |
fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol)\b", | |
"child molestation", | |
priority=5, | |
), | |
RegexSubstitution( | |
"molestation", | |
fr"\b(?:molestation|molest|mol)\b", | |
"molestation", | |
), | |
RegexSubstitution( | |
"indecent conduct exposure", | |
fr"\bind{sep}cond{sep}expos\b", | |
"indecent conduct exposure", | |
), | |
RegexSubstitution( | |
"indecent", | |
fr"\bindec\b", | |
"indecent", | |
), | |
RegexSubstitution( | |
"indecent liberties", | |
fr"\bind{sep}lib\b", | |
"indecent liberties", | |
), | |
RegexSubstitution( | |
"moving", | |
fr"\bmov\b", | |
"moving", | |
), | |
RegexSubstitution( | |
"depiction", | |
fr"\bdptn\b", | |
"depiction", | |
), | |
RegexSubstitution( | |
"child luring", | |
fr"\bchil{sep}lrng\b", | |
"child luring", | |
), | |
RegexSubstitution( | |
"dissemination", | |
fr"\b(?:dissm|dissem)\b", | |
"dissemination", | |
), | |
RegexSubstitution( | |
"possession of depictions of minor engaged in sexually explicit conduct", | |
fr"\bposs{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b", | |
"possession of depictions of minor engaged in sexually explicit conduct", | |
priority=3, | |
), | |
RegexSubstitution( | |
"dealing of depictions of minor engaged in sexually explicit conduct", | |
fr"\bdeal{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b", | |
"dealing of depictions of minor engaged in sexually explicit conduct", | |
priority=3, | |
), | |
RegexSubstitution( | |
"viewing of depictions of minor engaged in sexually explicit conduct", | |
fr"\bview{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b", | |
"viewing of depictions of minor engaged in sexually explicit conduct", | |
priority=3, | |
), | |
RegexSubstitution( | |
"online sexual corruption of a child", | |
fr"\bonline{sep}sex{sep}corrupt{sep}child\b", | |
"online sexual corruption of a child", | |
), | |
RegexSubstitution( | |
"lewd or lascivious act", | |
fr"\b(?:L\&L|L{sep}L)\b", | |
"lewd or lascivious act", | |
), | |
RegexSubstitution( | |
"exposure", | |
r"\bexpos\b", | |
"exposure", | |
), | |
# SEXUAL OFFENSES ===== | |
RegexSubstitution( | |
"Criminal Sexual Conduct", | |
r"\bcsc\b", | |
"criminal sexual conduct", | |
), | |
RegexSubstitution( | |
"sexual", | |
r"\bsexl\b", | |
"sexual", | |
), | |
RegexSubstitution( | |
"explicit", | |
r"\bexplct\b", | |
"explicit", | |
), | |
RegexSubstitution( | |
"sexual offense", | |
fr"\b(?:sexual|sex){sep}(?:offense|offen|off)\b", | |
"sexual offense", | |
), | |
RegexSubstitution( | |
"sexual offenses", | |
fr"\b(?:sexual|sex){sep}(?:offense|offen|off)s\b", | |
"sexual offenses", | |
), | |
RegexSubstitution( | |
"sexual assault", | |
fr"\b(?:sexual|sex){sep}(?:assault|assult|assualt|ass|asst)\b", | |
"sexual assault", | |
), | |
RegexSubstitution( | |
"sexual contact", | |
fr"\b(?:sexual|sex){sep}(?:contact)\b", | |
"sexual contact", | |
), | |
RegexSubstitution( | |
"sexual act", | |
fr"\b(?:sexual|sex){sep}(?:act|acts)\b", | |
"sexual act", | |
), | |
RegexSubstitution( | |
"sexual act 2", | |
fr"\bsxact\b", | |
"sexual act", | |
), | |
RegexSubstitution( | |
"sexual abuse", | |
fr"\b(?:sexual|sex){sep}(?:abuse|ab)\b", | |
"sexual abuse", | |
), | |
RegexSubstitution( | |
"commit sex abuse", | |
fr"\bcomm{sep}sex{sep}abuse\b", | |
"commit sex abuse", | |
), | |
RegexSubstitution( | |
"commit sex act", | |
fr"\bcomm{sep}sex{sep}act\b", | |
"commit sex act", | |
), | |
RegexSubstitution( | |
"commit sex abuse minor", | |
fr"\bcommsexabuseminor\b", | |
"commit sex abuse minor", | |
priority=20, | |
), | |
RegexSubstitution( | |
"sexual battery", | |
fr"\b(?:sexual|sex){sep}(?:battery|batt|bat)\b", | |
"sexual battery", | |
), | |
RegexSubstitution( # TODO: should these actually map to "sexual misconduct"? | |
"sexual conduct", | |
fr"\b(?:sexual|sex){sep}(?:conduct|cndct|cond|con)\b", | |
"sexual conduct", | |
), | |
RegexSubstitution( | |
"sexual penetration", | |
fr"\b(?:sexual|sex){sep}(?:penetration|pen)\b", | |
"sexual penetration", | |
), | |
RegexSubstitution( # TODO: Revisit - hard to tell if exp/expl maps to "exploitation" or "explicit" | |
"sexual exploitation", | |
fr"\b(?:sexual|sex){sep}(?:exploitation|exploit)\b", | |
"sexual exploitation", | |
), | |
RegexSubstitution( | |
"sexual performance", | |
fr"\b(?:sexual|sex){sep}(?:performance|perform)\b", | |
"sexual performance", | |
), | |
RegexSubstitution( | |
"sexual imposition", | |
fr"\b(?:sexual|sex){sep}(?:imposition|imp)\b", | |
"sexual imposition", | |
), | |
RegexSubstitution( | |
"sex with", | |
fr"\bsex{sep}w\b", | |
"sex with", | |
), | |
RegexSubstitution( # TODO: Revisit - hard to tell if offen/off maps to "offender" or "offense" | |
"sex offender", | |
fr"\b(?:sexual|sex){sep}(?:offender|offend|offndr|ofndr)\b", | |
"sex offender", | |
), | |
RegexSubstitution( | |
"sexual predator", | |
fr"\b(?:sexual|sex){sep}(?:predator|pred)\b", | |
"sexual predator", | |
), | |
RegexSubstitution( | |
"voluntary sexual relations", | |
fr"\bvol{sep}sex{sep}rel\b", | |
"voluntary sexual relations", | |
), | |
RegexSubstitution( | |
"sex related", | |
fr"\bsex{sep}(?:reltd|rel)\b", | |
"sex related", | |
), | |
RegexSubstitution( | |
"sex related 2", | |
fr"\bsexreltd\b", | |
"sex related", | |
), | |
RegexSubstitution( | |
"statutory rape", | |
fr"\bstat{sep}rape\b", | |
"statutory rape", | |
), | |
RegexSubstitution( | |
"rape first degree", | |
fr"\brape{sep}(?:1|1st|i)\b", | |
"rape first degree", | |
), | |
RegexSubstitution( | |
"rape second degree", | |
fr"\brape{sep}(?:2|2nd|ii)\b", | |
"rape second degree", | |
), | |
RegexSubstitution( | |
"rape third degree", | |
fr"\brape{sep}(?:3|3rd|iii)\b", | |
"rape third degree", | |
), | |
RegexSubstitution( | |
"sodomy first degree", | |
fr"\bsodomy{sep}(?:1|1st|i)\b", | |
"sodomy first degree", | |
), | |
RegexSubstitution( | |
"sodomy second degree", | |
fr"\bsodomy{sep}(?:2|2nd|ii)\b", | |
"sodomy second degree", | |
), | |
RegexSubstitution( | |
"sodomy third degree", | |
fr"\bsodomy{sep}(?:3|3rd|iii)\b", | |
"sodomy third degree", | |
), | |
RegexSubstitution( | |
"incest first degree", | |
fr"\bincest{sep}(?:1|1st|i)\b", | |
"incest first degree", | |
), | |
RegexSubstitution( | |
"incest second degree", | |
fr"\bincest{sep}(?:2|2nd|ii)\b", | |
"incest second degree", | |
), | |
RegexSubstitution( | |
"sex first degree", | |
fr"\bsex{sep}(?:1|1st|i)\b", | |
"sex first degree", | |
), | |
RegexSubstitution( | |
"sex second degree", | |
fr"\bsex{sep}(?:2|2nd|ii)\b", | |
"sex second degree", | |
), | |
RegexSubstitution( | |
"criminal sexual conduct first degree", | |
fr"\bcsc{sep}(?:1|1st|i)\b", | |
"criminal sexual conduct first degree", | |
priority=5, | |
), | |
RegexSubstitution( | |
"criminal sexual conduct second degree", | |
fr"\bcsc{sep}(?:2|2nd|ii)\b", | |
"criminal sexual conduct second degree", | |
priority=5, | |
), | |
RegexSubstitution( | |
"criminal sexual conduct third degree", | |
fr"\bcsc{sep}(?:3|3rd|ii)\b", | |
"criminal sexual conduct third degree", | |
priority=5, | |
), | |
RegexSubstitution( | |
"criminal sexual conduct fourth degree", | |
fr"\bcsc{sep}(?:4|4th|iv)\b", | |
"criminal sexual conduct fourth degree", | |
priority=5, | |
), | |
RegexSubstitution( | |
"sodomy", | |
r"\bsod\b", | |
"sodomy", | |
), | |
RegexSubstitution( | |
"engage sexual act", | |
fr"\benga{sep}sex{sep}act\b", | |
"engage sexual act", | |
), | |
RegexSubstitution( | |
"engage sexual act 2", | |
fr"\beng{sep}sex\b", | |
"engage sexual act", | |
), | |
RegexSubstitution("no force", fr"\bno{sep}frc\b", "no force", priority=5), | |
RegexSubstitution( | |
"force or coercion", | |
fr"\bfrc{sep}or{sep}coercn\b", | |
"force or coercion", | |
priority=5, | |
), | |
RegexSubstitution( | |
"coercion", | |
fr"\b(?:coer|coercn)\b", | |
"coercion", | |
), | |
RegexSubstitution( | |
"position of authority", | |
fr"\bpos{sep}auth\b", | |
"position of authority", | |
priority=4, | |
), | |
RegexSubstitution( | |
"position of authority 2", | |
fr"\bpos{sep}of{sep}auth\b", | |
"position of authority", | |
priority=4, | |
), | |
RegexSubstitution( | |
"person in authority", | |
fr"\bper{sep}aut\b", | |
"person in authority", | |
priority=4, | |
), | |
RegexSubstitution( | |
"other family", | |
fr"\b(?:othr|oth|other){sep}(?:family|fam)\b", | |
"other family", | |
priority=4, | |
), | |
RegexSubstitution( | |
"immoral", | |
fr"\b(?:immoral|imoral|imm|imor)\b", | |
"immoral", | |
priority=4, | |
), | |
RegexSubstitution( | |
"purpose", | |
fr"\bpurp\b", | |
"purpose", | |
priority=4, | |
), | |
RegexSubstitution( | |
"communication with minor for immoral purpose", | |
fr"\b(?:communication|comm|com){sep}(?:with|w){sep}(?:minor|min){sep}(?:immoral|imoral|imm|imor)\b", | |
"communication with minor for immoral purpose", | |
priority=4, | |
), | |
RegexSubstitution( | |
"communication with minor for immoral purpose 2", | |
fr"\bcomm{sep}minor{sep}imm\b", | |
"communication with minor for immoral purpose", | |
priority=4, | |
), | |
RegexSubstitution( | |
"communication with minor", | |
fr"\bcom{sep}w{sep}minor\b", | |
"communication with minor", | |
priority=4, | |
), | |
# EMBEZZLEMENT === | |
RegexSubstitution( | |
"Embezzlement", | |
r"\b(?:embezzle|embezz|embez|embzzlmnt|embz)\b", | |
"embezzlement", | |
), | |
RegexSubstitution( | |
"real estate", | |
fr"\breal{sep}estat\b", | |
"real estate", | |
), | |
RegexSubstitution( | |
"chattel", | |
r"\bchatl\b", | |
"chattel", | |
), | |
RegexSubstitution( | |
"received", | |
r"\b(?:receiv|rcvd)\b", | |
"received", | |
), | |
RegexSubstitution( | |
"mortgagor", | |
r"\bmortgr\b", | |
"mortgagor", | |
), | |
RegexSubstitution( | |
"agreement", | |
r"\bagrmnt\b", | |
"agreement", | |
), | |
RegexSubstitution( | |
"public", | |
fr"\b(?:pub|publ|pblc)\b", | |
"public", | |
), | |
RegexSubstitution( | |
"behavior", | |
r"\bbehav\b", | |
"behavior", | |
), | |
RegexSubstitution( | |
"private", | |
r"\bpriv\b", | |
"private", | |
), | |
RegexSubstitution( | |
"corporation", | |
fr"\bcorp\b", | |
"corporation", | |
), | |
RegexSubstitution( | |
"purchase", | |
fr"\bpurc\b", | |
"purchase", | |
), | |
RegexSubstitution( # NOTE: pol may also be police - saw pol dog for example (police dog) | |
"political", | |
fr"\b(?:pol|polit|politcl)\b", | |
"political", | |
), | |
RegexSubstitution("police dog", fr"\bpol{sep}dog\b", "police dog", priority=5), | |
RegexSubstitution( | |
"payroll", | |
fr"\bpayrll\b", | |
"payroll", | |
), | |
RegexSubstitution( | |
"law enforcement", | |
fr"\blaw{sep}enf\b", | |
"law enforcement", | |
), | |
RegexSubstitution( | |
"incident", | |
fr"\bincdnt\b", | |
"incident", | |
), | |
RegexSubstitution( | |
"report", | |
fr"\brept\b", | |
"report", | |
), | |
RegexSubstitution( | |
"transfer", | |
fr"\btrnsf\b", | |
"transfer", | |
), | |
RegexSubstitution( | |
"capital assets", | |
fr"\bcptl{sep}asts\b", | |
"capital assets", | |
), | |
RegexSubstitution( | |
"clerk of court", | |
fr"\bclrk{sep}of{sep}crt\b", | |
"clerk of court", | |
), | |
RegexSubstitution( | |
"insufficient", | |
fr"\binsuf\b", | |
"insufficient", | |
), | |
RegexSubstitution( | |
"corporate officer", fr"\bcorp{sep}officer\b", "corporate officer", priority=5 | |
), | |
RegexSubstitution( | |
"institution", | |
fr"\b(?:instit|inst)\b", | |
"institution", | |
), | |
RegexSubstitution( | |
"organization", | |
fr"\borg\b", | |
"organization", | |
), | |
RegexSubstitution( | |
"animals", | |
fr"\banmls\b", | |
"animals", | |
), | |
RegexSubstitution( | |
"animal", | |
fr"\banml\b", | |
"animal", | |
), | |
RegexSubstitution( | |
"software", | |
fr"\bsoftwr\b", | |
"software", | |
), | |
RegexSubstitution( | |
"transit or service bus", | |
fr"\btrans{sep}serv{sep}bus\b", | |
"transit or service bus", | |
), | |
RegexSubstitution( | |
"insurance agent", | |
fr"\binsur{sep}agent\b", | |
"insurance agent", | |
), | |
RegexSubstitution( | |
"official", | |
fr"\b(?:offic|offl|offcl|officl)\b", | |
"official", | |
), | |
RegexSubstitution( # TODO: is 'misapp' ... misappropriation or misapplication? | |
"misappropriation", | |
fr"\b(?:misappro|misapp)\b", | |
"misappropriation", | |
), | |
RegexSubstitution( | |
"misapplication", | |
fr"\bmisapl\b", | |
"misappropriation", | |
), | |
RegexSubstitution( | |
"fiduciary", | |
fr"\bfiduc\b", | |
"fiduciary", | |
), | |
RegexSubstitution( | |
"financial", | |
fr"\bfinan\b", | |
"financial", | |
), | |
RegexSubstitution( | |
"funds", | |
fr"\bfnds\b", | |
"funds", | |
), | |
# FELONY - UNSPECIFIED terms | |
RegexSubstitution( | |
"rendering assistance", | |
fr"\brend{sep}assist\b", | |
"rendering assistance", | |
priority=5, | |
), | |
RegexSubstitution( | |
"criminal assistance", | |
fr"\b(?:crim|criminal){sep}assist\b", | |
"criminal assistance", | |
priority=4, | |
), | |
RegexSubstitution( | |
"consummate", | |
fr"\b(?:consu|consummat)\b", | |
"consummate", | |
priority=4, | |
), | |
RegexSubstitution( | |
"deliver", | |
fr"\bdelive\b", | |
"deliver", | |
priority=4, | |
), | |
RegexSubstitution( | |
"to commit", | |
fr"\bto{sep}comm\b", | |
"to commit", | |
priority=4, | |
), | |
RegexSubstitution( | |
"violation of", | |
fr"\b(?:viol?|vio){sep}of\b", | |
"violation of", | |
priority=4, | |
), | |
RegexSubstitution( | |
"violation of civil", | |
fr"\bvol?{sep}civil\b", | |
"violation of civil", | |
priority=4, | |
), | |
RegexSubstitution("rendering", fr"\brend\b", "rendering"), | |
RegexSubstitution( | |
"assistance first degree", | |
fr"\bassistance{sep}1\b", | |
"assistance first degree", | |
priority=30, | |
), | |
RegexSubstitution( | |
"assistance second degree", | |
fr"\bassistance{sep}2\b", | |
"assistance second degree", | |
priority=30, | |
), | |
RegexSubstitution( | |
"assistance third degree", | |
fr"\bassistance{sep}3\b", | |
"assistance third degree", | |
priority=30, | |
), | |
RegexSubstitution( | |
"class", | |
fr"\bclas\b", | |
"class", | |
), | |
RegexSubstitution( | |
"accessory", | |
fr"\b(?:accessry|accsry)\b", | |
"accessory", | |
), | |
RegexSubstitution( | |
"dependency", | |
fr"\bdepndncy\b", | |
"dependency", | |
), | |
RegexSubstitution( | |
"unspecified", | |
fr"\bunspfd\b", | |
"unspecified", | |
), | |
RegexSubstitution( | |
"responsibility", | |
fr"\brespon?\b", | |
"responsibility", | |
), | |
RegexSubstitution( | |
"classification", | |
fr"\bclassif\b", | |
"classification", | |
), | |
RegexSubstitution( | |
"vice president", | |
fr"\bvp\b", | |
"vice president", | |
priority=30, | |
), | |
# BRIBERY terms | |
RegexSubstitution( | |
"personal", | |
fr"\bpersona\b", | |
"personal", | |
), | |
RegexSubstitution( | |
"assistance", | |
fr"\basst\b", | |
"assistance", | |
), | |
RegexSubstitution( | |
"service", | |
fr"\bserv\b", | |
"service", | |
), | |
RegexSubstitution( | |
"facilitation", | |
fr"\b(?:facil|fac)\b", | |
"facilitation", | |
), | |
RegexSubstitution( | |
"smuggling", | |
fr"\bsmug\b", | |
"smuggling", | |
), | |
RegexSubstitution( | |
"health", | |
fr"\bhlth\b", | |
"health", | |
), | |
RegexSubstitution( # NOTE: 'off' tends to be 'offense' hence the priority on this one | |
"official position", fr"\boff{sep}position\b", "official position", priority=5 | |
), | |
RegexSubstitution( | |
"participants", | |
fr"\bparticipnts\b", | |
"participants", | |
), | |
RegexSubstitution( | |
"contestant", | |
fr"\bcntst\b", | |
"contestant", | |
), | |
RegexSubstitution( | |
"accept", | |
fr"\baccpt\b", | |
"accept", | |
), | |
RegexSubstitution( | |
"campaign contribution", | |
fr"\bcamp{sep}cont\b", | |
"campaign contribution", | |
), | |
RegexSubstitution( | |
"influence", | |
fr"\b(?:inflnce|influenc)\b", | |
"influence", | |
), | |
RegexSubstitution( | |
"compensation", | |
fr"\bcompens\b", | |
"compensation", | |
), | |
RegexSubstitution( | |
"treatment", | |
fr"\btreatm\b", | |
"treatment", | |
), | |
RegexSubstitution( | |
"commercial bribe", | |
fr"\b(?:comm|comm\'l){sep}bribe\b", | |
"commercial bribe", | |
), | |
RegexSubstitution( | |
"false testimony", | |
fr"\bfalse{sep}test\b", | |
"false testimony", | |
), | |
RegexSubstitution( | |
"miscellaneous", | |
fr"\bmisc\b", | |
"miscellaneous", | |
), | |
RegexSubstitution( | |
"impersonating", | |
fr"\bimpers\b", | |
"impersonating", | |
), | |
RegexSubstitution( | |
"receiving", | |
fr"\brecv\b", | |
"receiving", | |
), | |
RegexSubstitution( | |
"interfere with official process", | |
fr"\binterfere{sep}w{sep}offc{sep}proc\b", | |
"interfere with official process", | |
priority=5, | |
), | |
RegexSubstitution("public record", fr"\b(?:public|pub){sep}rec\b", "public record"), | |
RegexSubstitution( | |
"public servant", | |
fr"\b(?:public|pub){sep}(?:servant|srv|srvnt)\b", | |
"public servant", | |
), | |
RegexSubstitution( # NOTE: 'wit' also maps to 'withdraw', hence priority here | |
"witness juror", | |
fr"\b(?:witness|wit){sep}(?:juror|jur)\b", | |
"witness juror", | |
priority=5, | |
), | |
RegexSubstitution( | |
"umpire referee", fr"\b(?:umpire|ump){sep}(?:referee|ref)\b", "umpire referee" | |
), | |
# FAMILY RELATED OFFENSES | |
RegexSubstitution( | |
"custody interference", | |
fr"\bcust{sep}inter\b", | |
"custody interference", | |
), | |
RegexSubstitution( | |
"custody interference second degree", | |
fr"\bcust{sep}inter{sep}2\b", | |
"custody interference second degree", | |
priority=5, | |
), | |
RegexSubstitution( | |
"abandonment", | |
fr"\babandonmnt\b", | |
"abandonment", | |
), | |
RegexSubstitution( | |
"unattended", | |
fr"\bunatt\b", | |
"unattended", | |
), | |
RegexSubstitution( | |
"endanger", | |
fr"\b(?:endngr|endgr|endang)\b", | |
"endanger", | |
), | |
RegexSubstitution( | |
"welfare", | |
fr"\b(?:wlfre|wlfr)\b", | |
"welfare", | |
), | |
RegexSubstitution( | |
"endanger welfare", | |
fr"\b(?:endngr|endgr|endang){sep}(?:wlfre|wlfr|wel)\b", | |
"endanger welfare", | |
), | |
RegexSubstitution( | |
"neglect", | |
fr"\bneglct\b", | |
"neglect", | |
), | |
RegexSubstitution( | |
"contribute", | |
fr"\bcontrib\b", | |
"contribute", | |
), | |
RegexSubstitution( | |
"delinquincy", | |
fr"\b(?:dlnqncy|delinq)\b", | |
"delinquincy", | |
), | |
RegexSubstitution( | |
"service", | |
fr"\bsrvc\b", | |
"service", | |
), | |
RegexSubstitution( | |
"misrepresentation", | |
fr"\bmisrep\b", | |
"misrepresentation", | |
), | |
RegexSubstitution( | |
"disabled", | |
fr"\bdisabld\b", | |
"disabled", | |
), | |
# === | |
RegexSubstitution( | |
"system of records exempt", | |
fr"\bsor{sep}exempt\b", | |
"system of records exempt", | |
), | |
RegexSubstitution( | |
"type", | |
r"\btyp\b", | |
"type", | |
), | |
RegexSubstitution( | |
"misconduct", | |
r"\b(?:miscond|miscon)\b", | |
"misconduct", | |
), | |
RegexSubstitution( | |
"mischief", | |
r"\bmisch\b", | |
"mischief", | |
), | |
RegexSubstitution( | |
"probation revocation", | |
fr"\bprob{sep}(?:rev|revo)\b", | |
"probation revocation", | |
), | |
RegexSubstitution( | |
"management", | |
r"\bmgmt\b", | |
"management", | |
), | |
RegexSubstitution( | |
"subsistence", | |
r"\bsubsist\b", | |
"subsistence", | |
), | |
RegexSubstitution( | |
"penalty group", | |
r"\bpg\b", | |
"penalty group", | |
), | |
RegexSubstitution( | |
"community custody", | |
r"\bcomm custody\b", | |
"community custody", | |
), | |
RegexSubstitution( | |
"contempt", | |
r"\bcntmpt\b", | |
"contempt", | |
), | |
RegexSubstitution( | |
"counterfeit", | |
r"\b(?:cntft|cntrft|cntrfeit|cnterft|contrft|contrfit)\b", | |
"counterfeit", | |
), | |
RegexSubstitution( | |
"counts", | |
r"\b(?:cts|cnts)\b", | |
"counts", | |
), | |
RegexSubstitution( | |
"victim", | |
r"\b(?:vict|vctm|vic)\b", | |
"victim", | |
), | |
# NUMBER TERMS =========== | |
RegexSubstitution("first", r"\b1st\b", "first", priority=20), | |
RegexSubstitution( | |
"first degree", fr"\b(?:first|1|1st){sep}(?:dgr|dg|de|d)\b", "first degree" | |
), | |
RegexSubstitution("first degree 2", fr"\b1dg\b", "first degree"), | |
RegexSubstitution( | |
"circumstances in the first degree", | |
fr"\bcircumstances{sep}1\b", | |
"circumstances in the first degree", | |
), | |
RegexSubstitution("second", r"\b2nd\b", "second", priority=20), | |
RegexSubstitution( | |
"second degree", fr"\b(?:second|2|2nd){sep}(?:dgr|dg|de|d)\b", "second degree" | |
), | |
RegexSubstitution( | |
"circumstances in the second degree", | |
fr"\bcircumstances{sep}2\b", | |
"circumstances in the second degree", | |
), | |
RegexSubstitution("third", r"\b3rd\b", "third", priority=20), | |
RegexSubstitution( | |
"third degree", fr"\b(?:third|3|3rd){sep}(?:dgr|dg|de|d)\b", "third degree" | |
), | |
RegexSubstitution( | |
"circumstances in the third degree", | |
fr"\bcircumstances{sep}3\b", | |
"circumstances in the third degree", | |
), | |
RegexSubstitution("fourth", r"\b4th\b", "fourth", priority=20), | |
RegexSubstitution("fifth", r"\b5th\b", "fifth", priority=20), | |
RegexSubstitution("sixth", r"\b6th\b", "sixth", priority=20), | |
RegexSubstitution("seventh", r"\b7th\b", "seventh", priority=20), | |
RegexSubstitution("eighth", r"\b8th\b", "eighth", priority=20), | |
RegexSubstitution("ninth", r"\b9th\b", "ninth", priority=20), | |
RegexSubstitution("tenth", r"\b10th\b", "tenth", priority=20), | |
# SCHEDULE terms =========== | |
# observed "l" for use of "i" across schedule terms | |
RegexSubstitution( | |
"Schedule", r"\b(?:sc?he?d?|sch|sched|schd)\b", "schedule", priority=9 | |
), | |
RegexSubstitution( | |
"schedule one", | |
fr"\bschedule{sep}(?:i|1|l)\b", | |
"schedule one", | |
), | |
RegexSubstitution( | |
"schedule two", | |
fr"\bschedule{sep}(?:ii|2|ll)\b", | |
"schedule two", | |
), | |
RegexSubstitution( | |
"schedule three", | |
fr"\bschedule{sep}(?:iii|3|lll)\b", | |
"schedule three", | |
), | |
RegexSubstitution( | |
"schedule four", | |
fr"\bschedule{sep}(?:iv|4|lv)\b", | |
"schedule four", | |
), | |
RegexSubstitution( | |
"schedule five", | |
fr"\bschedule{sep}(?:v|5)\b", | |
"schedule five", | |
), | |
RegexSubstitution( | |
"schedule six", | |
fr"\bschedule{sep}(?:vi|6|vl)\b", | |
"schedule six", | |
), | |
# DRIVING TERMS =========== | |
RegexSubstitution( | |
"driving", | |
r"\bdrvg\b", | |
"driving", | |
), | |
RegexSubstitution( | |
"driving 2", | |
fr"\bdriv{sep}g\b", | |
"driving", | |
), | |
RegexSubstitution( | |
"failure to yield", | |
fr"\bfty\b", | |
"failure to yield", | |
), | |
RegexSubstitution( | |
"permit", | |
fr"\bperm\b", | |
"permit", | |
), | |
RegexSubstitution( | |
"registration", | |
fr"\b(?:regis|registra)\b", | |
"registration", | |
), | |
RegexSubstitution( | |
"driving under the influence", | |
r"\bdui\b", | |
"driving under the influence", | |
), | |
RegexSubstitution( | |
"driving while impaired", | |
r"\bdwi\b", | |
"driving while impaired", | |
), | |
RegexSubstitution( | |
"driving while license suspended", | |
r"\bdwls\b", | |
"driving while license suspended", | |
), | |
RegexSubstitution( | |
"driving while license revoked", | |
r"\bdwlr\b", | |
"driving while license revoked", | |
), | |
RegexSubstitution( | |
"revoked", | |
r"\brevkd\b", | |
"revoked", | |
), | |
RegexSubstitution( | |
"reckless endangerment", | |
fr"\breckles{sep}endanger\b", | |
"reckless endangerment", | |
), | |
RegexSubstitution( | |
"highway", | |
fr"\bhi{sep}way\b", | |
"highway", | |
), | |
RegexSubstitution( | |
"reckless driving", | |
fr"\brek{sep}dr?\b", | |
"reckless driving", | |
), | |
# ======== | |
RegexSubstitution( | |
"retail theft", | |
fr"\bretail{sep}thft\b", | |
"retail theft", | |
), | |
RegexSubstitution( | |
"impregnate girl", | |
fr"\b(?:impregnate|impreg){sep}(?:girl|grl)\b", | |
"impregnate girl", | |
), | |
RegexSubstitution( | |
"worker compensation", | |
fr"\bwrkr{sep}cmp\b", | |
"worker compensation", | |
), | |
RegexSubstitution( | |
"disregard", | |
fr"\bdisreg\b", | |
"disregard", | |
), | |
RegexSubstitution( | |
"electrical appliance", | |
fr"\belct{sep}appl\b", | |
"electrical appliance", | |
), | |
RegexSubstitution( | |
"serial number", | |
fr"\b(?:serial|ser){sep}(?:number|nmbr|num|nu|no)\b", | |
"serial number", | |
), | |
# DISTRIBUTION / FURNISH / TRAFFICK TERMS ======= | |
RegexSubstitution( # TODO: revisit traff/traf', more likely to be traffick/ing but could be traffic (cars) | |
"traffick", | |
r"\b(?:tfk|traff|traf)\b", | |
"traffick", | |
), | |
RegexSubstitution( # TODO: revisit adding 'dist', more likely to be distribution but could be disturbance | |
"distribution", | |
r"\b(?:distr|distrib)\b", | |
"distribution", | |
), | |
RegexSubstitution( | |
"attempted distribution", | |
fr"\b(?:at|att|attempted){sep}dist\b", | |
"attempted distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"illegal distribution", | |
fr"\billgl{sep}dist\b", | |
"intent distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"buy distribute", | |
fr"\bbuy{sep}dist\b", | |
"buy distribute", | |
), | |
RegexSubstitution( | |
"intent distribute", | |
fr"\b(?:intent|int){sep}dist\b", | |
"intent distribute", | |
priority=5, | |
), | |
RegexSubstitution( | |
"intent to distribute", | |
fr"\b(?:intent|int){sep}to{sep}dist\b", | |
"intent to distribute", | |
priority=5, | |
), | |
RegexSubstitution( | |
"distribution possession", | |
fr"\bdist{sep}(?:possession|possess|poss)\b", | |
"distribution possession", | |
priority=5, | |
), | |
RegexSubstitution( | |
"unauthorized distribution", | |
fr"\b(?:unauthorized|unauth|unau|unauthd){sep}dist\b", | |
"unauthorized distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"possession distribution", | |
fr"\b(?:possession|possess|poss){sep}dist\b", | |
"possession distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"unlaw distribution", | |
fr"\b(?:unlawful|unlaw){sep}dist\b", | |
"unlawful distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"distribution controlled", | |
fr"\bdist{sep}(?:controlled|cntrld|cntrl|contrlld)\b", | |
"distribution controlled", | |
priority=5, | |
), | |
RegexSubstitution( | |
"distribute schedule", | |
fr"\bdist{sep}(?:schedule|sch|sched)\b", | |
"distribute schedule", | |
priority=5, | |
), | |
RegexSubstitution( | |
"furnish", | |
r"\b(?:furnishing|furn)\b", | |
"furnish", | |
), | |
RegexSubstitution( # TODO: revisit adding 'man', more likely to be manufacture/ing but could have other meaning | |
"manufacturing", | |
r"\b(?:manuf|manu|mfg|manf|manfac)\b", | |
"manufacturing", | |
), | |
RegexSubstitution( | |
"manufacturing distribution sell", | |
fr"\b(?:manuf|manu|man|mfg|manf|manfac){sep}dist{sep}sell\b", | |
"manufacturing distribution sell", | |
priority=5, | |
), | |
RegexSubstitution( | |
"record sell rent distribute", | |
fr"\brecord{sep}sell{sep}rent{sep}dist\b", | |
"record sell rent distribute", | |
priority=5, | |
), | |
RegexSubstitution( | |
"sell distribute", | |
fr"\bsell{sep}dist\b", | |
"sell distribute", | |
priority=5, | |
), | |
RegexSubstitution( | |
"sale distribute", | |
fr"\bsale{sep}dist\b", | |
"sale distribute", | |
priority=5, | |
), | |
RegexSubstitution( | |
"offer agree to distribute", | |
fr"\boffer{sep}agree{sep}to{sep}dist\b", | |
"offer agree distribute", | |
priority=5, | |
), | |
RegexSubstitution( | |
"arrange to distribute", | |
fr"\barrange{sep}to{sep}dist\b", | |
"arrange to distribute", | |
priority=5, | |
), | |
RegexSubstitution( | |
"arrange to distribute 2", | |
fr"\barrange{sep}dist\b", | |
"arrange to distribute", | |
priority=5, | |
), | |
RegexSubstitution( | |
"controlled substance distribution", | |
fr"\bcontr{sep}sub{sep}dist\b", | |
"controlled substance distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"manufacturing deliver distribution", | |
fr"\b(?:manuf|manu|man|mfg|manf){sep}del{sep}dist\b", | |
"manufacturing deliver distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"possession distribution manufacturing", | |
fr"\bposs{sep}dist{sep}manuf\b", | |
"possession distribution manufacturing", | |
priority=5, | |
), | |
RegexSubstitution( | |
"with intent to distribute", | |
fr"\bwitd\b", | |
"with intent to distribute", | |
priority=5, | |
), | |
RegexSubstitution( | |
"possession with intent to distribute", | |
fr"\bposs{sep}(?:with|w){sep}(?:intent|int|i){sep}dist\b", | |
"possession with intent to distribute", | |
priority=5, | |
), | |
RegexSubstitution( | |
"manufacturing distribution possession", | |
fr"\b(?:manuf|manu|man|mfg|manf){sep}dist{sep}(?:p|poss|pos)\b", | |
"manufacturing distribution possession", | |
priority=5, | |
), | |
RegexSubstitution( | |
"manufacturing distribution", | |
fr"\b(?:manuf|manu|man|mfg|manf){sep}dist\b", | |
"manufacturing distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"distribution obscene material", | |
fr"\bdist{sep}(?:obscene|obs|obsc){sep}(?:material|mat|mtrl)\b", | |
"distribution obscene material", | |
priority=5, | |
), | |
RegexSubstitution( | |
"harmful material", | |
fr"\b(?:harmful|hrmf){sep}(?:material|mat|mtrl)\b", | |
"harmful material", | |
priority=5, | |
), | |
RegexSubstitution( | |
"obscene material distribution", | |
fr"\b(?:obscene|obs|obsc){sep}(?:material|mat|mtrl){sep}dist\b", | |
"obscene material distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"material", | |
fr"\b(?:matrl|mat|mtrl)\b", | |
"material", | |
priority=5, | |
), | |
RegexSubstitution( | |
"distribution child porn", | |
fr"\bdist{sep}child{sep}porn\b", | |
"distribution child porn", | |
priority=5, | |
), | |
RegexSubstitution( | |
"distribution controlled substances", | |
fr"\bdist{sep}cds\b", | |
"distribution controlled substances", | |
priority=5, | |
), | |
RegexSubstitution( | |
"controlled substances distribution ", | |
fr"\bcds{sep}dist\b", | |
"controlled substances distribution ", | |
priority=5, | |
), | |
RegexSubstitution( | |
"distribution narcotics", | |
fr"\bdist{sep}narc\b", | |
"distribution narcotics", | |
priority=5, | |
), | |
RegexSubstitution( | |
"deliver or distribution", | |
fr"\bdel{sep}or{sep}dist\b", | |
"deliver or distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"criminal distribution", | |
fr"\bcriminal{sep}dist\b", | |
"criminal distribution", | |
priority=5, | |
), | |
RegexSubstitution( | |
"purchase", | |
r"\bpur\b", | |
"purchase", | |
), | |
# DRUG TERMS =========== | |
RegexSubstitution( | |
"marijuana", | |
r"\b(?:marij|marihuana|mari|marijuan|marijua|mariju|mj)\b", | |
"marijuana", | |
), | |
RegexSubstitution( | |
"hydrocodone", | |
r"\bhydroc\b", | |
"hydrocodone", | |
), | |
RegexSubstitution( | |
"cocaine", | |
r"\b(?:cocain|coca|cocai|cocne)\b", | |
"cocaine", | |
), | |
RegexSubstitution( | |
"crack or cocaine", | |
r"\bcoc\b", | |
"crack or cocaine", | |
), | |
RegexSubstitution( | |
"rohypnol", | |
r"\brohypnl\b", | |
"rohypnol", | |
), | |
RegexSubstitution( | |
"heroine", | |
r"\bher\b", | |
"heroine", | |
), | |
RegexSubstitution( | |
"heroine", | |
r"\bher\b", | |
"heroine", | |
), | |
RegexSubstitution( | |
"ecstasy", | |
r"\bmdma\b", | |
"ecstasy", | |
), | |
RegexSubstitution( | |
"methamphetamine", | |
r"\b(?:meth|metham|methamphet|methamph)\b", | |
"methamphetamine", | |
), | |
RegexSubstitution( | |
"paraphernalia", | |
r"\b(?:para|paraph|paraphenalia|parap)\b", | |
"paraphernalia", | |
), | |
RegexSubstitution( | |
"grams", | |
r"\b(?:gr|gms|grms)\b", | |
"grams", | |
), | |
RegexSubstitution( | |
"gram", | |
r"\bgm\b", | |
"gram", | |
), | |
RegexSubstitution( | |
"kilograms", | |
r"\bkg\b", | |
"kilograms", | |
), | |
RegexSubstitution( | |
"pounds", | |
r"\blb\b", | |
"pounds", | |
), | |
RegexSubstitution( | |
"ounces", | |
r"\boz\b", | |
"ounces", | |
), | |
# ALCOHOL / LIQUOR terms =========== | |
RegexSubstitution( | |
"alcoholic beverage", r"\balc\Wbev\b", "alcoholic beverage", priority=5 | |
), | |
RegexSubstitution( | |
"beverage", | |
r"\bbev\b", | |
"beverage", | |
), | |
RegexSubstitution( | |
"blood alcohol concentration", | |
r"\bbac\b", | |
"blood alcohol concentration", | |
), | |
RegexSubstitution( | |
"alcohol", | |
r"\b(?:alc|alch|alchol|alcohl|alco|alcoh|alcoho)\b", | |
"alcohol", | |
), | |
RegexSubstitution( | |
"over legal", | |
fr"\b(?:over|ov){sep}(?:legal|leg)\b", | |
"over legal", | |
), | |
RegexSubstitution( | |
"supply", | |
fr"\bsupp\b", | |
"supply", | |
), | |
RegexSubstitution( | |
"liquor", | |
fr"\bliq\b", | |
"liquor", | |
), | |
RegexSubstitution( | |
"distill", | |
r"\bdstl\b", | |
"distill", | |
), | |
RegexSubstitution( | |
"minor in possession", | |
fr"\bmip\b", | |
"minor in possession", | |
), | |
RegexSubstitution( | |
"premises", | |
fr"\bprem\b", | |
"premises", | |
), | |
RegexSubstitution( | |
"consume", | |
fr"\bcnsum\b", | |
"consume", | |
), | |
RegexSubstitution( | |
"intoxication", | |
fr"\bintox\b", | |
"intoxication", | |
), | |
RegexSubstitution( | |
"available", | |
fr"\bavail\b", | |
"available", | |
), | |
RegexSubstitution( | |
"unlicensed", | |
fr"\bunlic\b", | |
"unlicensed", | |
), | |
RegexSubstitution( | |
"large amount", | |
fr"\blg{sep}amt\b", | |
"large amount", | |
), | |
RegexSubstitution( | |
"small amount", | |
fr"\bsm{sep}amt\b", | |
"small amount", | |
), | |
RegexSubstitution( | |
"required", | |
fr"\breq\b", | |
"required", | |
), | |
RegexSubstitution( | |
"violate prohibition", | |
fr"\bvio{sep}prohibition\b", | |
"violate prohibition", | |
), | |
RegexSubstitution( | |
"enticement", | |
fr"\bentcmnt\b", | |
"enticement", | |
), | |
# SUBSTANCE TERMS ======== | |
RegexSubstitution( | |
"Substance", | |
r"\b(?:sub|subs|substanc|substan|substnces|subtance|substa|substnc|sunstance|subst)\b", | |
"substance", | |
20, | |
), | |
RegexSubstitution("controlled", r"\b(?:cntrld|cntrl|contrlld)\b", "controlled", 20), | |
RegexSubstitution( | |
"controlled dangerous substances", | |
r"\bcds\b", | |
"controlled dangerous substances", | |
), | |
RegexSubstitution( | |
"solicitation of controlled substances", | |
fr"\bsol{sep}cds\b", | |
"solicitation of controlled substances", | |
priority=4, | |
), | |
RegexSubstitution( | |
"solicitation", | |
fr"\b(?:solct|sol|solicit|solic)\b", | |
"solicitation", | |
), | |
RegexSubstitution( | |
"solicitation of narcotics", | |
fr"\bsol{sep}narc\b", | |
"solicitation of narcotics", | |
priority=4, | |
), | |
RegexSubstitution( | |
"Controlled Substance", | |
fr"\bcont?r?{sep}?subs?t?(?:\b|stance\b)", | |
"controlled substance", | |
), | |
RegexSubstitution( | |
"Controlled Substance 2", | |
r"\bc\W?s\b", | |
"controlled substance", | |
), | |
RegexSubstitution( | |
"unlawful possession of a controlled substance", | |
r"\bupcs\b", | |
"unlawful possession of a controlled substance", | |
), | |
] | |
def prep_text(text): | |
# Remove Commas from Numbers | |
text = re.sub(r"(\d+?),(\d+?)", r"\1\2", text) | |
# TODO: double check this `'s` regex | |
text = re.sub(r"\b(\S+?)'(s)", r"\1\2", text) | |
# replace hyphens with spaces | |
text = re.sub("-", " ", text) | |
# replace forward-slashes with spaces | |
text = re.sub("/", " ", text) | |
return text | |
def cleaner(text): | |
if pd.isnull(text): | |
return "" | |
# Prepare text for regex substitions | |
text = prep_text(text) | |
# Do all substitutions (Case insensitive on raw text) | |
substitutions_sorted = sorted(substitutions, key=lambda s: s.priority) | |
for substitution in substitutions_sorted: | |
text = re.sub(substitution.regex, substitution.replacement, text) | |
# Remove any terms we don't want | |
for removal in removals: | |
text = re.sub(removal.regex, " ", text) | |
# Then remove remaining punctuation | |
for punct in all_punctuation: | |
text = text.replace(punct, " ") | |
text = " ".join(text.split()) # removes extra spaces: " " → " " | |
text = text.lower() | |
return text | |