Spaces:
Runtime error
seanpedrickcase
committed
Commit • dd1cbb4
0 Parent(s)
Initial commit v1.0
Browse files
- .dockerignore +15 -0
- .gitattributes +35 -0
- .github/workflows/check_file_size.yml +16 -0
- .github/workflows/sync_to_hf.yml +20 -0
- .gitignore +14 -0
- Dockerfile +43 -0
- LICENSE +201 -0
- README.md +24 -0
- app.py +447 -0
- requirements.txt +13 -0
- tools/__init__.py +0 -0
- tools/addressbase_api_funcs.py +197 -0
- tools/aws_functions.py +166 -0
- tools/constants.py +435 -0
- tools/fuzzy_match.py +437 -0
- tools/gradio.py +63 -0
- tools/matcher_funcs.py +1300 -0
- tools/model_predict.py +318 -0
- tools/preparation.py +456 -0
- tools/pytorch_models.py +155 -0
- tools/recordlinkage_funcs.py +384 -0
- tools/standardise.py +722 -0
.dockerignore
ADDED
@@ -0,0 +1,15 @@
*.ipynb
*checkpoint.py
*.pyc
*.csv
*.parquet
*.pem
*.pkl
*.env
*.zip
test/*
nnet_model/*
deprecated_models/*
.ipynb_checkpoints/*
orchestration/*
.vscode/*
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.zip filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
.github/workflows/check_file_size.yml
ADDED
@@ -0,0 +1,16 @@
name: Check file size
on:               # or directly `on: [push]` to run the action on every push on any branch
  pull_request:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - name: Check large files
        uses: ActionsDesk/[email protected]
        with:
          filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/sync_to_hf.yml
ADDED
@@ -0,0 +1,20 @@
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: git push https://seanpedrickcase:[email protected]/spaces/seanpedrickcase/address_matcher main
.gitignore
ADDED
@@ -0,0 +1,14 @@
*.ipynb
*checkpoint.py
*.pyc
*.csv
*.parquet
*.pem
*.pkl
*.env
*.zip
test/*
deprecated_models/*
.ipynb_checkpoints/*
orchestration/*
.vscode/*
Dockerfile
ADDED
@@ -0,0 +1,43 @@
FROM public.ecr.aws/docker/library/python:3.11.8-slim-bookworm
# FROM public.ecr.aws/docker/library/python:3.10.13-slim

WORKDIR /src

COPY requirements.txt .

RUN pip install -r requirements.txt

# Set up a new user named "user" with user ID 1000
#RUN useradd -m -u 1000 user

# Change ownership of /home/user directory
#RUN chown -R user:user /home/user

# Create the temp files directory and set its permissions
#RUN mkdir -p /home/user/tmp && chown -R user:user /home/user/tmp

# Switch to the "user" user
#USER user

# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONPATH=$HOME/app \
    PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7861 \
    GRADIO_THEME=huggingface \
    #GRADIO_TEMP_DIR=$HOME/tmp \
    #GRADIO_ROOT_PATH=/address-match \
    SYSTEM=spaces

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
#COPY --chown=user . $HOME/app
COPY . $HOME/app

CMD ["python", "app.py"]
LICENSE
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

[The remainder of the file is the full, unmodified Apache License 2.0 text: Sections 1-9 (Definitions; Grant of Copyright License; Grant of Patent License; Redistribution; Submission of Contributions; Trademarks; Disclaimer of Warranty; Limitation of Liability; Accepting Warranty or Additional Liability), the END OF TERMS AND CONDITIONS marker, and the Appendix with the standard "Copyright [yyyy] [name of copyright owner]" boilerplate notice for applying the license.]
README.md
ADDED
@@ -0,0 +1,24 @@
---
title: Address Matching
emoji: 🌍
colorFrom: purple
colorTo: gray
sdk: gradio
sdk_version: 4.20.1
app_file: app.py
pinned: false
license: mit
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

# Introduction
Fuzzy matching of a dataset against an LLPG dataset in the LPI format (with columns SaoText, SaoStartNumber etc.). Address columns are concatenated into a single string address. Important details are extracted by regex (e.g. flat and house numbers, postcodes). Addresses may be 'standardised' in a number of ways; for example, variations of the words used for 'ground floor', such as 'grd' or 'grnd', are replaced with 'ground floor' to give more consistent address wording. This has been found to increase match rates (a minimal sketch of this step follows this file).

The two datasets are then compared with fuzzy matching. The closest fuzzy matches are selected, and a post hoc test compares flat/property numbers to confirm a 'full match'.

If the LLPG reference file is in the standard LPI format, the neural net model should then initialise. This breaks the addresses to be matched down into a list of sub-address fields in the LLPG LPI format, and then does exact or fuzzy comparisons of each field against the LLPG dataset to find the closest matches. The neural net is capable of blocking on postcode and on street name, which is where most of the new matches are found according to testing.

The final files will appear in the relevant output boxes, which you can download.
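A minimal sketch of the standardise-then-fuzzy-match idea described in the README above, using the rapidfuzz package pinned in requirements.txt. The synonym map, sample addresses, and score cutoff are illustrative assumptions rather than the app's own values (its real logic lives in tools/standardise.py and tools/fuzzy_match.py):

# Minimal sketch: standardise a few synonyms, then fuzzy-match against reference addresses.
# The synonym map, addresses, and cutoff below are illustrative assumptions only.
import re
from rapidfuzz import process, fuzz

SYNONYMS = {r"\bgrd\b": "ground floor", r"\bgrnd\b": "ground floor", r"\bflr\b": "floor"}

def standardise(address: str) -> str:
    addr = address.lower().strip()
    for pattern, replacement in SYNONYMS.items():
        addr = re.sub(pattern, replacement, addr)
    return re.sub(r"\s+", " ", addr)

search = ["Flat A Grd Flr 12 High Street SW2 1AA"]
reference = ["GROUND FLOOR FLAT A, 12 HIGH STREET, SW2 1AA",
             "FIRST FLOOR FLAT B, 12 HIGH STREET, SW2 1AA"]

ref_std = [standardise(r) for r in reference]
for addr in search:
    match, score, idx = process.extractOne(standardise(addr), ref_std, scorer=fuzz.token_sort_ratio)
    if score >= 80:  # illustrative cutoff; the real post hoc check also compares flat/house numbers
        print(f"{addr!r} -> {reference[idx]!r} (score {score:.0f})")

token_sort_ratio is used here because it tolerates reordered address parts; the app itself may use a different scorer.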
app.py
ADDED
@@ -0,0 +1,447 @@
# Load in packages, variables for fuzzy matching
import os
from datetime import datetime
from pathlib import Path
import time
import copy
import gradio as gr
import re
#import polars as pl

from tools.constants import *
from tools.matcher_funcs import load_matcher_data, run_match_batch, combine_two_matches, create_match_summary
from tools.gradio import initial_data_load
from tools.aws_functions import load_data_from_aws
from tools.preparation import prepare_search_address_string, prepare_search_address, prepare_ref_address, remove_non_postal, check_no_number_addresses
from tools.standardise import standardise_wrapper_func

import warnings
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression')
warnings.filterwarnings("ignore", 'Downcasting behavior')
warnings.filterwarnings("ignore", 'A value is trying to be set on a copy of a slice from a DataFrame')
warnings.filterwarnings("ignore")


today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")

# Base folder is where the code file is stored
base_folder = Path(os.getcwd())
input_folder = base_folder/"Input/"
output_folder = base_folder/"Output/"
diagnostics_folder = base_folder/"Diagnostics/"
prep_folder = base_folder/"Helper functions/"

def create_simple_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int):
    #print("Search df batch size: ", batch_size)
    #print("ref_df df batch size: ", ref_batch_size)

    total_rows = df.shape[0]
    ref_total_rows = ref_df.shape[0]

    # Creating bottom and top limits for search data
    search_ranges = []
    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size - 1, total_rows - 1) # Adjusted to get the top limit
        search_ranges.append((start, end))

    # Creating bottom and top limits for reference data
    ref_ranges = []
    for start in range(0, ref_total_rows, ref_batch_size):
        end = min(start + ref_batch_size - 1, ref_total_rows - 1) # Adjusted to get the top limit
        ref_ranges.append((start, end))

    # Create DataFrame with combinations of search_range and ref_range
    result_data = []
    for search_range in search_ranges:
        for ref_range in ref_ranges:
            result_data.append((search_range, ref_range))

    range_df = pd.DataFrame(result_data, columns=['search_range', 'ref_range'])

    return range_df


def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int, search_postcode_col:str, ref_postcode_col:str):
    '''
    Create batches of address indexes for search and reference dataframes based on shortened postcodes.
    '''

    # If df sizes are smaller than the batch size limits, no need to run through everything
    if len(df) < batch_size and len(ref_df) < ref_batch_size:
        print("Dataframe sizes are smaller than maximum batch sizes, no need to split data.")
        lengths_df = pd.DataFrame(data={'search_range':[df.index.tolist()], 'ref_range':[ref_df.index.tolist()], 'batch_length':len(df), 'ref_length':len(ref_df)})
        return lengths_df

    #df.index = df[search_postcode_col]

    df['index'] = df.index
    ref_df['index'] = ref_df.index

    # Remove the last character of postcode
    df['postcode_minus_last_character'] = df[search_postcode_col].str.lower().str.strip().str.replace("\s+", "", regex=True).str[:-1]
    ref_df['postcode_minus_last_character'] = ref_df[ref_postcode_col].str.lower().str.strip().str.replace("\s+", "", regex=True).str[:-1]

    unique_postcodes = df['postcode_minus_last_character'][df['postcode_minus_last_character'].str.len()>=4].unique().tolist()

    df = df.set_index('postcode_minus_last_character')
    ref_df = ref_df.set_index('postcode_minus_last_character')

    df = df.sort_index()
    ref_df = ref_df.sort_index()

    #df.to_csv("batch_search_df.csv")

    # Overall batch variables
    batch_indexes = []
    ref_indexes = []
    batch_lengths = []
    ref_lengths = []

    # Current batch variables for loop
    current_batch = []
    current_ref_batch = []
    current_batch_length = []
    current_ref_length = []

    unique_postcodes_iterator = unique_postcodes.copy()

    while unique_postcodes_iterator:

        unique_postcodes_loop = unique_postcodes_iterator.copy()

        #print("Current loop postcodes: ", unique_postcodes_loop)

        for current_postcode in unique_postcodes_loop:

            if len(current_batch) >= batch_size or len(current_ref_batch) >= ref_batch_size:
                print("Batch length reached - breaking")
                break

            try:
                current_postcode_search_data_add = df.loc[[current_postcode]]#[df['postcode_minus_last_character'].isin(current_postcode)]
                current_postcode_ref_data_add = ref_df.loc[[current_postcode]]#[ref_df['postcode_minus_last_character'].isin(current_postcode)]

                #print(current_postcode_search_data_add)

                if not current_postcode_search_data_add.empty:
                    current_batch.extend(current_postcode_search_data_add['index'])

                if not current_postcode_ref_data_add.empty:
                    current_ref_batch.extend(current_postcode_ref_data_add['index'])

            except:
                #print("postcode not found: ", current_postcode)
                pass

            unique_postcodes_iterator.remove(current_postcode)

        # Append the batch data to the master lists and reset lists
        batch_indexes.append(current_batch)
        ref_indexes.append(current_ref_batch)

        current_batch_length = len(current_batch)
        current_ref_length = len(current_ref_batch)

        batch_lengths.append(current_batch_length)
        ref_lengths.append(current_ref_length)

        current_batch = []
        current_ref_batch = []
        current_batch_length = []
        current_ref_length = []

    # Create df to store lengths
    lengths_df = pd.DataFrame(data={'search_range':batch_indexes, 'ref_range':ref_indexes, 'batch_length':batch_lengths, 'ref_length':ref_lengths})

    return lengths_df


def run_matcher(in_text, in_file, in_ref, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
    '''
    Split search and reference data into batches. Loop and run through the match script.
    '''

    overall_tic = time.perf_counter()

    # Load in initial data. This will filter to relevant addresses in the search and reference datasets that can potentially be matched, and will pull in API data if asked for.
    InitMatch = load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, InitMatch, in_api, in_api_key)

    if InitMatch.search_df.empty or InitMatch.ref_df.empty:
        out_message = "Nothing to match!"
        print(out_message)
        return out_message, [InitMatch.results_orig_df_name, InitMatch.match_outputs_name]

    # Run initial address preparation and standardisation processes
    # Prepare address format

    # Polars implementation not yet finalised
    #InitMatch.search_df = pl.from_pandas(InitMatch.search_df)
    #InitMatch.ref_df = pl.from_pandas(InitMatch.ref_df)


    # Prepare all search addresses
    if type(InitMatch.search_df) == str:
        InitMatch.search_df_cleaned, InitMatch.search_df_key_field, InitMatch.search_address_cols = prepare_search_address_string(InitMatch.search_df)
    else:
        InitMatch.search_df_cleaned = prepare_search_address(InitMatch.search_df, InitMatch.search_address_cols, InitMatch.search_postcode_col, InitMatch.search_df_key_field)

    # Remove addresses that are not postal addresses
    InitMatch.search_df_cleaned = remove_non_postal(InitMatch.search_df_cleaned, "full_address")

    # Remove addresses that have no numbers in from consideration
    InitMatch.search_df_cleaned = check_no_number_addresses(InitMatch.search_df_cleaned, "full_address")

    # Initial preparation of reference addresses
    InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)


    # Sort dataframes by postcode - will allow for more efficient matching process if using multiple batches
    #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.sort_values(by="postcode")
    #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.sort_values(by="Postcode")

    # Polars implementation - not finalised
    #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.to_pandas()
    #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.to_pandas()

    # Standardise addresses
    # Standardise - minimal


    tic = time.perf_counter()
    InitMatch.search_df_after_stand, InitMatch.ref_df_after_stand = standardise_wrapper_func(
        InitMatch.search_df_cleaned.copy(),
        InitMatch.ref_df_cleaned.copy(),
        standardise = False,
        filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
        match_task="fuzzy") # InitMatch.search_df_after_stand_series, InitMatch.ref_df_after_stand_series

    toc = time.perf_counter()
    print(f"Performed the minimal standardisation step in {toc - tic:0.1f} seconds")

    # Standardise - full
    tic = time.perf_counter()
    InitMatch.search_df_after_full_stand, InitMatch.ref_df_after_full_stand = standardise_wrapper_func(
        InitMatch.search_df_cleaned.copy(),
        InitMatch.ref_df_cleaned.copy(),
        standardise = True,
        filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
        match_task="fuzzy") # , InitMatch.search_df_after_stand_series_full_stand, InitMatch.ref_df_after_stand_series_full_stand

    toc = time.perf_counter()
    print(f"Performed the full standardisation step in {toc - tic:0.1f} seconds")

    # Determine length of search df to create batches to send through the functions.
    #try:
    range_df = create_batch_ranges(InitMatch.search_df_cleaned.copy(), InitMatch.ref_df_cleaned.copy(), batch_size, ref_batch_size, "postcode", "Postcode")
    #except:
    #    range_df = create_simple_batch_ranges(InitMatch.search_df_cleaned, InitMatch.ref_df_cleaned, batch_size, #ref_batch_size)

    print("Batches to run in this session: ", range_df)

    OutputMatch = copy.copy(InitMatch)

    n = 0
    number_of_batches = range_df.shape[0]

    for row in progress.tqdm(range(0,len(range_df)), desc= "Running through batches", unit="batches", total=number_of_batches):
        print("Running batch ", str(n+1))

        search_range = range_df.iloc[row]['search_range']
        ref_range = range_df.iloc[row]['ref_range']

        #print("search_range: ", search_range)
        #pd.DataFrame(search_range).to_csv("search_range.csv")
        #print("ref_range: ", ref_range)

        BatchMatch = copy.copy(InitMatch)

        # Subset the search and reference dfs based on current batch ranges
        # BatchMatch.search_df = BatchMatch.search_df.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
        # BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
        # BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
        # BatchMatch.ref_df = BatchMatch.ref_df.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
        # BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)


        # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.iloc[search_range[0]:search_range[1] + 1]
        # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.iloc[ref_range[0]:ref_range[1] + 1]
        # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.iloc[search_range[0]:search_range[1] + 1]
        # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.iloc[ref_range[0]:ref_range[1] + 1]

        # BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
        # BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
        # BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
        # BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)

        BatchMatch.search_df = BatchMatch.search_df[BatchMatch.search_df.index.isin(search_range)].reset_index(drop=True)
        BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
        BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned[BatchMatch.search_df_cleaned.index.isin(search_range)].reset_index(drop=True)

        BatchMatch.ref_df = BatchMatch.ref_df[BatchMatch.ref_df.index.isin(ref_range)].reset_index(drop=True)
        BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned[BatchMatch.ref_df_cleaned.index.isin(ref_range)].reset_index(drop=True)

        # Dataframes after standardisation process
        BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand[BatchMatch.search_df_after_stand.index.isin(search_range)].reset_index(drop=True)
        BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand[BatchMatch.search_df_after_full_stand.index.isin(search_range)].reset_index(drop=True)

        ### Create lookup lists for fuzzy matches
        # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand.copy().set_index('postcode_search')['search_address_stand']
        # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_full_stand.copy().set_index('postcode_search')['search_address_stand']
        # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.sort_index()
        # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.sort_index()

        #BatchMatch.search_df_after_stand.reset_index(inplace=True, drop = True)
        #BatchMatch.search_df_after_full_stand.reset_index(inplace=True, drop = True)

        BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand[BatchMatch.ref_df_after_stand.index.isin(ref_range)].reset_index(drop=True)
        BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand[BatchMatch.ref_df_after_full_stand.index.isin(ref_range)].reset_index(drop=True)

        # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand.copy().set_index('postcode_search')['ref_address_stand']
        # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_full_stand.copy().set_index('postcode_search')['ref_address_stand']
        # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.sort_index()
        # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.sort_index()

        # BatchMatch.ref_df_after_stand.reset_index(inplace=True, drop=True)
        # BatchMatch.ref_df_after_full_stand.reset_index(inplace=True, drop=True)

        # Match the data, unless the search or reference dataframes are empty
        if BatchMatch.search_df.empty or BatchMatch.ref_df.empty:
            out_message = "Nothing to match for batch: " + str(n)
            print(out_message)
            BatchMatch_out = BatchMatch
            BatchMatch_out.results_on_orig_df = pd.DataFrame(data={"index":BatchMatch.search_df.index,
                                                                   "Excluded from search":False,
                                                                   "Matched with reference address":False})
        else:
            summary_of_summaries, BatchMatch_out = run_match_batch(BatchMatch, n, number_of_batches)

        OutputMatch = combine_two_matches(OutputMatch, BatchMatch_out, "All up to and including batch " + str(n+1))

        n += 1

    if in_api==True:
        OutputMatch.results_on_orig_df['Matched with reference address'] = OutputMatch.results_on_orig_df['Matched with reference address'].replace({1:True, 0:False})
        OutputMatch.results_on_orig_df['Excluded from search'] = OutputMatch.results_on_orig_df['Excluded from search'].replace('nan', False).fillna(False)

    # Remove any duplicates from reference df, prioritise successful matches
    OutputMatch.results_on_orig_df = OutputMatch.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")


    overall_toc = time.perf_counter()
    time_out = f"The overall match (all batches) took {overall_toc - overall_tic:0.1f} seconds"

    print(OutputMatch.output_summary)

    if OutputMatch.output_summary == "":
        OutputMatch.output_summary = "No matches were found."

    fuzzy_not_std_output = OutputMatch.match_results_output.copy()
    fuzzy_not_std_output_mask = ~(fuzzy_not_std_output["match_method"].str.contains("Fuzzy match")) | (fuzzy_not_std_output["standardised_address"] == True)
    fuzzy_not_std_output.loc[fuzzy_not_std_output_mask, "full_match"] = False
    fuzzy_not_std_summary = create_match_summary(fuzzy_not_std_output, "Fuzzy not standardised")

    fuzzy_std_output = OutputMatch.match_results_output.copy()
    fuzzy_std_output_mask = fuzzy_std_output["match_method"].str.contains("Fuzzy match")
    fuzzy_std_output.loc[fuzzy_std_output_mask == False, "full_match"] = False
    fuzzy_std_summary = create_match_summary(fuzzy_std_output, "Fuzzy standardised")

    nnet_std_output = OutputMatch.match_results_output.copy()
    nnet_std_summary = create_match_summary(nnet_std_output, "Neural net standardised")

    final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out

    return final_summary, [OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name]

# Create the gradio interface

block = gr.Blocks(theme = gr.themes.Base())

with block:

    data_state = gr.State(pd.DataFrame())
    ref_data_state = gr.State(pd.DataFrame())
    results_data_state = gr.State(pd.DataFrame())
    ref_results_data_state = gr.State(pd.DataFrame())

    gr.Markdown(
    """
    # Address matcher
    Match single or multiple addresses to the reference address file of your choice. Fuzzy matching should work on any address columns as long as you specify the postcode column at the end. The neural network component only activates with the in-house neural network model - contact me for details if you have access to AddressBase already. The neural network component works with LLPG files in the LPI format.

    The tool can accept csv, xlsx (with one sheet), and parquet files. You need to specify the address columns of the file to match specifically in the address column area, with postcode at the end.

    Use the 'New Column' button to create a new cell for each column name. After you have chosen a reference file, an address match file, and specified its address columns (plus postcode), you can press 'Match addresses' to run the tool.
    """)

    with gr.Tab("Match addresses"):

        with gr.Accordion("I have multiple addresses", open = True):
            in_file = gr.File(label="Input addresses from file", file_count= "multiple")
            in_colnames = gr.Dropdown(choices=[], multiselect=True, label="Select columns that make up the address. Make sure postcode is at the end")
            in_existing = gr.Dropdown(choices=[], multiselect=False, label="Select columns that indicate existing matches.")

        with gr.Accordion("I only have a single address", open = False):
            in_text = gr.Textbox(label="Input a single address as text")

        gr.Markdown(
        """
        ## Choose reference file
        Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc. This joins on the UPRN column. If any of these are different for you, open 'Custom reference file format or join columns' below.
        """)

        in_ref = gr.File(label="Input reference addresses from file", file_count= "multiple")

        with gr.Accordion("Use Addressbase API instead of reference file", open = False):
            in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
            in_api_key = gr.Textbox(label="Addressbase API key")

        with gr.Accordion("Custom reference file format or join columns (i.e. not LLPG LPI format)", open = False):
            in_refcol = gr.Dropdown(choices=[], multiselect=True, label="Select columns that make up the reference address. Make sure postcode is at the end")
            in_joincol = gr.Dropdown(choices=[], multiselect=True, label="Select columns you want to join on to the search dataset")

        match_btn = gr.Button("Match addresses")

        with gr.Row():
            output_summary = gr.Textbox(label="Output summary")
            output_file = gr.File(label="Output file")

    with gr.Tab(label="Advanced options"):
        with gr.Accordion(label = "AWS data access", open = False):
            aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
            with gr.Row():
                in_aws_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth address data example file"])
                load_aws_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")

            aws_log_box = gr.Textbox(label="AWS data load status")


    ### Loading AWS data ###
    load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_ref, aws_log_box])


    # Updates to components
    in_file.change(fn = initial_data_load, inputs=[in_file], outputs=[output_summary, in_colnames, in_existing, data_state, results_data_state])
    in_ref.change(fn = initial_data_load, inputs=[in_ref], outputs=[output_summary, in_refcol, in_joincol, ref_data_state, ref_results_data_state])

    match_btn.click(fn = run_matcher, inputs=[in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, in_api, in_api_key],
                    outputs=[output_summary, output_file], api_name="address")

# Simple run for HF spaces or local on your computer
#block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",

# Simple run for AWS server
block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861

# Download OpenSSL from here:
# Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
#block.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
#                     ssl_certfile="cert.pem", ssl_keyfile="key.pem") # port 443 for https. Certificates currently not valid

# Running on local server without https
#block.queue().launch(server_name="0.0.0.0", server_port=7861, ssl_verify=False)
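A minimal sketch of the postcode-based batching idea used by create_batch_ranges in app.py above: rows are grouped on the postcode with its last character removed, so each batch only compares addresses that already share most of a postcode. The example frames and column values below are illustrative assumptions, independent of the MatcherClass plumbing:

# Sketch of batching by "postcode minus last character"; example data and columns are assumptions.
import pandas as pd

search_df = pd.DataFrame({"full_address": ["12 High St", "3 Acre Ln", "9 Brixton Hill"],
                          "postcode": ["SW2 1AA", "SW2 1AB", "SW9 7AA"]})
ref_df = pd.DataFrame({"ref_address": ["12 HIGH STREET", "5 ACRE LANE", "9 BRIXTON HILL"],
                       "Postcode": ["SW2 1AD", "SW2 1AB", "SW9 7AB"]})

def shorten(pc: pd.Series) -> pd.Series:
    # Lowercase, strip whitespace, drop the last character - as in create_batch_ranges
    return pc.str.lower().str.replace(r"\s+", "", regex=True).str[:-1]

search_df["pc_short"] = shorten(search_df["postcode"])
ref_df["pc_short"] = shorten(ref_df["Postcode"])

# Each shortened postcode defines one candidate block of search/reference row indexes
for pc_short, block in search_df.groupby("pc_short"):
    ref_block = ref_df[ref_df["pc_short"] == pc_short]
    print(pc_short, "search rows:", block.index.tolist(), "ref rows:", ref_block.index.tolist())

The real function additionally accumulates these blocks until a batch-size limit is reached, producing the search_range/ref_range index lists consumed by run_matcher.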
requirements.txt
ADDED
@@ -0,0 +1,13 @@
#fuzzywuzzy==0.18.0
numpy==1.26.2
pandas==2.2.1
rapidfuzz==3.8.1
torch==2.2.1
recordlinkage==0.16
pyap==0.3.1
pytest==7.4.3
pyarrow==14.0.1
openpyxl==3.1.2
gradio==4.20.1
boto3==1.34.63
polars==0.20.19
tools/__init__.py
ADDED
File without changes
tools/addressbase_api_funcs.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
import urllib
|
3 |
+
from datetime import datetime
|
4 |
+
import pandas as pd
|
5 |
+
import time
|
6 |
+
import requests
|
7 |
+
|
8 |
+
today_rev = datetime.now().strftime("%Y%m%d")
|
9 |
+
|
10 |
+
|
11 |
+
# url = 'https://api.os.uk/search/places/v1/uprn?%s'
|
12 |
+
# params = urllib.parse.urlencode({'uprn':<UPRN>,'dataset':'LPI', 'key':os.environ["ADDRESSBASE_API_KEY"]})
|
13 |
+
|
14 |
+
# Places API
|
15 |
+
# Technical guide: https://osdatahub.os.uk/docs/places/technicalSpecification
|
16 |
+
|
17 |
+
|
18 |
+
def places_api_query(query, api_key, query_type):
|
19 |
+
|
20 |
+
def make_api_call(url):
|
21 |
+
max_retries = 3
|
22 |
+
retries = 0
|
23 |
+
|
24 |
+
while retries < max_retries:
|
25 |
+
try:
|
26 |
+
response = requests.get(url)
|
27 |
+
if response.status_code == 200:
|
28 |
+
# If successful response, return the response
|
29 |
+
return response
|
30 |
+
elif response.status_code == 429:
|
31 |
+
# If rate limited, wait for 5 seconds before retrying
|
32 |
+
print("Rate limited. Retrying in 5 seconds...")
|
33 |
+
time.sleep(3)
|
34 |
+
retries += 1
|
35 |
+
else:
|
36 |
+
# For other errors, return the response
|
37 |
+
return response
|
38 |
+
except Exception as e:
|
39 |
+
print("Error:", str(e))
|
40 |
+
retries += 1
|
41 |
+
|
42 |
+
# If maximum retries reached, return None
|
43 |
+
return None
|
44 |
+
|
45 |
+
if api_key:
|
46 |
+
|
47 |
+
overall_tic = time.perf_counter()
|
48 |
+
|
49 |
+
#filter_code_lsc = "LOGICAL_STATUS_CODE:1"
|
50 |
+
filter_code_lpi_lsc ="LPI_LOGICAL_STATUS_CODE:1"
|
51 |
+
concat_results = []
|
52 |
+
|
53 |
+
if query_type == "Address":
|
54 |
+
url = 'https://api.os.uk/search/places/v1/find?%s'
|
55 |
+
params = urllib.parse.urlencode({'query':query,
|
56 |
+
'dataset':'LPI',
|
57 |
+
'key':api_key,
|
58 |
+
"maxresults" : 20,
|
59 |
+
'minmatch':0.70, # This includes partial matches
|
60 |
+
'matchprecision':2,
|
61 |
+
'fq':filter_code_lpi_lsc,
|
62 |
+
'lr':'EN'})
|
63 |
+
|
64 |
+
try:
|
65 |
+
request_text = url % params
|
66 |
+
#print(request_text)
|
67 |
+
response = make_api_call(request_text)
|
68 |
+
except Exception as e:
|
69 |
+
print(str(e))
|
70 |
+
|
71 |
+
|
72 |
+
if response is not None:
|
73 |
+
if response.status_code == 200:
|
74 |
+
# Process the response
|
75 |
+
print("Successful response")
|
76 |
+
#print("Successful response:", response.json())
|
77 |
+
else:
|
78 |
+
print("Error:", response.status_code)
|
79 |
+
|
80 |
+
else:
|
81 |
+
print("Maximum retries reached. Error occurred.")
|
82 |
+
return pd.DataFrame() # Return blank dataframe
|
83 |
+
|
84 |
+
# Load JSON response
|
85 |
+
response_data = response.json()
|
86 |
+
|
87 |
+
# Extract 'results' part
|
88 |
+
try:
|
89 |
+
results = response_data['results']
|
90 |
+
concat_results.extend(results)
|
91 |
+
|
92 |
+
except Exception as e:
|
93 |
+
print(str(e))
            return pd.DataFrame() # Return blank dataframe

    # If querying postcode, need to use pagination and postcode API
    elif query_type == "Postcode":

        max_results_requested = 100
        remaining_calls = 1
        totalresults = max_results_requested
        call_number = 1

        while remaining_calls > 0 and call_number <= 10:

            offset = (call_number-1) * max_results_requested

            #print("Remaining to query:", remaining_calls)

            url = 'https://api.os.uk/search/places/v1/postcode?%s'
            params = urllib.parse.urlencode({'postcode':query,
                                             'dataset':'LPI',
                                             'key':api_key,
                                             "maxresults" : max_results_requested,
                                             'offset':offset,
                                             #'fq':filter_code_lsc,
                                             'fq':filter_code_lpi_lsc,
                                             'lr':'EN'})

            response = None # Initialise so a failed request on the first pass cannot raise a NameError below

            try:
                request_text = url % params
                #print(request_text)
                response = make_api_call(request_text)
            except Exception as e:
                print(str(e))

            if response is not None:
                if response.status_code == 200:
                    totalresults = response.json()['header']['totalresults']

                    print("Successful response")
                    print("Total results:", totalresults)

                    remaining_calls = totalresults - (max_results_requested * call_number)

                    call_number += 1

                    # Concat results together
                    try:
                        results = response.json()['results']
                        concat_results.extend(results)
                    except Exception as e:
                        print("Result concat failed with error: ", str(e))
                        concat_results.append({"invalid_request":True, "POSTCODE_LOCATOR": query})

                else:
                    print("Error:", response.status_code, "For postcode: ", query, " With query: ", request_text)
                    concat_results.append({"invalid_request":True, "POSTCODE_LOCATOR": query})
                    return pd.DataFrame(data={"invalid_request":[True], "POSTCODE_LOCATOR": [query]}, index=[0]) # Return blank dataframe
            else:
                print("Maximum retries reached. Error occurred.")
                return pd.DataFrame() # Return blank dataframe

    else:
        print("No API key provided.")
        return pd.DataFrame() # Return blank dataframe

    #print('RESPONSE:', concat_results)

    # Convert 'results' to DataFrame

    # Check if 'LPI' sub-branch exists in the JSON response
    #print(concat_results)

    if 'LPI' in concat_results[-1]:
        #print("LPI in result columns")
        df = pd.json_normalize(concat_results)
        df.rename(columns=lambda x: x.replace('LPI.', ''), inplace=True)
    else:
        # Normalize the entire JSON data if 'LPI' sub-branch doesn't exist
        df = pd.json_normalize(concat_results)

    # Ensure df is a DataFrame, even if it has a single row
    if isinstance(df, pd.Series):
        print("This is a series!")
        df = df.to_frame().T # Convert the Series to a DataFrame with a single row
    # if isinstance(df, pd.DataFrame):
    #     print("This is a dataframe!")
    # else:
    #     print("This is not a dataframe!")
    #     return pd.DataFrame() # Return blank dataframe

    print(df)
    #print(df.columns)
    #df.to_csv(query + ".csv")

    overall_toc = time.perf_counter()
    time_out = f"The API call took {overall_toc - overall_tic:0.1f} seconds"
    print(time_out)

    return df
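Note: the pagination above requests `maxresults` records per call at an offset of `(call_number - 1) * maxresults` until the reported `totalresults` is exhausted. A minimal standalone sketch of the same loop, assuming a valid OS Places API key and the `requests` library (the file's own `make_api_call` helper is not used here):

import urllib.parse
import requests

def fetch_postcode_pages(postcode: str, api_key: str, max_results: int = 100, max_calls: int = 10) -> list:
    """Illustrative only: page through the OS Places postcode endpoint as the loop above does."""
    results = []
    call_number, remaining = 1, 1
    while remaining > 0 and call_number <= max_calls:
        offset = (call_number - 1) * max_results
        params = urllib.parse.urlencode({'postcode': postcode, 'dataset': 'LPI', 'key': api_key,
                                         'maxresults': max_results, 'offset': offset, 'lr': 'EN'})
        response = requests.get('https://api.os.uk/search/places/v1/postcode?%s' % params)
        if response.status_code != 200:
            break
        body = response.json()
        results.extend(body.get('results', []))
        remaining = body['header']['totalresults'] - (max_results * call_number)
        call_number += 1
    return results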
tools/aws_functions.py
ADDED
@@ -0,0 +1,166 @@
from typing import Type
import pandas as pd
import boto3
import tempfile
import os

PandasDataFrame = Type[pd.DataFrame]

bucket_name = 'address-matcher-data'

try:
    session = boto3.Session(profile_name="default")
except Exception as e:
    print(e)

# sts = session.client("sts")
# Create a Session with the IAM role ARN
# aws_role = os.environ['AWS_ROLE_DATA_TEXT_SEARCH']
# response = sts.assume_role(
#     RoleArn=aws_role,
#     RoleSessionName="ecs-test-session"
# )
# print(response)

def get_assumed_role_info():
    sts = boto3.client('sts')
    response = sts.get_caller_identity()

    # Extract ARN of the assumed role
    assumed_role_arn = response['Arn']

    # Extract the name of the assumed role from the ARN
    assumed_role_name = assumed_role_arn.split('/')[-1]

    return assumed_role_arn, assumed_role_name

try:
    assumed_role_arn, assumed_role_name = get_assumed_role_info()

    print("Assumed Role ARN:", assumed_role_arn)
    print("Assumed Role Name:", assumed_role_name)
except Exception as e:
    print(e)

# Download direct from S3 - requires login credentials
def download_file_from_s3(bucket_name, key, local_file_path):

    s3 = boto3.client('s3')
    s3.download_file(bucket_name, key, local_file_path)
    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")

#download_file_from_s3(bucket_name, object_key, local_file_loc)

def download_folder_from_s3(bucket_name, s3_folder, local_folder):
    """
    Download all files from an S3 folder to a local folder.
    """
    s3 = boto3.client('s3')

    # List objects in the specified S3 folder
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)

    # Download each object
    for obj in response.get('Contents', []):
        # Extract object key and construct local file path
        object_key = obj['Key']
        local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))

        # Create directories if necessary
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

        # Download the object
        try:
            s3.download_file(bucket_name, object_key, local_file_path)
            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
        except Exception as e:
            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)


def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
    """
    Download specific files from an S3 folder to a local folder.
    """
    s3 = boto3.client('s3')

    print("Trying to download file: ", filenames)

    if filenames == '*':
        # List all objects in the S3 folder
        print("Trying to download all files in AWS folder: ", s3_folder)
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)

        print("Found files in AWS folder: ", response.get('Contents', []))

        filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]

        print("Found filenames in AWS folder: ", filenames)

    for filename in filenames:
        object_key = os.path.join(s3_folder, filename)
        local_file_path = os.path.join(local_folder, filename)

        # Create directories if necessary
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

        # Download the object
        try:
            s3.download_file(bucket_name, object_key, local_file_path)
            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
        except Exception as e:
            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)


def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):

    temp_dir = tempfile.mkdtemp()
    local_address_stub = temp_dir + '/address-match/'
    files = []

    if not 'LAMBETH_ADDRESS_PASSWORD' in os.environ:
        out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
        return files, out_message

    if aws_password:
        if "Lambeth address data example file" in in_aws_keyword_file and aws_password == os.environ['LAMBETH_ADDRESS_PASSWORD']:

            s3_folder_stub = 'example-data/lambeth-address-data/latest/'

            local_folder_path = local_address_stub

            # Check if folder exists
            if not os.path.exists(local_folder_path):
                print(f"Folder {local_folder_path} does not exist! Making folder.")

                os.mkdir(local_folder_path)

            # Check if folder is empty
            if len(os.listdir(local_folder_path)) == 0:
                print(f"Folder {local_folder_path} is empty")
                # Download data
                download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')

                print("AWS data downloaded")

            else:
                print(f"Folder {local_folder_path} is not empty")

            #files = os.listdir(local_folder_stub)
            #print(files)

            files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]

            out_message = "Data successfully loaded from AWS"
            print(out_message)

        else:
            out_message = "Data not loaded from AWS"
            print(out_message)
    else:
        out_message = "No password provided. Please ask the data team for access if you need this."
        print(out_message)

    return files, out_message
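Illustrative usage of the helpers above, assuming valid AWS credentials for the 'address-matcher-data' bucket and the LAMBETH_ADDRESS_PASSWORD environment variable; the file name "addresses.csv" is hypothetical:

import os
from tools.aws_functions import download_files_from_s3, load_data_from_aws

# Pull one named file from a bucket folder into a local directory (hypothetical file name)
download_files_from_s3("address-matcher-data", "example-data/lambeth-address-data/latest/",
                       "./data/", filenames=["addresses.csv"])

# Or let the app-level loader fetch the whole example dataset
files, message = load_data_from_aws("Lambeth address data example file",
                                    aws_password=os.environ.get("LAMBETH_ADDRESS_PASSWORD", ""))
print(message, files)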
tools/constants.py
ADDED
@@ -0,0 +1,435 @@
import os
import pandas as pd
import pickle
import torch
import zipfile
from typing import List, Union, Type, Dict
from pydantic import BaseModel

from .pytorch_models import *

PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]

# +
''' Fuzzywuzzy/Rapidfuzz scorer to use. Options are: ratio, partial_ratio, token_sort_ratio, partial_token_sort_ratio,
token_set_ratio, partial_token_set_ratio, QRatio, UQRatio, WRatio (default), UWRatio
details here: https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings'''

fuzzy_scorer_used = "token_set_ratio"

# +
fuzzy_match_limit = 85

fuzzy_search_addr_limit = 20

filter_to_lambeth_pcodes= True
# -

standardise = False

# +
if standardise == True:
    std = "_std"
if standardise == False:
    std = "_not_std"

dataset_name = "data" + std

suffix_used = dataset_name + "_" + fuzzy_scorer_used

# https://stackoverflow.com/questions/59221557/tensorflow-v2-replacement-for-tf-contrib-predictor-from-saved-model

ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))
print(ROOT_DIR)

# Uncomment these lines for the tensorflow model
#model_type = "tf"
#model_stub = "addr_model_out_lon"
#model_version = "00000001"
#file_step_suffix = "550" # I add a suffix to output files to be able to separate comparisons of test data from the same model with different steps e.g. '350' indicates a model that has been through 350,000 steps of training

# Uncomment these lines for the pytorch model
model_type = "lstm"
model_stub = "pytorch/lstm"
model_version = ""
file_step_suffix = ""
data_sample_size = 476887
N_EPOCHS = 10
max_predict_len = 12000

word_to_index = {}
cat_to_idx = {}
vocab = []
device = "cpu"

global labels_list
labels_list = []

model_dir_name = os.path.join(ROOT_DIR, "nnet_model" , model_stub , model_version)
print(model_dir_name)

model_path = os.path.join(model_dir_name, "saved_model.zip")
print("model path: ")
print(model_path)

if os.path.exists(model_path):

    os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # Better to go without GPU to avoid 'out of memory' issues
    device = "cpu"

    ## The labels_list object defines the structure of the prediction outputs. It must be the same as what the model was originally trained on

    ''' Load pre-trained model '''

    with zipfile.ZipFile(model_path,"r") as zip_ref:
        zip_ref.extractall(model_dir_name)

    # if model_stub == "addr_model_out_lon":

    #import tensorflow as tf

    #tf.config.list_physical_devices('GPU')

    # # Number of labels in total (+1 for the blank category)
    # n_labels = len(labels_list) + 1

    # # Allowable characters for the encoded representation
    # vocab = list(string.digits + string.ascii_lowercase + string.punctuation + string.whitespace)

    # #print("Loading TF model")

    # exported_model = tf.saved_model.load(model_dir_name)

    # labels_list = [
    #     'SaoText', # 1
    #     'SaoStartNumber', # 2
    #     'SaoStartSuffix', # 3
    #     'SaoEndNumber', # 4
    #     'SaoEndSuffix', # 5
    #     'PaoText', # 6
    #     'PaoStartNumber', # 7
    #     'PaoStartSuffix', # 8
    #     'PaoEndNumber', # 9
    #     'PaoEndSuffix', # 10
    #     'Street', # 11
    #     'PostTown', # 12
    #     'AdministrativeArea', #13
    #     'Postcode' # 14
    # ]

    if "pytorch" in model_stub:

        labels_list = [
            'SaoText', # 1
            'SaoStartNumber', # 2
            'SaoStartSuffix', # 3
            'SaoEndNumber', # 4
            'SaoEndSuffix', # 5
            'PaoText', # 6
            'PaoStartNumber', # 7
            'PaoStartSuffix', # 8
            'PaoEndNumber', # 9
            'PaoEndSuffix', # 10
            'Street', # 11
            'PostTown', # 12
            'AdministrativeArea', #13
            'Postcode', # 14
            'IGNORE'
        ]

    #labels_list.to_csv("labels_list.csv", index = None)

    if (model_type == "transformer") | (model_type == "gru") | (model_type == "lstm") :
        # Load vocab and word_to_index
        with open(model_dir_name + "vocab.txt", "r") as f:
            vocab = eval(f.read())
        with open(model_dir_name + "/word_to_index.txt", "r") as f:
            word_to_index = eval(f.read())
        with open(model_dir_name + "/cat_to_idx.txt", "r") as f:
            cat_to_idx = eval(f.read())

        VOCAB_SIZE = len(word_to_index)
        OUTPUT_DIM = len(cat_to_idx) + 1 # Number of classes/categories
        EMBEDDING_DIM = 48
        DROPOUT = 0.1
        PAD_TOKEN = 0

        if model_type == "transformer":
            NHEAD = 4
            NUM_ENCODER_LAYERS = 1

            exported_model = TransformerClassifier(VOCAB_SIZE, EMBEDDING_DIM, NHEAD, NUM_ENCODER_LAYERS, OUTPUT_DIM, DROPOUT, PAD_TOKEN)

        elif model_type == "gru":
            N_LAYERS = 3
            HIDDEN_DIM = 128
            exported_model = TextClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, PAD_TOKEN)

        elif model_type == "lstm":
            N_LAYERS = 3
            HIDDEN_DIM = 128

            exported_model = LSTMTextClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, PAD_TOKEN)

        exported_model.load_state_dict(torch.load(model_dir_name + "output_model_" + str(data_sample_size) +\
            "_" + str(N_EPOCHS) + "_" + model_type + ".pth", map_location=torch.device('cpu')))
        exported_model.eval()

        device='cpu'
        #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        exported_model.to(device)

    else:
        exported_model = [] #tf.keras.models.load_model(model_dir_name, compile=False)
        # Compile the model with a loss function and an optimizer
        #exported_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['categorical_crossentropy'])

else: exported_model = []

#if exported_model:
#    exported_model = exported_model
#else: exported_model = []

# +
# Address matcher will try to match <batch_size> records in one go to avoid exceeding memory limits.
batch_size = 10000
ref_batch_size = 150000

### Fuzzy match method

''' https://recordlinkage.readthedocs.io/en/latest/ref_df-compare.html#recordlinkage.compare.String
The Python Record Linkage Toolkit uses the jellyfish package for the Jaro, Jaro-Winkler, Levenshtein and Damerau-Levenshtein algorithms.
Options are ['jaro', 'jarowinkler', 'levenshtein', 'damerau_levenshtein', 'qgram', 'cosine', 'smith_waterman', 'lcs']

Comparison of some of the Jellyfish string comparison methods: https://manpages.debian.org/testing/python-jellyfish-doc/jellyfish.3.en.html '''

fuzzy_method = "jarowinkler"

# Required overall match score for all columns to count as a match
score_cut_off = 98.7 # 97.5
# I set a higher score cut off for nnet street blocking based on empirical data. Under this match value I was seeing errors. This value was (.99238), but set here to .995 to be maximally stringent. It is set in 'recordlinkage_funcs.py', score_based_match function
score_cut_off_nnet_street = 99.5 # 99.238
# If there are no numbers in the address, then the matcher needs to get a perfect score (otherwise too many issues).
no_number_fuzzy_match_limit = 100

# Reference data 'official' column names
ref_address_cols = ["Organisation", "SaoStartNumber", "SaoStartSuffix", "SaoEndNumber", "SaoEndSuffix",
                    "SaoText", "PaoStartNumber", "PaoStartSuffix", "PaoEndNumber",
                    "PaoEndSuffix", "PaoText", "Street", "PostTown", "Postcode"]

# Create a list of matching variables. Text columns will be fuzzy matched.
matching_variables = ref_address_cols
text_columns = ["Organisation", "PaoText", "Street", "PostTown", "Postcode"]

# Modify relative importance of columns (weights) for the recordlinkage part of the match. Modify weighting for scores - Town and AdministrativeArea are not very important as we have postcode. Street number and name are important
Organisation_weight = 0.1 # Organisation weight is very low just to resolve tie breakers for very similar addresses
PaoStartNumber_weight = 2
SaoStartNumber_weight = 2
Street_weight = 2
PostTown_weight = 0
Postcode_weight = 0.5
AdministrativeArea_weight = 0
# -

weight_vals = [1] * len(ref_address_cols)
weight_keys = ref_address_cols
weights = {weight_keys[i]: weight_vals[i] for i in range(len(weight_keys))}

# +
# Modify weighting for scores - Town and AdministrativeArea are not very important as we have postcode. Street number and name are important

weights["Organisation"] = Organisation_weight
weights["SaoStartNumber"] = SaoStartNumber_weight
weights["PaoStartNumber"] = PaoStartNumber_weight
weights["Street"] = Street_weight
weights["PostTown"] = PostTown_weight
weights["Postcode"] = Postcode_weight

# Creating Pydantic basemodel class

class MatcherClass(BaseModel):
    # Fuzzy/general attributes
    fuzzy_scorer_used: str
    fuzzy_match_limit: int
    fuzzy_search_addr_limit: int
    filter_to_lambeth_pcodes: bool
    standardise: bool
    suffix_used: str

    # Neural net attributes
    matching_variables: List[str]
    model_dir_name: str
    file_step_suffix: str
    exported_model: List

    fuzzy_method: str
    score_cut_off: float
    text_columns: List[str]
    weights: dict
    model_type: str
    labels_list: List[str]

    # These are variables that are added on later
    # Pytorch optional variables
    word_to_index: dict
    cat_to_idx: dict
    device: str
    vocab: List[str]

    # Join data
    file_name: str
    ref_name: str
    search_df: pd.DataFrame
    excluded_df: pd.DataFrame
    pre_filter_search_df: pd.DataFrame
    search_address_cols: List[str]
    search_postcode_col: List[str]
    search_df_key_field: str
    ref_df: pd.DataFrame
    ref_pre_filter: pd.DataFrame
    ref_address_cols: List[str]
    new_join_col: List[str]
    #in_joincol_list: List[str]
    existing_match_cols: List[str]
    standard_llpg_format: List[str]

    # Results attributes
    match_results_output: pd.DataFrame
    predict_df_nnet: pd.DataFrame

    # Other attributes generated during training
    compare_all_candidates: List[str]
    diag_shortlist: List[str]
    diag_best_match: List[str]

    results_on_orig_df: pd.DataFrame

    summary: str
    output_summary: str
    match_outputs_name: str
    results_orig_df_name: str

    search_df_after_stand: pd.DataFrame
    ref_df_after_stand: pd.DataFrame
    search_df_after_full_stand: pd.DataFrame
    ref_df_after_full_stand: pd.DataFrame

    search_df_after_stand_series: pd.Series
    ref_df_after_stand_series: pd.Series
    search_df_after_stand_series_full_stand: pd.Series
    ref_df_after_stand_series_full_stand: pd.Series

    # Abort flag if the matcher couldn't even get the results of the first match
    abort_flag: bool

    # This is to allow for Pandas DataFrame types as an argument
    class Config:
        # Allow for custom types such as Pandas DataFrames in the class
        arbitrary_types_allowed = True
        extra = 'allow'
        # Disable protected namespaces to avoid conflicts
        protected_namespaces = ()


# Creating an instance of MatcherClass
InitMatch = MatcherClass(

    # Fuzzy/general attributes
    fuzzy_scorer_used = fuzzy_scorer_used,
    fuzzy_match_limit = fuzzy_match_limit,
    fuzzy_search_addr_limit = fuzzy_search_addr_limit,
    filter_to_lambeth_pcodes = filter_to_lambeth_pcodes,
    standardise = standardise,
    suffix_used = suffix_used,

    # Neural net attributes
    matching_variables = matching_variables,
    model_dir_name = model_dir_name,
    file_step_suffix = file_step_suffix,

    exported_model = [exported_model],

    fuzzy_method = fuzzy_method,
    score_cut_off = score_cut_off,
    text_columns = text_columns,
    weights = weights,
    model_type = model_type,
    labels_list = labels_list,


    # These are variables that are added on later
    # Pytorch optional variables
    word_to_index = word_to_index,
    cat_to_idx = cat_to_idx,
    device = device,
    vocab = vocab,

    # Join data
    file_name = '',
    ref_name = '',
    df_name = '',
    search_df = pd.DataFrame(),
    excluded_df = pd.DataFrame(),
    pre_filter_search_df = pd.DataFrame(),
    search_df_not_matched = pd.DataFrame(),
    search_df_cleaned = pd.DataFrame(),
    search_address_cols = [],
    search_postcode_col = [],
    search_df_key_field = 'index',

    ref_df = pd.DataFrame(),
    ref_df_cleaned = pd.DataFrame(),
    ref_pre_filter = pd.DataFrame(),
    ref_address_cols = [],
    new_join_col = [],
    #in_joincol_list = [],
    existing_match_cols = [],
    standard_llpg_format = [],


    # Results attributes
    match_results_output = pd.DataFrame(),
    predict_df_nnet = pd.DataFrame(),

    # Other attributes generated during training
    compare_all_candidates = [],
    diag_shortlist = [],
    diag_best_match = [],

    results_on_orig_df = pd.DataFrame(),
    summary = "",
    output_summary = "",

    match_outputs_name = "",
    results_orig_df_name = "",

    # Post dataset preparation variables
    search_df_after_stand = pd.DataFrame(),
    ref_df_after_stand = pd.DataFrame(),
    search_df_after_stand_series = pd.Series(),
    ref_df_after_stand_series = pd.Series(),

    search_df_after_full_stand = pd.DataFrame(),
    ref_df_after_full_stand = pd.DataFrame(),
    search_df_after_stand_series_full_stand = pd.Series(),
    ref_df_after_stand_series_full_stand = pd.Series(),

    # Abort flag if the matcher couldn't even get the results of the first match
    abort_flag = False
)
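Worked restatement of the weight construction above: every reference column starts at weight 1, then the handful of overrides are applied, so the recordlinkage comparison weights number and street fields most heavily and town/administrative area not at all. This is a self-contained sketch of the same values, for illustration only:

ref_address_cols = ["Organisation", "SaoStartNumber", "SaoStartSuffix", "SaoEndNumber", "SaoEndSuffix",
                    "SaoText", "PaoStartNumber", "PaoStartSuffix", "PaoEndNumber",
                    "PaoEndSuffix", "PaoText", "Street", "PostTown", "Postcode"]
weights = {col: 1 for col in ref_address_cols}          # default weight of 1 per column
weights.update({"Organisation": 0.1, "SaoStartNumber": 2, "PaoStartNumber": 2,
                "Street": 2, "PostTown": 0, "Postcode": 0.5})
print(weights["Street"], weights["PostTown"])            # 2 0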
tools/fuzzy_match.py
ADDED
@@ -0,0 +1,437 @@
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Type
from datetime import datetime
from rapidfuzz import fuzz, process
import gradio as gr

PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str,Tuple[str,int]]
array = List[str]

today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")

from tools.constants import no_number_fuzzy_match_limit, fuzzy_match_limit

def string_match_array(to_match:array, choices:array,
                       index_name:str, matched_name:str) -> PandasDataFrame:

    temp = {name: process.extractOne(name,choices)
            for name in to_match}

    return _create_frame(matched_results=temp, index_name=index_name,
                         matched_name=matched_name)

# Fuzzy match algorithm
def create_fuzzy_matched_col(df:PandasDataFrame, orig_match_address_series:PandasSeries, pred_match_address_series:PandasSeries, fuzzy_method:"WRatio", match_score=95):

    results = []

    for orig_index, orig_string in df[orig_match_address_series].items():

        predict_string = df[pred_match_address_series][orig_index]

        if (orig_string == '') and (predict_string == ''):
            results.append(np.nan)

        else:
            fuzz_score = process.extract(orig_string, [predict_string], scorer= getattr(fuzz, fuzzy_method))
            results.append(fuzz_score[0][1])

    new_result_col_score = (orig_match_address_series + "_fuzz_score")
    new_result_col_match = (orig_match_address_series + "_fuzz_match")

    df[new_result_col_score] = results
    df[new_result_col_match] = df[new_result_col_score] >= match_score
    #df[new_result_col_match][df[new_result_col_score].isna()] = np.nan
    df.loc[df[new_result_col_score].isna(), new_result_col_match] = np.nan

    return df

def string_match_by_post_code_multiple(match_address_series:PandasSeries, reference_address_series:PandasSeries,
                                       search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress())-> MatchedResults:
    '''
    Matches by Series values; for example idx is post code and
    values address. Search field is reduced by comparing same post codes address reference_address_series.

    Default scorer is fuzz.Wratio. This tries to weight the different algorithms
    to give the best score.
    Choice of ratio type seems to make a big difference. Looking at this link:
    https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    and this one:
    https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings
    '''

    def do_one_match(reference_addresses: pd.Series, scorer: callable, search_limit: int, postcode_match: str, search_addresses: pd.Series) -> MatchedResults:

        def _prepare_results(search_addresses, reference_addresses, matched, postcode_match):

            # Create a list to store the results
            results = []

            # Iterate through the matched dataframe and store results in the list
            for i, search_address in enumerate(search_addresses):
                for j, reference_address in enumerate(reference_addresses):
                    score = matched[i][j]
                    results.append((postcode_match, search_address, reference_address, score))

            # Create a dataframe from the results list
            matched_out = pd.DataFrame(results, columns=['postcode_search', 'fuzzy_match_search_address', 'fuzzy_match_reference_address', 'fuzzy_score'])

            return matched_out

        try:
            if isinstance(reference_addresses, str): # reference_addresses can be a str -> 1 address per postcode
                matched = process.cdist(search_addresses.values, [reference_addresses], scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)

                # Transform results into a dataframe
                matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)

            else: # 1+ addresses
                matched = process.cdist(search_addresses.values, reference_addresses.values, scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)

                # Transform results into a dataframe
                matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)

            # Sort the matched results by score in descending order
            matched_out = matched_out.sort_values(by='fuzzy_score', ascending=False)

            # Keep only the top search_limit number of results - doesn't work anymore when working with multiple results
            #matched_out = matched_out.head(search_limit)

        except KeyError:
            matched_out = pd.DataFrame()

        return matched_out

    def apply_fuzzy_matching(postcode_match:str, search_addresses:PandasSeries, reference_addresses:PandasSeries, scorer:callable, search_limit:int)-> tuple:

        try:
            matched = do_one_match(reference_addresses, scorer, search_limit, postcode_match, search_addresses)
            return matched
        except KeyError:
            matched = pd.DataFrame() #[("NA", 0)] # for _ in range(1, search_limit + 1)]
            return matched

    print("Fuzzy match column length: ", len(match_address_series))
    print("Fuzzy Reference column length: ", len(reference_address_series))

    match_address_series = match_address_series.rename_axis('postcode_search')
    match_address_df = pd.DataFrame(match_address_series.reset_index())
    match_address_df['index'] = list(range(0,len(match_address_df)))

    reference_address_series = reference_address_series.rename_axis('postcode_search')
    reference_address_df = pd.DataFrame(reference_address_series.reset_index())
    reference_address_df['index'] = list(range(0,len(reference_address_df)))

    # Apply the match functions to each address
    scorer = getattr(fuzz, scorer_name)
    results = {}
    #counter = 0

    index_list = []
    match_list = []
    search_addresses_list = []
    reference_addresses_list = []

    unique_postcodes = pd.unique(match_address_df['postcode_search'])

    for postcode_match in progress.tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):

        postcode_match_list = [postcode_match]
        search_indexes = pd.Series()
        search_addresses = pd.Series()
        reference_addresses = pd.Series()

        try:
            search_indexes = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "index"]
            search_addresses = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "search_address_stand"]
            reference_addresses = reference_address_df.loc[reference_address_df["postcode_search"].isin(postcode_match_list), "ref_address_stand"]

            if isinstance(reference_addresses, str): # reference_addresses can be a str -> 1 address per postcode
                reference_addresses = pd.Series(reference_addresses)
        except KeyError:
            reference_addresses = pd.Series("NA")

        matched = apply_fuzzy_matching(postcode_match, search_addresses, reference_addresses, scorer, search_limit)

        # Write to output lists
        match_list.extend([matched])
        index_list.extend(search_indexes.tolist())
        search_addresses_list.extend(search_addresses.tolist())
        reference_addresses_list.extend(reference_addresses.tolist())

    out_frame = pd.concat(match_list)

    return out_frame

def _create_fuzzy_match_results_output(results, search_df_after_stand, ref_df_cleaned, ref_df_after_stand, fuzzy_match_limit, search_df_cleaned, search_df_key_field, new_join_col, standardise, blocker_col):

    ## Diagnostics

    diag_shortlist, diag_best_match = refine_export_results(results_df=results,\
        matched_df = search_df_after_stand, ref_list_df = ref_df_after_stand,
        fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)

    ## Fuzzy search results

    match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
                          'full_match',
                          'full_number_match',
                          'flat_number_match',
                          'room_number_match',
                          'block_number_match',
                          'unit_number_match',
                          'property_number_match',
                          'close_postcode_match',
                          'house_court_name_match',
                          'fuzzy_score_match',
                          "fuzzy_score",
                          "wratio_score",
                          'property_number_search', 'property_number_reference',
                          'flat_number_search', 'flat_number_reference',
                          'room_number_search', 'room_number_reference',
                          'unit_number_search', 'unit_number_reference',
                          'block_number_search', 'block_number_reference',
                          'house_court_name_search', 'house_court_name_reference',
                          "search_mod_address", 'reference_mod_address','Postcode']

    # Join results data onto the original housing list to create the full output
    search_df_cleaned_join_cols = [search_df_key_field, "full_address","postcode"]

    match_results_output = search_df_cleaned[search_df_cleaned_join_cols].merge(
        diag_best_match[match_results_cols], how = "left", left_on = "full_address", right_on = "search_orig_address")

    match_results_output = match_results_output.drop(["postcode", "search_orig_address"], axis = 1).rename(columns={"full_address":"search_orig_address"})

    # Join UPRN back onto the data from reference data
    joined_ref_cols = ["fulladdress", "Reference file"]
    joined_ref_cols.extend(new_join_col)

    match_results_output = pd.merge(match_results_output,ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"), how = "left", left_on = "reference_orig_address",right_on = "fulladdress").drop("fulladdress", axis = 1)

    # Convert long keys to string to avoid data loss
    match_results_output[search_df_key_field] = match_results_output[search_df_key_field].astype("str")
    match_results_output[new_join_col] = match_results_output[new_join_col].astype("string")
    match_results_output["standardised_address"] = standardise

    match_results_output = match_results_output.sort_values(search_df_key_field, ascending = True)

    return match_results_output, diag_shortlist, diag_best_match

def create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col, fuzzy_col="fuzzy_score", search_mod_address = "search_mod_address", resolve_tie_breaks=True, no_number_fuzzy_match_limit=no_number_fuzzy_match_limit):
    '''
    Create a shortlist of the best matches from a list of suggested matches
    '''

    ## Calculate highest fuzzy score from all candidates, keep all candidates with matching highest fuzzy score
    results_max_fuzzy_score = results_df.groupby(matched_col)[fuzzy_col].max().reset_index().rename(columns={fuzzy_col: "max_fuzzy_score"}).drop_duplicates(subset=matched_col)

    results_df = pd.merge(results_df, results_max_fuzzy_score, how = "left", on = matched_col)

    diag_shortlist = results_df[(results_df[fuzzy_col] == results_df["max_fuzzy_score"])]

    # Fuzzy match limit for records with no numbers in it is 0.95 or the provided fuzzy_match_limit, whichever is higher
    #diag_shortlist["fuzzy_score_match"] = diag_shortlist[fuzzy_col] >= fuzzy_match_limit
    diag_shortlist.loc[diag_shortlist[fuzzy_col] >= fuzzy_match_limit, "fuzzy_score_match"] = True

    ### Count number of numbers in search string
    # Using .loc
    diag_shortlist.loc[:, "number_count_search_string"] = diag_shortlist.loc[:, search_mod_address].str.count(r'\d')
    diag_shortlist.loc[:, "no_numbers_in_search_string"] = (diag_shortlist.loc[:, "number_count_search_string"] == 0)

    # Replace fuzzy_score_match values for addresses with no numbers in them
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] >= no_number_fuzzy_match_limit), "fuzzy_score_match"] = True
    diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] < no_number_fuzzy_match_limit), "fuzzy_score_match"] = False

    # If blocking on street, don't match addresses with 0 numbers in. There are too many options and the matches are rarely good
    if blocker_col == "Street":
        diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True), "fuzzy_score_match"] = False

    diag_shortlist = diag_shortlist.fillna("").infer_objects(copy=False).drop(["number_count_search_string", "no_numbers_in_search_string"], axis = 1)

    # Following considers full matches to be those that match on property number and flat number, and the postcode is relatively close.
    #print(diag_shortlist.columns)
    diag_shortlist["property_number_match"] = (diag_shortlist["property_number_search"] == diag_shortlist["property_number_reference"])
    diag_shortlist["flat_number_match"] = (diag_shortlist['flat_number_search'] == diag_shortlist['flat_number_reference'])
    diag_shortlist["room_number_match"] = (diag_shortlist['room_number_search'] == diag_shortlist['room_number_reference'])
    diag_shortlist["block_number_match"] = (diag_shortlist['block_number_search'] == diag_shortlist['block_number_reference'])
    diag_shortlist["unit_number_match"] = (diag_shortlist['unit_number_search'] == diag_shortlist['unit_number_reference'])
    diag_shortlist["house_court_name_match"] = (diag_shortlist['house_court_name_search'] == diag_shortlist['house_court_name_reference'])

    # Full number match is currently considered only a match between property number and flat number

    diag_shortlist['full_number_match'] = (diag_shortlist["property_number_match"] == True) &\
        (diag_shortlist["flat_number_match"] == True) &\
        (diag_shortlist["room_number_match"] == True) &\
        (diag_shortlist["block_number_match"] == True) &\
        (diag_shortlist["unit_number_match"] == True) &\
        (diag_shortlist["house_court_name_match"] == True)

    ### Postcodes need to be close together, so all the characters should match apart from the last two
    diag_shortlist['close_postcode_match'] = diag_shortlist['postcode'].str.lower().str.replace(" ","").str[:-2] == diag_shortlist['Postcode'].str.lower().str.replace(" ","").str[:-2]

    diag_shortlist["full_match"] = (diag_shortlist["fuzzy_score_match"] == True) &\
        (diag_shortlist['full_number_match'] == True) &\
        (diag_shortlist['close_postcode_match'] == True)

    diag_shortlist = diag_shortlist.rename(columns = {"reference_list_address":"reference_mod_address"})

    ### Dealing with tie breaks ##
    # Do a backup simple Wratio search on the open text to act as a tie breaker when the fuzzy scores are identical
    # fuzz.WRatio
    if resolve_tie_breaks == True:
        def compare_strings_wratio(row, scorer = fuzz.ratio, fuzzy_col = fuzzy_col):
            search_score = process.cdist([row[search_mod_address]], [row["reference_mod_address"]], scorer=scorer)
            return search_score[0][0]

        diag_shortlist_dups = diag_shortlist[diag_shortlist['full_number_match'] == True]
        diag_shortlist_dups = diag_shortlist_dups.loc[diag_shortlist_dups.duplicated(subset= [search_mod_address, 'full_number_match', "room_number_search", fuzzy_col], keep=False)]

        if not diag_shortlist_dups.empty:
            diag_shortlist_dups["wratio_score"] = diag_shortlist_dups.apply(compare_strings_wratio, axis=1)

            diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how = "left")

    if 'wratio_score' not in diag_shortlist.columns:
        diag_shortlist['wratio_score'] = ''

    # Order by best score
    diag_shortlist = diag_shortlist.sort_values([
        search_mod_address, 'full_match', 'full_number_match', fuzzy_col, "wratio_score"],
        ascending = [True, False, False, False, False])

    return diag_shortlist

def refine_export_results(results_df:PandasDataFrame,
                          matched_df:PandasDataFrame,
                          ref_list_df:PandasDataFrame,
                          matched_col="fuzzy_match_search_address",
                          ref_list_col="fuzzy_match_reference_address",
                          final_matched_address_col="search_address_stand",
                          final_ref_address_col="ref_address_stand",
                          orig_matched_address_col = "full_address",
                          orig_ref_address_col = "fulladdress",
                          fuzzy_match_limit=fuzzy_match_limit,
                          blocker_col="Postcode") -> PandasDataFrame:
    '''
    This function takes a result file from the fuzzy search, then refines the 'matched results' according to
    the score limit specified by the user and exports results list, matched and unmatched files.
    '''

    # Rename score column
    results_df = results_df.rename(columns = {"score":"fuzzy_score"})

    # Remove empty addresses
    results_df = results_df[results_df[matched_col] !=0 ]

    ### Join property number and flat/room number etc. onto results_df
    ref_list_df["ref_index"] = ref_list_df.index
    ref_join_cols = ["ref_index", final_ref_address_col, "property_number","flat_number","room_number","block_number", "unit_number", 'house_court_name', orig_ref_address_col,"Postcode"]
    ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col:'reference_list_address'})

    results_df = results_df.merge(ref_list_df, how = "left", left_on = ref_list_col, right_on = "reference_list_address")

    ### Join on relevant details from the standardised match dataframe
    matched_df_cols = [final_matched_address_col,"property_number","flat_number","room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
    matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col:"search_orig_address",final_matched_address_col:'search_mod_address'})

    results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))

    # Choose your best matches from the list of options
    diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)

    ### Create matched results output ###
    # Columns for the output match_results file in order
    match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
                          'full_match',
                          'full_number_match',
                          'flat_number_match',
                          'room_number_match',
                          'block_number_match',
                          'unit_number_match',
                          'house_court_name_match',
                          'property_number_match',
                          'close_postcode_match',
                          'fuzzy_score_match',
                          "fuzzy_score",
                          "wratio_score",
                          'property_number_search', 'property_number_reference',
                          'flat_number_search', 'flat_number_reference',
                          'room_number_search', 'room_number_reference',
                          'block_number_search', 'block_number_reference',
                          'unit_number_search', 'unit_number_reference',
                          'house_court_name_search', 'house_court_name_reference',
                          "search_mod_address", 'reference_mod_address', 'postcode','Postcode']

    diag_shortlist = diag_shortlist[match_results_cols]

    # Choose best match from the shortlist that has been ordered according to score descending
    diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")

    return diag_shortlist, diag_best_match

def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFrame, search_df_key_field:str, new_join_col:List[str]) -> PandasDataFrame:
    '''
    Following the fuzzy match, join the match results back to the original search dataframe to create a results dataframe.
    '''
    match_results_output_success = match_results_output[match_results_output["full_match"]==True]

    # If you're joining to the original df on index you will need to recreate the index again

    match_results_output_success = match_results_output_success.rename(columns={
        "reference_orig_address":"Reference matched address",
        "full_match":"Matched with reference address",
        'uprn':'UPRN'
    })

    ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
    ref_df_after_stand_cols.extend(new_join_col)

    if (search_df_key_field == "index"):
        # Check index is int
        print("Search df key field is index")
        #match_results_output_success[search_df_key_field] = match_results_output_success[search_df_key_field].astype(float).astype(int)
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols], on = search_df_key_field, how = "left", suffixes = ('', '_y'))
    else:
        results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols],how = "left", on = search_df_key_field, suffixes = ('', '_y'))

    # If the join columns already exist in the search_df, then use the new column to fill in the NAs in the original column, then delete the new column

    if "Reference matched address_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Reference matched address'] = results_for_orig_df_join['Reference matched address'].fillna(results_for_orig_df_join['Reference matched address_y']).infer_objects(copy=False)

    if "Matched with reference address_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))

        #results_for_orig_df_join['Matched with reference address'] = results_for_orig_df_join['Matched with reference address'].fillna(results_for_orig_df_join['Matched with reference address_y']).infer_objects(copy=False)

    if "Reference file_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)

    if "UPRN_y" in results_for_orig_df_join.columns:
        results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)

    # Drop columns that aren't useful
    results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y', 'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y', "full_address_search","postcode_search", "full_address_1", "full_address_2", "full_address",
        "address_stand", "property_number", "prop_number", "flat_number", "apart_number", "first_sec_number", "room_number"], axis = 1, errors = "ignore") # Commas added between the last column names, which were previously concatenated into a single string literal

    # Replace blanks with NA, fix UPRNs
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)

    results_for_orig_df_join[new_join_col] = results_for_orig_df_join[new_join_col].astype(str).replace(".0","", regex=False).replace("nan","", regex=False)

    # Replace cells with only 'nan' with blank
    results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)

    return results_for_orig_df_join
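Illustrative use of create_fuzzy_matched_col from this file on a toy frame (intended to be run inside this repo, since importing the module also loads tools.constants); the column names and addresses here are made up for the example:

import pandas as pd
from tools.fuzzy_match import create_fuzzy_matched_col

toy = pd.DataFrame({
    "search_address": ["10 downing street london", "5 acacia avenue"],
    "predicted_address": ["10 downing st london", "7 acacia avenue"],
})
# Scores each row pair and adds "<col>_fuzz_score" and "<col>_fuzz_match" columns
scored = create_fuzzy_matched_col(toy, "search_address", "predicted_address",
                                  fuzzy_method="token_set_ratio", match_score=95)
print(scored[["search_address_fuzz_score", "search_address_fuzz_match"]])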
tools/gradio.py
ADDED
@@ -0,0 +1,63 @@
import gradio as gr
import pandas as pd

def detect_file_type(filename):
    """Detect the file type based on its extension."""
    if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
        return 'csv'
    elif filename.endswith('.xlsx'):
        return 'xlsx'
    elif filename.endswith('.parquet'):
        return 'parquet'
    else:
        raise ValueError("Unsupported file type.")

def read_file(filename):
    """Read the file based on its detected type."""
    file_type = detect_file_type(filename)

    if file_type == 'csv':
        return pd.read_csv(filename, low_memory=False)
    elif file_type == 'xlsx':
        return pd.read_excel(filename)
    elif file_type == 'parquet':
        return pd.read_parquet(filename)


def initial_data_load(in_file):
    new_choices = []
    concat_choices = []
    output_message = ""
    results_df = pd.DataFrame()
    df = pd.DataFrame()

    file_list = [string.name for string in in_file]

    data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
    if data_file_names:
        df = read_file(data_file_names[0])
    else:
        error_message = "No data file found."
        return error_message, gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, results_df

    results_file_names = [string for string in file_list if "results_on_orig" in string.lower()]
    if results_file_names:
        results_df = read_file(results_file_names[0])

    new_choices = list(df.columns)
    concat_choices.extend(new_choices)

    output_message = "Data successfully loaded"

    return output_message, gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, results_df


def dummy_function(in_colnames):
    """
    A dummy function that exists just so that dropdown updates work correctly.
    """
    return None


def clear_inputs(in_file, in_ref, in_text):
    return gr.File.update(value=[]), gr.File.update(value=[]), gr.Textbox.update(value='')
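Quick sketch of the file-type helpers above; the file name is hypothetical and must exist locally for read_file to succeed:

from tools.gradio import detect_file_type, read_file

print(detect_file_type("addresses.parquet"))  # 'parquet'
df = read_file("addresses.parquet")           # dispatches to pd.read_parquet
print(df.shape)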
tools/matcher_funcs.py
ADDED
@@ -0,0 +1,1300 @@
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from typing import Dict, List, Tuple, Type
|
6 |
+
import time
|
7 |
+
import re
|
8 |
+
import math
|
9 |
+
from datetime import datetime
|
10 |
+
import copy
|
11 |
+
import gradio as gr
|
12 |
+
|
13 |
+
PandasDataFrame = Type[pd.DataFrame]
|
14 |
+
PandasSeries = Type[pd.Series]
|
15 |
+
MatchedResults = Dict[str,Tuple[str,int]]
|
16 |
+
array = List[str]
|
17 |
+
|
18 |
+
today = datetime.now().strftime("%d%m%Y")
|
19 |
+
today_rev = datetime.now().strftime("%Y%m%d")
|
20 |
+
today_month_rev = datetime.now().strftime("%Y%m")
|
21 |
+
|
22 |
+
# Constants
|
23 |
+
run_fuzzy_match = True
|
24 |
+
run_nnet_match = True
|
25 |
+
run_standardise = True
|
26 |
+
|
27 |
+
from tools.preparation import prepare_search_address_string, prepare_search_address, prepare_ref_address, check_no_number_addresses, extract_street_name, remove_non_postal
|
28 |
+
from tools.standardise import standardise_wrapper_func
|
29 |
+
from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, join_to_orig_df
|
30 |
+
|
31 |
+
# Neural network functions
|
32 |
+
### Predict function for imported model
|
33 |
+
from tools.model_predict import full_predict_func, full_predict_torch, post_predict_clean
|
34 |
+
from tools.recordlinkage_funcs import score_based_match, check_matches_against_fuzzy
|
35 |
+
from tools.gradio import initial_data_load
|
36 |
+
|
37 |
+
# API functions
|
38 |
+
from tools.addressbase_api_funcs import places_api_query
|
39 |
+
|
40 |
+
# Maximum number of neural net predictions in a single batch
|
41 |
+
from tools.constants import max_predict_len, MatcherClass
|
42 |
+
|
43 |
+
|
44 |
+
# Load in data functions
|
45 |
+
|
46 |
+
def detect_file_type(filename):
|
47 |
+
"""Detect the file type based on its extension."""
|
48 |
+
if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
|
49 |
+
return 'csv'
|
50 |
+
elif filename.endswith('.xlsx'):
|
51 |
+
return 'xlsx'
|
52 |
+
elif filename.endswith('.parquet'):
|
53 |
+
return 'parquet'
|
54 |
+
else:
|
55 |
+
raise ValueError("Unsupported file type.")
|
56 |
+
|
57 |
+
def read_file(filename):
|
58 |
+
"""Read the file based on its detected type."""
|
59 |
+
file_type = detect_file_type(filename)
|
60 |
+
|
61 |
+
if file_type == 'csv':
|
62 |
+
return pd.read_csv(filename, low_memory=False)
|
63 |
+
elif file_type == 'xlsx':
|
64 |
+
return pd.read_excel(filename)
|
65 |
+
elif file_type == 'parquet':
|
66 |
+
return pd.read_parquet(filename)
|
67 |
+
|
68 |
+
def get_file_name(in_name):
|
69 |
+
# Extract the file name after the last forward slash or backslash
|
70 |
+
match = re.search(r'[\\/](?!.*[\\/])(.*)', in_name)
|
71 |
+
if match:
|
72 |
+
matched_result = match.group(1)
|
73 |
+
else:
|
74 |
+
matched_result = None
|
75 |
+
|
76 |
+
return matched_result
|
77 |
+
|
78 |
+
def filter_not_matched(
|
79 |
+
matched_results: pd.DataFrame,
|
80 |
+
search_df: pd.DataFrame,
|
81 |
+
key_col: str
|
82 |
+
) -> pd.DataFrame:
|
83 |
+
"""Filters search_df to only rows with key_col not in matched_results"""
|
84 |
+
|
85 |
+
# Validate inputs
|
86 |
+
if not isinstance(matched_results, pd.DataFrame):
|
87 |
+
raise TypeError("not_matched_results must be a Pandas DataFrame")
|
88 |
+
|
89 |
+
if not isinstance(search_df, pd.DataFrame):
|
90 |
+
raise TypeError("search_df must be a Pandas DataFrame")
|
91 |
+
|
92 |
+
if not isinstance(key_col, str):
|
93 |
+
raise TypeError("key_col must be a string")
|
94 |
+
|
95 |
+
if key_col not in matched_results.columns:
|
96 |
+
raise ValueError(f"{key_col} not a column in matched_results")
|
97 |
+
|
98 |
+
matched_results_success = matched_results[matched_results["full_match"]==True]
|
99 |
+
|
100 |
+
# Filter search_df
|
101 |
+
#print(search_df.columns)
|
102 |
+
#print(key_col)
|
103 |
+
|
104 |
+
matched = search_df[key_col].astype(str).isin(matched_results_success[key_col].astype(str))#.drop(['level_0','index'], axis = 1, errors="ignore").reset_index() #
|
105 |
+
|
106 |
+
return search_df.iloc[np.where(~matched)[0]] # search_df[~matched]
|
107 |
+
|
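To illustrate what filter_not_matched does, here is a small sketch with toy data, assuming the function is importable from tools.matcher_funcs: only rows whose key is absent from the fully matched results are kept.

import pandas as pd
from tools.matcher_funcs import filter_not_matched  # assumed import path

matched_results = pd.DataFrame({"index": [0, 1, 2], "full_match": [True, False, True]})
search_df = pd.DataFrame({"index": [0, 1, 2, 3], "address": ["a", "b", "c", "d"]})

remaining = filter_not_matched(matched_results, search_df, key_col="index")
print(remaining["index"].tolist())  # -> [1, 3], the rows not yet fully matched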
108 |
+
def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress()):
|
109 |
+
if in_api_key == "":
|
110 |
+
print ("No API key provided, please provide one to continue")
|
111 |
+
return Matcher
|
112 |
+
else:
|
113 |
+
# Call the API
|
114 |
+
#Matcher.ref_df = pd.DataFrame()
|
115 |
+
|
116 |
+
# Check if the ref_df file already exists
|
117 |
+
def check_and_create_api_folder():
|
118 |
+
# Check if the environmental variable is available
|
119 |
+
file_path = os.environ.get('ADDRESSBASE_API_OUT') # Folder path for saved API output, if set
|
120 |
+
|
121 |
+
if file_path is None:
|
122 |
+
# Environmental variable is not set
|
123 |
+
print("API output environmental variable not set.")
|
124 |
+
# Create the 'api/' folder if it doesn't already exist
|
125 |
+
api_folder_path = 'api/'
|
126 |
+
if not os.path.exists(api_folder_path):
|
127 |
+
os.makedirs(api_folder_path)
|
128 |
+
print(f"'{api_folder_path}' folder created.")
|
129 |
+
else:
|
130 |
+
# Environmental variable is set
|
131 |
+
api_folder_path = file_path
|
132 |
+
print(f"Environmental variable found: {api_folder_path}")
|
133 |
+
|
134 |
+
return api_folder_path
|
135 |
+
|
136 |
+
api_output_folder = check_and_create_api_folder()
|
137 |
+
|
138 |
+
# Check if the file exists
|
139 |
+
print("Matcher file name: ", Matcher.file_name)
|
140 |
+
search_file_name_without_extension = re.sub(r'\.[^.]+$', '', Matcher.file_name)
|
141 |
+
#print("Search file name without extension: ", search_file_name_without_extension)
|
142 |
+
api_ref_save_loc = api_output_folder + search_file_name_without_extension + "_api_" + today_month_rev + "_" + query_type + "_ckpt"
|
143 |
+
print("API reference save location: ", api_ref_save_loc)
|
144 |
+
|
145 |
+
# Allow for csv, parquet and gzipped csv files
|
146 |
+
if os.path.isfile(api_ref_save_loc + ".csv"):
|
147 |
+
print("API reference CSV file found")
|
148 |
+
Matcher.ref_df = pd.read_csv(api_ref_save_loc + ".csv")
|
149 |
+
elif os.path.isfile(api_ref_save_loc + ".parquet"):
|
150 |
+
print("API reference Parquet file found")
|
151 |
+
Matcher.ref_df = pd.read_parquet(api_ref_save_loc + ".parquet")
|
152 |
+
elif os.path.isfile(api_ref_save_loc + ".csv.gz"):
|
153 |
+
print("API reference gzipped CSV file found")
|
154 |
+
Matcher.ref_df = pd.read_csv(api_ref_save_loc + ".csv.gz", compression='gzip')
|
155 |
+
else:
|
156 |
+
print("API reference file not found, querying API for reference data.")
|
157 |
+
|
158 |
+
|
159 |
+
def conduct_api_loop(in_query, in_api_key, query_type, i, api_ref_save_loc, loop_list, api_search_index):
|
160 |
+
ref_addresses = places_api_query(in_query, in_api_key, query_type)
|
161 |
+
|
162 |
+
ref_addresses['Address_row_number'] = api_search_index[i]
|
163 |
+
|
164 |
+
loop_list.append(ref_addresses)
|
165 |
+
|
166 |
+
if (i + 1) % 500 == 0:
|
167 |
+
print("Saving api call checkpoint for query:", str(i + 1))
|
168 |
+
|
169 |
+
pd.concat(loop_list).to_parquet(api_ref_save_loc + ".parquet", index=False)
|
170 |
+
|
171 |
+
return loop_list
|
172 |
+
|
173 |
+
def check_postcode(postcode):
|
174 |
+
# Remove spaces on the ends or in the middle of the postcode, and any symbols
|
175 |
+
cleaned_postcode = re.sub(r'[^\w\s]|[\s]', '', postcode)
|
176 |
+
# Ensure that the postcode meets the specified format
|
177 |
+
postcode_pattern = r'\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?[0-9][A-Z]{2}|GIR0AA|GIR0A{2}|[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?[0-9]{1}?)\b'
|
178 |
+
match = re.match(postcode_pattern, cleaned_postcode)
|
179 |
+
if match and len(cleaned_postcode) in (5, 6, 7):
|
180 |
+
return cleaned_postcode # Return the matched postcode string
|
181 |
+
else:
|
182 |
+
return None # Return None if no match is found
|
183 |
+
|
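# Example behaviour of check_postcode (illustrative): check_postcode("SW1A 1AA") strips the
# space to give "SW1A1AA", which matches the pattern and has 7 characters, so it is returned;
# check_postcode("not a postcode") cleans to a string that fails the pattern, so None is returned.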
184 |
+
if query_type == "Address":
|
185 |
+
save_file = True
|
186 |
+
# Do an API call for each unique address
|
187 |
+
|
188 |
+
if not Matcher.ref_df.empty:
|
189 |
+
api_search_df = Matcher.search_df.copy().drop(list(set(Matcher.ref_df["Address_row_number"])))
|
190 |
+
|
191 |
+
else:
|
192 |
+
print("Matcher ref_df data empty")
|
193 |
+
api_search_df = Matcher.search_df.copy()
|
194 |
+
|
195 |
+
i = 0
|
196 |
+
loop_df = Matcher.ref_df
|
197 |
+
loop_list = [Matcher.ref_df]
|
198 |
+
|
199 |
+
for address in progress.tqdm(api_search_df['full_address_postcode'], desc= "Making API calls", unit="addresses", total=len(api_search_df['full_address_postcode'])):
|
200 |
+
print("Query number: " + str(i+1), "with address: ", address)
|
201 |
+
|
202 |
+
api_search_index = api_search_df.index
|
203 |
+
|
204 |
+
loop_list = conduct_api_loop(address, in_api_key, query_type, i, api_ref_save_loc, loop_list, api_search_index)
|
205 |
+
|
206 |
+
i += 1
|
207 |
+
|
208 |
+
loop_df = pd.concat(loop_list)
|
209 |
+
Matcher.ref_df = loop_df.drop_duplicates(keep='first', ignore_index=True)
|
210 |
+
|
211 |
+
|
212 |
+
elif query_type == "Postcode":
|
213 |
+
save_file = True
|
214 |
+
# Do an API call for each unique postcode. Each API call can only return 100 results maximum :/
|
215 |
+
|
216 |
+
if not Matcher.ref_df.empty:
|
217 |
+
print("Excluding postcodes that already exist in API call data.")
|
218 |
+
|
219 |
+
# Retain original index values after filtering
|
220 |
+
Matcher.search_df["index_keep"] = Matcher.search_df.index
|
221 |
+
|
222 |
+
if 'invalid_request' in Matcher.ref_df.columns and 'Address_row_number' in Matcher.ref_df.columns:
|
223 |
+
print("Joining on invalid_request column")
|
224 |
+
Matcher.search_df = Matcher.search_df.merge(Matcher.ref_df[['Address_row_number', 'invalid_request']].drop_duplicates(subset="Address_row_number"), left_on = Matcher.search_df_key_field, right_on='Address_row_number', how='left')
|
225 |
+
|
226 |
+
elif not 'invalid_request' in Matcher.search_df.columns:
|
227 |
+
Matcher.search_df['invalid_request'] = False
|
228 |
+
|
229 |
+
postcode_col = Matcher.search_postcode_col[0]
|
230 |
+
|
231 |
+
# Check ref_df df against cleaned and non-cleaned postcodes
|
232 |
+
Matcher.search_df[postcode_col] = Matcher.search_df[postcode_col].astype(str)
|
233 |
+
search_df_cleaned_pcodes = Matcher.search_df[postcode_col].apply(check_postcode)
|
234 |
+
ref_df_cleaned_pcodes = Matcher.ref_df['POSTCODE_LOCATOR'].dropna().apply(check_postcode)
|
235 |
+
|
236 |
+
api_search_df = Matcher.search_df.copy().loc[
|
237 |
+
~Matcher.search_df[postcode_col].isin(Matcher.ref_df['POSTCODE_LOCATOR']) &
|
238 |
+
~(Matcher.search_df['invalid_request']==True) &
|
239 |
+
~(search_df_cleaned_pcodes.isin(ref_df_cleaned_pcodes)), :]
|
240 |
+
|
241 |
+
#api_search_index = api_search_df["index_keep"]
|
242 |
+
#api_search_df.index = api_search_index
|
243 |
+
|
244 |
+
print("Remaining invalid request count: ", Matcher.search_df['invalid_request'].value_counts())
|
245 |
+
|
246 |
+
else:
|
247 |
+
print("Matcher ref_df data empty")
|
248 |
+
api_search_df = Matcher.search_df.copy()
|
249 |
+
api_search_index = api_search_df.index
|
250 |
+
api_search_df['index_keep'] = api_search_index
|
251 |
+
|
252 |
+
postcode_col = Matcher.search_postcode_col[0]
|
253 |
+
|
254 |
+
unique_pcodes = api_search_df.loc[:, ["index_keep", postcode_col]].drop_duplicates(subset=[postcode_col], keep='first')
|
255 |
+
print("Unique postcodes: ", unique_pcodes[postcode_col])
|
256 |
+
|
257 |
+
# Apply the function to each postcode in the Series
|
258 |
+
unique_pcodes["cleaned_unique_postcodes"] = unique_pcodes[postcode_col].apply(check_postcode)
|
259 |
+
|
260 |
+
# Filter out the postcodes that comply with the specified format
|
261 |
+
valid_unique_postcodes = unique_pcodes.dropna(subset=["cleaned_unique_postcodes"])
|
262 |
+
|
263 |
+
valid_postcode_search_index = valid_unique_postcodes['index_keep']
|
264 |
+
valid_postcode_search_index_list = valid_postcode_search_index.tolist()
|
265 |
+
|
266 |
+
if not valid_unique_postcodes.empty:
|
267 |
+
|
268 |
+
print("Unique valid postcodes: ", valid_unique_postcodes)
|
269 |
+
print("Number of unique valid postcodes: ", len(valid_unique_postcodes))
|
270 |
+
|
271 |
+
tic = time.perf_counter()
|
272 |
+
|
273 |
+
i = 0
|
274 |
+
loop_df = Matcher.ref_df
|
275 |
+
loop_list = [Matcher.ref_df]
|
276 |
+
|
277 |
+
for pcode in progress.tqdm(valid_unique_postcodes["cleaned_unique_postcodes"], desc= "Making API calls", unit="unique postcodes", total=len(valid_unique_postcodes["cleaned_unique_postcodes"])):
|
278 |
+
#api_search_index = api_search_df.index
|
279 |
+
|
280 |
+
print("Query number: " + str(i+1), " with postcode: ", pcode, " and index: ", valid_postcode_search_index_list[i])
|
281 |
+
|
282 |
+
loop_list = conduct_api_loop(pcode, in_api_key, query_type, i, api_ref_save_loc, loop_list, valid_postcode_search_index_list)
|
283 |
+
|
284 |
+
i += 1
|
285 |
+
|
286 |
+
loop_df = pd.concat(loop_list)
|
287 |
+
Matcher.ref_df = loop_df.drop_duplicates(keep='first', ignore_index=True)
|
288 |
+
|
289 |
+
toc = time.perf_counter()
|
290 |
+
print("API call time in seconds: ", toc-tic)
|
291 |
+
else:
|
292 |
+
print("No valid postcodes found.")
|
293 |
+
|
294 |
+
elif query_type == "UPRN":
|
295 |
+
save_file = True
|
296 |
+
# Do an API call for each unique address
|
297 |
+
|
298 |
+
if not Matcher.ref_df.empty:
|
299 |
+
api_search_df = Matcher.search_df.copy().drop(list(set(Matcher.ref_df["Address_row_number"])))
|
300 |
+
|
301 |
+
else:
|
302 |
+
print("Matcher ref_df data empty")
|
303 |
+
api_search_df = Matcher.search_df.copy()
|
304 |
+
|
305 |
+
i = 0
|
306 |
+
loop_df = Matcher.ref_df
|
307 |
+
loop_list = [Matcher.ref_df]
|
308 |
+
uprn_check_col = 'ADR_UPRN'
|
309 |
+
|
310 |
+
for uprn in progress.tqdm(api_search_df[uprn_check_col], desc= "Making API calls", unit="UPRNs", total=len(api_search_df[uprn_check_col])):
|
311 |
+
print("Query number: " + str(i+1), "with address: ", uprn)
|
312 |
+
|
313 |
+
api_search_index = api_search_df.index
|
314 |
+
|
315 |
+
loop_list = conduct_api_loop(uprn, in_api_key, query_type, i, api_ref_save_loc, loop_list, api_search_index)
|
316 |
+
|
317 |
+
i += 1
|
318 |
+
|
319 |
+
loop_df = pd.concat(loop_list)
|
320 |
+
Matcher.ref_df = loop_df.drop_duplicates(keep='first', ignore_index=True)
|
321 |
+
|
322 |
+
else:
|
323 |
+
print("Reference file loaded from file, no API calls made.")
|
324 |
+
save_file = False
|
325 |
+
|
326 |
+
# Post API call processing
|
327 |
+
|
328 |
+
Matcher.ref_name = "API"
|
329 |
+
#Matcher.ref_df = Matcher.ref_df.reset_index(drop=True)
|
330 |
+
Matcher.ref_df['Reference file'] = Matcher.ref_name
|
331 |
+
|
332 |
+
if query_type == "Postcode":
|
333 |
+
#print(Matcher.ref_df.columns)
|
334 |
+
|
335 |
+
cols_of_interest = ["ADDRESS", "ORGANISATION", "SAO_TEXT", "SAO_START_NUMBER", "SAO_START_SUFFIX", "SAO_END_NUMBER", "SAO_END_SUFFIX", "PAO_TEXT", "PAO_START_NUMBER", "PAO_START_SUFFIX", "PAO_END_NUMBER", "PAO_END_SUFFIX", "STREET_DESCRIPTION", "TOWN_NAME" ,"ADMINISTRATIVE_AREA", "LOCALITY_NAME", "POSTCODE_LOCATOR", "UPRN", "PARENT_UPRN", "USRN", "LPI_KEY", "RPC", "LOGICAL_STATUS_CODE", "CLASSIFICATION_CODE", "LOCAL_CUSTODIAN_CODE", "COUNTRY_CODE", "POSTAL_ADDRESS_CODE", "BLPU_STATE_CODE", "LAST_UPDATE_DATE", "ENTRY_DATE", "STREET_STATE_CODE", "STREET_CLASSIFICATION_CODE", "LPI_LOGICAL_STATUS_CODE", "invalid_request", "Address_row_number", "Reference file"]
|
336 |
+
|
337 |
+
try:
|
338 |
+
# Attempt to select only the columns of interest
|
339 |
+
Matcher.ref_df = Matcher.ref_df[cols_of_interest]
|
340 |
+
except KeyError:
|
341 |
+
missing_columns = [col for col in cols_of_interest if col not in Matcher.ref_df.columns]
|
342 |
+
# Handle the missing columns gracefully
|
343 |
+
print(f"Some columns are missing: {missing_columns}")
|
344 |
+
|
345 |
+
#if "LOCAL_CUSTODIAN_CODE" in Matcher.ref_df.columns:
|
346 |
+
# These are items that are 'owned' by Ordnance Survey like telephone boxes, bus shelters
|
347 |
+
# Matcher.ref_df = Matcher.ref_df.loc[Matcher.ref_df["LOCAL_CUSTODIAN_CODE"] != 7655,:]
|
348 |
+
|
349 |
+
if save_file:
|
350 |
+
print("Saving reference file to: " + api_ref_save_loc[:-5] + ".parquet")
|
351 |
+
Matcher.ref_df.to_parquet(api_ref_save_loc + ".parquet", index=False) # Save checkpoint as well
|
352 |
+
Matcher.ref_df.to_parquet(api_ref_save_loc[:-5] + ".parquet", index=False)
|
353 |
+
|
354 |
+
if Matcher.ref_df.empty:
|
355 |
+
print ("No reference data found with API")
|
356 |
+
return Matcher
|
357 |
+
|
358 |
+
return Matcher
|
359 |
+
|
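The API query loops in run_all_api_calls build up one small dataframe per query and write a parquet checkpoint every 500 calls so that a long run can be resumed. A self-contained sketch of that pattern, with fetch_one standing in for places_api_query (illustrative names, not from the commit):

import pandas as pd

def fetch_one(query: str) -> pd.DataFrame:
    # Stand-in for places_api_query: one small frame of results per query
    return pd.DataFrame({"query": [query], "result": ["placeholder"]})

queries = [f"postcode_{i}" for i in range(1200)]
collected = []

for i, q in enumerate(queries):
    collected.append(fetch_one(q))
    if (i + 1) % 500 == 0:
        # Periodic checkpoint, mirroring conduct_api_loop
        pd.concat(collected).to_parquet("api_checkpoint.parquet", index=False)

final_df = pd.concat(collected).drop_duplicates(keep="first", ignore_index=True)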
360 |
+
def check_ref_data_exists(Matcher:MatcherClass, ref_data_state:PandasDataFrame, in_ref:List[str], in_refcol:List[str], in_api:List[str], in_api_key:str, query_type:str, progress=gr.Progress()):
|
361 |
+
'''
|
362 |
+
Check for reference address data, do some preprocessing, and load in from the Addressbase API if required.
|
363 |
+
'''
|
364 |
+
|
365 |
+
# Check if reference data loaded, bring in if already there
|
366 |
+
if not ref_data_state.empty:
|
367 |
+
Matcher.ref_df = ref_data_state
|
368 |
+
Matcher.ref_name = get_file_name(in_ref[0].name)
|
369 |
+
Matcher.ref_df["Reference file"] = Matcher.ref_name
|
370 |
+
|
371 |
+
# Otherwise check for file name and load in. If nothing found, fail
|
372 |
+
else:
|
373 |
+
Matcher.ref_df = pd.DataFrame()
|
374 |
+
|
375 |
+
if not in_ref:
|
376 |
+
if in_api==False:
|
377 |
+
print ("No reference file provided, please provide one to continue")
|
378 |
+
return Matcher
|
379 |
+
# Check if api call required and api key is provided
|
380 |
+
else:
|
381 |
+
Matcher = run_all_api_calls(in_api_key, Matcher, query_type)
|
382 |
+
|
383 |
+
else:
|
384 |
+
Matcher.ref_name = get_file_name(in_ref[0].name)
|
385 |
+
|
386 |
+
# Concatenate all in reference files together
|
387 |
+
for ref_file in in_ref:
|
388 |
+
#print(ref_file.name)
|
389 |
+
temp_ref_file = read_file(ref_file.name)
|
390 |
+
|
391 |
+
file_name_out = get_file_name(ref_file.name)
|
392 |
+
temp_ref_file["Reference file"] = file_name_out
|
393 |
+
|
394 |
+
Matcher.ref_df = pd.concat([Matcher.ref_df, temp_ref_file])
|
395 |
+
|
396 |
+
# For the neural net model to work, the LLPG columns have to be in the LPI format (e.g. with columns SaoText, SaoStartNumber etc.). Here we check if we have that format.
|
397 |
+
|
398 |
+
if 'Address_LPI' in Matcher.ref_df.columns:
|
399 |
+
Matcher.ref_df = Matcher.ref_df.rename(columns={
|
400 |
+
"Name_LPI": "PaoText",
|
401 |
+
"Num_LPI": "PaoStartNumber",
|
402 |
+
"Num_Suffix_LPI":"PaoStartSuffix",
|
403 |
+
"Number End_LPI":"PaoEndNumber",
|
404 |
+
"Number_End_Suffix_LPI":"PaoEndSuffix",
|
405 |
+
|
406 |
+
"Secondary_Name_LPI":"SaoText",
|
407 |
+
"Secondary_Num_LPI":"SaoStartNumber",
|
408 |
+
"Secondary_Num_Suffix_LPI":"SaoStartSuffix",
|
409 |
+
"Secondary_Num_End_LPI":"SaoEndNumber",
|
410 |
+
"Secondary_Num_End_Suffix_LPI":"SaoEndSuffix",
|
411 |
+
"Postcode_LPI":"Postcode",
|
412 |
+
"Postal_Town_LPI":"PostTown",
|
413 |
+
"UPRN_BLPU": "UPRN"
|
414 |
+
})
|
415 |
+
|
416 |
+
#print("Matcher reference file: ", Matcher.ref_df['Reference file'])
|
417 |
+
|
418 |
+
# Check if the source is the Addressbase places API
|
419 |
+
if Matcher.ref_df.iloc[0]['Reference file'] == 'API' or '_api_' in Matcher.ref_df.iloc[0]['Reference file']:
|
420 |
+
Matcher.ref_df = Matcher.ref_df.rename(columns={
|
421 |
+
"ORGANISATION_NAME": "Organisation",
|
422 |
+
"ORGANISATION": "Organisation",
|
423 |
+
"PAO_TEXT": "PaoText",
|
424 |
+
"PAO_START_NUMBER": "PaoStartNumber",
|
425 |
+
"PAO_START_SUFFIX":"PaoStartSuffix",
|
426 |
+
"PAO_END_NUMBER":"PaoEndNumber",
|
427 |
+
"PAO_END_SUFFIX":"PaoEndSuffix",
|
428 |
+
"STREET_DESCRIPTION":"Street",
|
429 |
+
|
430 |
+
"SAO_TEXT":"SaoText",
|
431 |
+
"SAO_START_NUMBER":"SaoStartNumber",
|
432 |
+
"SAO_START_SUFFIX":"SaoStartSuffix",
|
433 |
+
"SAO_END_NUMBER":"SaoEndNumber",
|
434 |
+
"SAO_END_SUFFIX":"SaoEndSuffix",
|
435 |
+
|
436 |
+
"POSTCODE_LOCATOR":"Postcode",
|
437 |
+
"TOWN_NAME":"PostTown",
|
438 |
+
"LOCALITY_NAME":"LocalityName",
|
439 |
+
"ADMINISTRATIVE_AREA":"AdministrativeArea"
|
440 |
+
}, errors="ignore")
|
441 |
+
|
442 |
+
# Check ref_df file format
|
443 |
+
# If standard format, or it's an API call
|
444 |
+
if 'SaoText' in Matcher.ref_df.columns or in_api:
|
445 |
+
Matcher.standard_llpg_format = True
|
446 |
+
Matcher.ref_address_cols = ["Organisation", "SaoStartNumber", "SaoStartSuffix", "SaoEndNumber", "SaoEndSuffix", "SaoText", "PaoStartNumber", "PaoStartSuffix", "PaoEndNumber",
|
447 |
+
"PaoEndSuffix", "PaoText", "Street", "PostTown", "Postcode"]
|
448 |
+
# Add columns from the list if they don't exist
|
449 |
+
for col in Matcher.ref_address_cols:
|
450 |
+
if col not in Matcher.ref_df:
|
451 |
+
Matcher.ref_df[col] = np.nan
|
452 |
+
else:
|
453 |
+
Matcher.standard_llpg_format = False
|
454 |
+
Matcher.ref_address_cols = in_refcol
|
455 |
+
Matcher.ref_df = Matcher.ref_df.rename(columns={Matcher.ref_address_cols[-1]:"Postcode"})
|
456 |
+
Matcher.ref_address_cols[-1] = "Postcode"
|
457 |
+
|
458 |
+
|
459 |
+
# Reset index for ref_df as multiple files may have been combined with identical indices
|
460 |
+
Matcher.ref_df = Matcher.ref_df.reset_index() #.drop(["index","level_0"], axis = 1, errors="ignore").reset_index().drop(["index","level_0"], axis = 1, errors="ignore")
|
461 |
+
Matcher.ref_df.index.name = 'index'
|
462 |
+
|
463 |
+
return Matcher
|
464 |
+
|
465 |
+
def check_match_data_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api):
|
466 |
+
# Assign join field if not known
|
467 |
+
if not Matcher.search_df_key_field:
|
468 |
+
Matcher.search_df_key_field = "index"
|
469 |
+
|
470 |
+
# Set search address cols as entered column names
|
471 |
+
#print("In colnames in check match data: ", in_colnames)
|
472 |
+
Matcher.search_address_cols = in_colnames
|
473 |
+
|
474 |
+
# Check if data loaded already and bring it in
|
475 |
+
if not data_state.empty:
|
476 |
+
|
477 |
+
Matcher.search_df = data_state
|
478 |
+
|
479 |
+
|
480 |
+
|
481 |
+
Matcher.search_df['index'] = Matcher.search_df.index
|
482 |
+
|
483 |
+
else:
|
484 |
+
Matcher.search_df = pd.DataFrame()
|
485 |
+
|
486 |
+
# If someone has just entered open text, just load this instead
|
487 |
+
if in_text:
|
488 |
+
Matcher.search_df, Matcher.search_df_key_field, Matcher.search_address_cols, Matcher.search_postcode_col = prepare_search_address_string(in_text)
|
489 |
+
|
490 |
+
# If two matcher files are loaded in, the algorithm will combine them together
|
491 |
+
if Matcher.search_df.empty and in_file:
|
492 |
+
output_message, drop1, drop2, Matcher.search_df, results_data_state = initial_data_load(in_file)
|
493 |
+
|
494 |
+
file_list = [string.name for string in in_file]
|
495 |
+
data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
|
496 |
+
|
497 |
+
#print("Data file names: ", data_file_names)
|
498 |
+
Matcher.file_name = get_file_name(data_file_names[0])
|
499 |
+
|
500 |
+
# search_df makes column to use as index
|
501 |
+
Matcher.search_df['index'] = Matcher.search_df.index
|
502 |
+
|
503 |
+
|
504 |
+
# Join previously created results file onto search_df if previous results file exists
|
505 |
+
if not results_data_state.empty:
|
506 |
+
|
507 |
+
print("Joining on previous results file")
|
508 |
+
Matcher.results_on_orig_df = results_data_state.copy()
|
509 |
+
Matcher.search_df = Matcher.search_df.merge(results_data_state, on = "index", how = "left")
|
510 |
+
|
511 |
+
# If no join on column suggested, assume the user wants the UPRN
|
512 |
+
# print("in_joincol: ", in_joincol)
|
513 |
+
|
514 |
+
if not in_joincol:
|
515 |
+
Matcher.new_join_col = ['UPRN']
|
516 |
+
#Matcher.new_join_col = Matcher.new_join_col#[0]
|
517 |
+
|
518 |
+
else:
|
519 |
+
Matcher.new_join_col = in_joincol
|
520 |
+
#Matcher.new_join_col = Matcher.new_join_col
|
521 |
+
|
522 |
+
# Extract the column names from the input data
|
523 |
+
print("In colnames: ", in_colnames)
|
524 |
+
|
525 |
+
if len(in_colnames) > 1:
|
526 |
+
Matcher.search_postcode_col = [in_colnames[-1]]
|
527 |
+
|
528 |
+
print("Postcode col: ", Matcher.search_postcode_col)
|
529 |
+
|
530 |
+
elif len(in_colnames) == 1:
|
531 |
+
Matcher.search_df['full_address_postcode'] = Matcher.search_df[in_colnames[0]]
|
532 |
+
Matcher.search_postcode_col = ['full_address_postcode']
|
533 |
+
Matcher.search_address_cols.append('full_address_postcode')
|
534 |
+
|
535 |
+
# Check for column that indicates there are existing matches. The code will then search this column for entries, and will remove them from the data to be searched
|
536 |
+
Matcher.existing_match_cols = in_existing
|
537 |
+
|
538 |
+
if in_existing:
|
539 |
+
if "Matched with reference address" in Matcher.search_df.columns:
|
540 |
+
Matcher.search_df.loc[~Matcher.search_df[in_existing].isna(), "Matched with reference address"] = True
|
541 |
+
else: Matcher.search_df["Matched with reference address"] = ~Matcher.search_df[in_existing].isna()
|
542 |
+
|
543 |
+
print("Shape of search_df before filtering is: ", Matcher.search_df.shape)
|
544 |
+
|
545 |
+
### Filter addresses to those with length > 0
|
546 |
+
zero_length_search_df = Matcher.search_df.copy()[Matcher.search_address_cols]
|
547 |
+
zero_length_search_df = zero_length_search_df.fillna('').infer_objects(copy=False)
|
548 |
+
Matcher.search_df["address_cols_joined"] = zero_length_search_df.astype(str).sum(axis=1).str.strip()
|
549 |
+
|
550 |
+
length_more_than_0 = Matcher.search_df["address_cols_joined"].str.len() > 0
|
551 |
+
|
552 |
+
|
553 |
+
### Filter addresses to match to postcode areas present in both search_df and ref_df_cleaned only (postcode without the last two characters). Only run if API call is false. When the API is called, relevant addresses and postcodes should be brought in by the API.
|
554 |
+
if not in_api:
|
555 |
+
if Matcher.filter_to_lambeth_pcodes == True:
|
556 |
+
Matcher.search_df["postcode_search_area"] = Matcher.search_df[Matcher.search_postcode_col[0]].str.strip().str.upper().str.replace(" ", "").str[:-2]
|
557 |
+
Matcher.ref_df["postcode_search_area"] = Matcher.ref_df["Postcode"].str.strip().str.upper().str.replace(" ", "").str[:-2]
|
558 |
+
|
559 |
+
unique_ref_pcode_area = (Matcher.ref_df["postcode_search_area"][Matcher.ref_df["postcode_search_area"].str.len() > 3]).unique()
|
560 |
+
postcode_found_in_search = Matcher.search_df["postcode_search_area"].isin(unique_ref_pcode_area)
|
561 |
+
|
562 |
+
Matcher.search_df["Excluded from search"] = "Included in search"
|
563 |
+
Matcher.search_df.loc[~(postcode_found_in_search), "Excluded from search"] = "Postcode area not found"
|
564 |
+
Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
|
565 |
+
Matcher.pre_filter_search_df = Matcher.search_df.copy()#.drop(["index", "level_0"], axis = 1, errors = "ignore").reset_index()
|
566 |
+
Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
|
567 |
+
|
568 |
+
Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
|
569 |
+
Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
|
570 |
+
|
571 |
+
|
572 |
+
# Exclude records that have already been matched separately, i.e. if 'Matched with reference address' column exists, and has trues in it
|
573 |
+
if "Matched with reference address" in Matcher.search_df.columns:
|
574 |
+
previously_matched = Matcher.pre_filter_search_df["Matched with reference address"] == True
|
575 |
+
Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
|
576 |
+
|
577 |
+
Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0) | (previously_matched)]
|
578 |
+
Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0) & ~(previously_matched)]
|
579 |
+
|
580 |
+
else:
|
581 |
+
Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
|
582 |
+
Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
|
583 |
+
|
584 |
+
print("Shape of ref_df before filtering is: ", Matcher.ref_df.shape)
|
585 |
+
|
586 |
+
unique_search_pcode_area = (Matcher.search_df["postcode_search_area"]).unique()
|
587 |
+
postcode_found_in_ref = Matcher.ref_df["postcode_search_area"].isin(unique_search_pcode_area)
|
588 |
+
Matcher.ref_df = Matcher.ref_df[postcode_found_in_ref]
|
589 |
+
|
590 |
+
Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("postcode_search_area", axis = 1)
|
591 |
+
Matcher.search_df = Matcher.search_df.drop("postcode_search_area", axis = 1)
|
592 |
+
Matcher.ref_df = Matcher.ref_df.drop("postcode_search_area", axis = 1)
|
593 |
+
Matcher.excluded_df = Matcher.excluded_df.drop("postcode_search_area", axis = 1)
|
594 |
+
else:
|
595 |
+
Matcher.pre_filter_search_df = Matcher.search_df.copy()
|
596 |
+
Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
|
597 |
+
|
598 |
+
Matcher.excluded_df = Matcher.search_df[~(length_more_than_0)]
|
599 |
+
Matcher.search_df = Matcher.search_df[length_more_than_0]
|
600 |
+
|
601 |
+
|
602 |
+
Matcher.search_df = Matcher.search_df.drop("address_cols_joined", axis = 1, errors="ignore")
|
603 |
+
Matcher.excluded_df = Matcher.excluded_df.drop("address_cols_joined", axis = 1, errors="ignore")
|
604 |
+
|
605 |
+
Matcher.search_df_not_matched = Matcher.search_df
|
606 |
+
|
607 |
+
|
608 |
+
# If this is for an API call, we need to convert the search_df address columns to one column now. This is so the API call can be made and the reference dataframe created.
|
609 |
+
if in_api:
|
610 |
+
|
611 |
+
if in_file:
|
612 |
+
output_message, drop1, drop2, df, results_data_state = initial_data_load(in_file)
|
613 |
+
|
614 |
+
file_list = [string.name for string in in_file]
|
615 |
+
data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
|
616 |
+
|
617 |
+
Matcher.file_name = get_file_name(data_file_names[0])
|
618 |
+
|
619 |
+
else:
|
620 |
+
if in_text:
|
621 |
+
Matcher.file_name = in_text
|
622 |
+
else:
|
623 |
+
Matcher.file_name = "API call"
|
624 |
+
|
625 |
+
# Exclude records that have already been matched separately, i.e. if 'Matched with reference address' column exists, and has trues in it
|
626 |
+
if in_existing:
|
627 |
+
print("Checking for previously matched records")
|
628 |
+
Matcher.pre_filter_search_df = Matcher.search_df.copy()
|
629 |
+
previously_matched = ~Matcher.pre_filter_search_df[in_existing].isnull()
|
630 |
+
Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
|
631 |
+
|
632 |
+
Matcher.excluded_df = Matcher.search_df.copy()[~(length_more_than_0) | (previously_matched)]
|
633 |
+
Matcher.search_df = Matcher.search_df[(length_more_than_0) & ~(previously_matched)]
|
634 |
+
|
635 |
+
if type(Matcher.search_df) == str: search_df_cleaned, search_df_key_field, search_address_cols = prepare_search_address_string(Matcher.search_df)
|
636 |
+
else: search_df_cleaned = prepare_search_address(Matcher.search_df, Matcher.search_address_cols, Matcher.search_postcode_col, Matcher.search_df_key_field)
|
637 |
+
|
638 |
+
|
639 |
+
Matcher.search_df['full_address_postcode'] = search_df_cleaned["full_address"]
|
640 |
+
#Matcher.search_df = Matcher.search_df.reset_index(drop=True)
|
641 |
+
#Matcher.search_df.index.name = 'index'
|
642 |
+
|
643 |
+
return Matcher
|
644 |
+
|
645 |
+
def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, Matcher, in_api, in_api_key):
|
646 |
+
'''
|
647 |
+
Load in user inputs from the Gradio interface. Convert all input types (single address, or csv input) into standardised data format that can be used downstream for the fuzzy matching.
|
648 |
+
'''
|
649 |
+
today_rev = datetime.now().strftime("%Y%m%d")
|
650 |
+
|
651 |
+
# Abort flag for if it's not even possible to attempt the first stage of the match for some reason
|
652 |
+
Matcher.abort_flag = False
|
653 |
+
|
654 |
+
### ref_df FILES ###
|
655 |
+
# If not an API call, run this first
|
656 |
+
if not in_api:
|
657 |
+
Matcher = check_ref_data_exists(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
|
658 |
+
|
659 |
+
### MATCH/SEARCH FILES ###
|
660 |
+
# If doing API calls, we need to know the search data before querying for specific addresses/postcodes
|
661 |
+
Matcher = check_match_data_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)
|
662 |
+
|
663 |
+
|
664 |
+
# If an API call, ref_df data is loaded after
|
665 |
+
if in_api:
|
666 |
+
Matcher = check_ref_data_exists(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
|
667 |
+
|
668 |
+
#print("Resetting index.")
|
669 |
+
# API-called data will often have duplicate indexes in it - drop them to avoid conflicts down the line
|
670 |
+
#Matcher.ref_df = Matcher.ref_df.reset_index(drop = True)
|
671 |
+
|
672 |
+
print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
|
673 |
+
print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
|
674 |
+
|
675 |
+
Matcher.match_outputs_name = "diagnostics_initial_" + today_rev + ".csv"
|
676 |
+
Matcher.results_orig_df_name = "results_initial_" + today_rev + ".csv"
|
677 |
+
|
678 |
+
#Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
|
679 |
+
#Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
|
680 |
+
|
681 |
+
return Matcher
|
682 |
+
|
683 |
+
# DF preparation functions
|
684 |
+
|
685 |
+
# Run batch of matches
|
686 |
+
def run_match_batch(InitialMatch, batch_n, total_batches, progress=gr.Progress()):
|
687 |
+
if run_fuzzy_match == True:
|
688 |
+
|
689 |
+
overall_tic = time.perf_counter()
|
690 |
+
|
691 |
+
progress(0, desc= "Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Fuzzy match - non-standardised dataset")
|
692 |
+
df_name = "Fuzzy not standardised"
|
693 |
+
|
694 |
+
''' FUZZY MATCHING '''
|
695 |
+
|
696 |
+
''' Run fuzzy match on non-standardised dataset '''
|
697 |
+
|
698 |
+
FuzzyNotStdMatch = orchestrate_match_run(Matcher = copy.copy(InitialMatch), standardise = False, nnet = False, file_stub= "not_std_", df_name = df_name)
|
699 |
+
|
700 |
+
if FuzzyNotStdMatch.abort_flag == True:
|
701 |
+
message = "Nothing to match! Aborting address check."
|
702 |
+
print(message)
|
703 |
+
return message, InitialMatch
|
704 |
+
|
705 |
+
FuzzyNotStdMatch = combine_two_matches(InitialMatch, FuzzyNotStdMatch, df_name)
|
706 |
+
|
707 |
+
if (len(FuzzyNotStdMatch.search_df_not_matched) == 0) | (sum(FuzzyNotStdMatch.match_results_output[FuzzyNotStdMatch.match_results_output['full_match']==False]['fuzzy_score'])==0):
|
708 |
+
overall_toc = time.perf_counter()
|
709 |
+
time_out = f"The fuzzy match script took {overall_toc - overall_tic:0.1f} seconds"
|
710 |
+
FuzzyNotStdMatch.output_summary = FuzzyNotStdMatch.output_summary + " Neural net match not attempted. "# + time_out
|
711 |
+
return FuzzyNotStdMatch.output_summary, FuzzyNotStdMatch
|
712 |
+
|
713 |
+
''' Run fuzzy match on standardised dataset '''
|
714 |
+
|
715 |
+
progress(.25, desc="Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Fuzzy match - standardised dataset")
|
716 |
+
df_name = "Fuzzy standardised"
|
717 |
+
|
718 |
+
FuzzyStdMatch = orchestrate_match_run(Matcher = copy.copy(FuzzyNotStdMatch), standardise = True, nnet = False, file_stub= "std_", df_name = df_name)
|
719 |
+
FuzzyStdMatch = combine_two_matches(FuzzyNotStdMatch, FuzzyStdMatch, df_name)
|
720 |
+
|
721 |
+
''' Continue if reference file in correct format, and neural net model exists. Also if data not too long '''
|
722 |
+
if ((len(FuzzyStdMatch.search_df_not_matched) == 0) | (FuzzyStdMatch.standard_llpg_format == False) |\
|
723 |
+
(os.path.exists(FuzzyStdMatch.model_dir_name + '/saved_model.zip') == False) | (run_nnet_match == False)):
|
724 |
+
overall_toc = time.perf_counter()
|
725 |
+
time_out = f"The fuzzy match script took {overall_toc - overall_tic:0.1f} seconds"
|
726 |
+
FuzzyStdMatch.output_summary = FuzzyStdMatch.output_summary + " Neural net match not attempted. "# + time_out
|
727 |
+
return FuzzyStdMatch.output_summary, FuzzyStdMatch
|
728 |
+
|
729 |
+
if run_nnet_match == True:
|
730 |
+
|
731 |
+
''' NEURAL NET '''
|
732 |
+
|
733 |
+
if run_fuzzy_match == False:
|
734 |
+
FuzzyStdMatch = copy.copy(InitialMatch)
|
735 |
+
overall_tic = time.perf_counter()
|
736 |
+
|
737 |
+
''' First on non-standardised addresses '''
|
738 |
+
progress(.50, desc="Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Neural net - non-standardised dataset")
|
739 |
+
df_name = "Neural net not standardised"
|
740 |
+
|
741 |
+
FuzzyNNetNotStdMatch = orchestrate_match_run(Matcher = copy.copy(FuzzyStdMatch), standardise = False, nnet = True, file_stub= "nnet_not_std_", df_name = df_name)
|
742 |
+
FuzzyNNetNotStdMatch = combine_two_matches(FuzzyStdMatch, FuzzyNNetNotStdMatch, df_name)
|
743 |
+
|
744 |
+
if (len(FuzzyNNetNotStdMatch.search_df_not_matched) == 0):
|
745 |
+
overall_toc = time.perf_counter()
|
746 |
+
time_out = f"The whole match script took {overall_toc - overall_tic:0.1f} seconds"
|
747 |
+
FuzzyNNetNotStdMatch.output_summary = FuzzyNNetNotStdMatch.output_summary# + time_out
|
748 |
+
return FuzzyNNetNotStdMatch.output_summary, FuzzyNNetNotStdMatch
|
749 |
+
|
750 |
+
''' Next on standardised addresses '''
|
751 |
+
progress(.75, desc="Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Neural net - standardised dataset")
|
752 |
+
df_name = "Neural net standardised"
|
753 |
+
|
754 |
+
FuzzyNNetStdMatch = orchestrate_match_run(Matcher = copy.copy(FuzzyNNetNotStdMatch), standardise = True, nnet = True, file_stub= "nnet_std_", df_name = df_name)
|
755 |
+
FuzzyNNetStdMatch = combine_two_matches(FuzzyNNetNotStdMatch, FuzzyNNetStdMatch, df_name)
|
756 |
+
|
757 |
+
if run_fuzzy_match == False:
|
758 |
+
overall_toc = time.perf_counter()
|
759 |
+
time_out = f"The neural net match script took {overall_toc - overall_tic:0.1f} seconds"
|
760 |
+
FuzzyNNetStdMatch.output_summary = FuzzyNNetStdMatch.output_summary + " Only Neural net match attempted. "# + time_out
|
761 |
+
return FuzzyNNetStdMatch.output_summary, FuzzyNNetStdMatch
|
762 |
+
|
763 |
+
overall_toc = time.perf_counter()
|
764 |
+
time_out = f"The whole match script took {overall_toc - overall_tic:0.1f} seconds"
|
765 |
+
|
766 |
+
summary_of_summaries = FuzzyNotStdMatch.output_summary + "\n" + FuzzyStdMatch.output_summary + "\n" + FuzzyNNetStdMatch.output_summary + "\n" + time_out
|
767 |
+
|
768 |
+
return summary_of_summaries, FuzzyNNetStdMatch
|
769 |
+
|
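run_match_batch above tries progressively heavier passes (fuzzy then neural net, each without and with standardisation) and exits as soon as nothing is left unmatched. A schematic sketch of that early-exit cascade with placeholder pass functions (illustrative only):

def cascade_match(records, passes):
    # Apply each matching pass in turn to whatever is still unmatched
    matched = {}
    remaining = list(records)
    for name, match_pass in passes:
        if not remaining:
            break  # early exit, as in the length checks inside run_match_batch
        newly_matched, remaining = match_pass(remaining)
        matched.update({r: name for r in newly_matched})
    return matched, remaining

# Toy passes: the first resolves even numbers, the second resolves whatever is left
even_pass = lambda rs: ([r for r in rs if r % 2 == 0], [r for r in rs if r % 2 != 0])
rest_pass = lambda rs: (list(rs), [])
matched, left = cascade_match(range(5), [("fuzzy", even_pass), ("nnet", rest_pass)])
print(matched)  # {0: 'fuzzy', 2: 'fuzzy', 4: 'fuzzy', 1: 'nnet', 3: 'nnet'}
print(left)     # []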
770 |
+
# Overarching functions
|
771 |
+
def orchestrate_match_run(Matcher, standardise = False, nnet = False, file_stub= "not_std_", df_name = "Fuzzy not standardised"):
|
772 |
+
|
773 |
+
today_rev = datetime.now().strftime("%Y%m%d")
|
774 |
+
|
775 |
+
#print(Matcher.standardise)
|
776 |
+
Matcher.standardise = standardise
|
777 |
+
|
778 |
+
if Matcher.search_df_not_matched.empty:
|
779 |
+
print("Nothing to match! At start of preparing run.")
|
780 |
+
return Matcher
|
781 |
+
|
782 |
+
if nnet == False:
|
783 |
+
diag_shortlist,\
|
784 |
+
diag_best_match,\
|
785 |
+
match_results_output,\
|
786 |
+
results_on_orig_df,\
|
787 |
+
summary,\
|
788 |
+
search_address_cols =\
|
789 |
+
full_fuzzy_match(Matcher.search_df_not_matched.copy(),
|
790 |
+
Matcher.standardise,
|
791 |
+
Matcher.search_df_key_field,
|
792 |
+
Matcher.search_address_cols,
|
793 |
+
Matcher.search_df_cleaned,
|
794 |
+
Matcher.search_df_after_stand,
|
795 |
+
Matcher.search_df_after_full_stand,
|
796 |
+
Matcher.ref_df_cleaned,
|
797 |
+
Matcher.ref_df_after_stand,
|
798 |
+
Matcher.ref_df_after_full_stand,
|
799 |
+
Matcher.fuzzy_match_limit,
|
800 |
+
Matcher.fuzzy_scorer_used)
|
801 |
+
if match_results_output.empty:
|
802 |
+
print("Match results empty")
|
803 |
+
Matcher.abort_flag = True
|
804 |
+
return Matcher
|
805 |
+
|
806 |
+
else:
|
807 |
+
Matcher.diag_shortlist = diag_shortlist
|
808 |
+
Matcher.diag_best_match = diag_best_match
|
809 |
+
Matcher.match_results_output = match_results_output
|
810 |
+
|
811 |
+
else:
|
812 |
+
match_results_output,\
|
813 |
+
results_on_orig_df,\
|
814 |
+
summary,\
|
815 |
+
predict_df_nnet =\
|
816 |
+
full_nn_match(
|
817 |
+
Matcher.ref_address_cols,
|
818 |
+
Matcher.search_df_not_matched.copy(),
|
819 |
+
Matcher.search_address_cols,
|
820 |
+
Matcher.search_df_key_field,
|
821 |
+
Matcher.standardise,
|
822 |
+
Matcher.exported_model[0],
|
823 |
+
Matcher.matching_variables,
|
824 |
+
Matcher.text_columns,
|
825 |
+
Matcher.weights,
|
826 |
+
Matcher.fuzzy_method,
|
827 |
+
Matcher.score_cut_off,
|
828 |
+
Matcher.match_results_output.copy(),
|
829 |
+
Matcher.filter_to_lambeth_pcodes,
|
830 |
+
Matcher.model_type,
|
831 |
+
Matcher.word_to_index,
|
832 |
+
Matcher.cat_to_idx,
|
833 |
+
Matcher.device,
|
834 |
+
Matcher.vocab,
|
835 |
+
Matcher.labels_list,
|
836 |
+
Matcher.search_df_cleaned,
|
837 |
+
Matcher.ref_df_after_stand,
|
838 |
+
Matcher.search_df_after_stand,
|
839 |
+
Matcher.search_df_after_full_stand)
|
840 |
+
|
841 |
+
if match_results_output.empty:
|
842 |
+
print("Match results empty")
|
843 |
+
Matcher.abort_flag = True
|
844 |
+
return Matcher
|
845 |
+
else:
|
846 |
+
Matcher.match_results_output = match_results_output
|
847 |
+
Matcher.predict_df_nnet = predict_df_nnet
|
848 |
+
|
849 |
+
# Save to file
|
850 |
+
Matcher.results_on_orig_df = results_on_orig_df
|
851 |
+
|
852 |
+
Matcher.summary = summary
|
853 |
+
|
854 |
+
Matcher.output_summary = create_match_summary(Matcher.match_results_output, df_name = df_name)
|
855 |
+
|
856 |
+
Matcher.match_outputs_name = "diagnostics_" + file_stub + today_rev + ".csv"
|
857 |
+
Matcher.results_orig_df_name = "results_" + file_stub + today_rev + ".csv"
|
858 |
+
|
859 |
+
Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
|
860 |
+
Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
|
861 |
+
|
862 |
+
return Matcher
|
863 |
+
|
864 |
+
# Overarching fuzzy match function
|
865 |
+
def full_fuzzy_match(search_df:PandasDataFrame,
|
866 |
+
standardise:bool,
|
867 |
+
search_df_key_field:str,
|
868 |
+
search_address_cols:List[str],
|
869 |
+
search_df_cleaned:PandasDataFrame,
|
870 |
+
search_df_after_stand:PandasDataFrame,
|
871 |
+
search_df_after_full_stand:PandasDataFrame,
|
872 |
+
ref_df_cleaned:PandasDataFrame,
|
873 |
+
ref_df_after_stand:PandasDataFrame,
|
874 |
+
ref_df_after_full_stand:PandasDataFrame,
|
875 |
+
fuzzy_match_limit:float,
|
876 |
+
fuzzy_scorer_used:str,
|
877 |
+
new_join_col:List[str]=["UPRN"],
|
878 |
+
fuzzy_search_addr_limit:float = 100,
|
879 |
+
filter_to_lambeth_pcodes:bool=False):
|
880 |
+
|
881 |
+
'''
|
882 |
+
Compare addresses in a 'search address' dataframe with a 'reference address' dataframe by using fuzzy matching from the rapidfuzz package, blocked by postcode and then street.
|
883 |
+
'''
|
884 |
+
|
885 |
+
# Break if search item has length 0
|
886 |
+
if search_df.empty:
|
887 |
+
out_error = "Nothing to match! Just started fuzzy match."
|
888 |
+
print(out_error)
|
889 |
+
return pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(), out_error,search_address_cols
|
890 |
+
|
891 |
+
# If standardise is true, replace relevant variables with standardised versions
|
892 |
+
if standardise == True:
|
893 |
+
df_name = "standardised address"
|
894 |
+
search_df_after_stand = search_df_after_full_stand
|
895 |
+
ref_df_after_stand = ref_df_after_full_stand
|
896 |
+
else:
|
897 |
+
df_name = "non-standardised address"
|
898 |
+
|
899 |
+
# RUN WITH POSTCODE AS A BLOCKER #
|
900 |
+
# Fuzzy match against reference addresses
|
901 |
+
|
902 |
+
# Remove rows from ref search series where postcode is not found in the search_df
|
903 |
+
search_df_after_stand_series = search_df_after_stand.copy().set_index('postcode_search')['search_address_stand'].sort_index()
|
904 |
+
ref_df_after_stand_series = ref_df_after_stand.copy().set_index('postcode_search')['ref_address_stand'].sort_index()
|
905 |
+
|
906 |
+
#print(search_df_after_stand_series.index.tolist())
|
907 |
+
#print(ref_df_after_stand_series.index.tolist())
|
908 |
+
|
909 |
+
ref_df_after_stand_series_checked = ref_df_after_stand_series.copy()[ref_df_after_stand_series.index.isin(search_df_after_stand_series.index.tolist())]
|
910 |
+
|
911 |
+
# pd.DataFrame(ref_df_after_stand_series_checked.to_csv("ref_df_after_stand_series_checked.csv"))
|
912 |
+
|
913 |
+
if len(ref_df_after_stand_series_checked) == 0:
|
914 |
+
print("Nothing relevant in reference data to match!")
|
915 |
+
return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),pd.DataFrame(),"Nothing relevant in reference data to match!",search_address_cols
|
916 |
+
|
917 |
+
# 'matched' is the list for which every single row is searched for in the reference list (the ref_df).
|
918 |
+
|
919 |
+
print("Starting the fuzzy match")
|
920 |
+
|
921 |
+
tic = time.perf_counter()
|
922 |
+
results = string_match_by_post_code_multiple(match_address_series = search_df_after_stand_series.copy(),
|
923 |
+
reference_address_series = ref_df_after_stand_series_checked,
|
924 |
+
search_limit = fuzzy_search_addr_limit,
|
925 |
+
scorer_name = fuzzy_scorer_used)
|
926 |
+
|
927 |
+
toc = time.perf_counter()
|
928 |
+
print(f"Performed the fuzzy match in {toc - tic:0.1f} seconds")
|
929 |
+
|
930 |
+
|
931 |
+
# Create result dfs
|
932 |
+
match_results_output, diag_shortlist, diag_best_match = _create_fuzzy_match_results_output(results, search_df_after_stand, ref_df_cleaned, ref_df_after_stand, fuzzy_match_limit, search_df_cleaned, search_df_key_field, new_join_col, standardise, blocker_col = "Postcode")
|
933 |
+
|
934 |
+
match_results_output['match_method'] = "Fuzzy match - postcode"
|
935 |
+
|
936 |
+
search_df_not_matched = filter_not_matched(match_results_output, search_df_after_stand, search_df_key_field)
|
937 |
+
|
938 |
+
|
939 |
+
# If nothing left to match, break
|
940 |
+
if (sum(match_results_output['full_match']==False) == 0) | (sum(match_results_output[match_results_output['full_match']==False]['fuzzy_score'])==0):
|
941 |
+
print("Nothing left to match!")
|
942 |
+
|
943 |
+
summary = create_match_summary(match_results_output, df_name)
|
944 |
+
|
945 |
+
if type(search_df) != str:
|
946 |
+
results_on_orig_df = join_to_orig_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
|
947 |
+
else: results_on_orig_df = match_results_output
|
948 |
+
|
949 |
+
return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols
|
950 |
+
|
951 |
+
|
952 |
+
# RUN WITH STREET AS A BLOCKER #
|
953 |
+
|
954 |
+
### Redo with street as blocker
|
955 |
+
search_df_after_stand_street = search_df_not_matched.copy()
|
956 |
+
search_df_after_stand_street['search_address_stand_w_pcode'] = search_df_after_stand_street['search_address_stand'] + " " + search_df_after_stand_street['postcode_search']
|
957 |
+
ref_df_after_stand['ref_address_stand_w_pcode'] = ref_df_after_stand['ref_address_stand'] + " " + ref_df_after_stand['postcode_search']
|
958 |
+
|
959 |
+
search_df_after_stand_street['street']= search_df_after_stand_street['full_address_search'].apply(extract_street_name)
|
960 |
+
# Exclude non-postal addresses from street-blocked search
|
961 |
+
search_df_after_stand_street.loc[search_df_after_stand_street['Excluded from search'] == "Excluded - non-postal address", 'street'] = ""
|
962 |
+
|
963 |
+
### Create lookup lists
|
964 |
+
search_df_match_series_street = search_df_after_stand_street.copy().set_index('street')['search_address_stand']
|
965 |
+
ref_df_after_stand_series_street = ref_df_after_stand.copy().set_index('Street')['ref_address_stand']
|
966 |
+
|
967 |
+
# Remove rows where street is not in ref_df df
|
968 |
+
#index_check = ref_df_after_stand_series_street.index.isin(search_df_match_series_street.index)
|
969 |
+
#ref_df_after_stand_series_street_checked = ref_df_after_stand_series_street.copy()[index_check == True]
|
970 |
+
|
971 |
+
ref_df_after_stand_series_street_checked = ref_df_after_stand_series_street.copy()[ref_df_after_stand_series_street.index.isin(search_df_match_series_street.index.tolist())]
|
972 |
+
|
973 |
+
# If nothing left to match, break
|
974 |
+
if (len(ref_df_after_stand_series_street_checked) == 0) | ((len(search_df_match_series_street) == 0)):
|
975 |
+
|
976 |
+
summary = create_match_summary(match_results_output, df_name)
|
977 |
+
|
978 |
+
if type(search_df) != str:
|
979 |
+
results_on_orig_df = join_to_orig_df(match_results_output, search_df_after_stand, search_df_key_field, new_join_col)
|
980 |
+
else: results_on_orig_df = match_results_output
|
981 |
+
|
982 |
+
return diag_shortlist, diag_best_match,\
|
983 |
+
match_results_output, results_on_orig_df, summary, search_address_cols
|
984 |
+
|
985 |
+
print("Starting the fuzzy match with street as blocker")
|
986 |
+
|
987 |
+
tic = time.perf_counter()
|
988 |
+
results_st = string_match_by_post_code_multiple(match_address_series = search_df_match_series_street.copy(),
|
989 |
+
reference_address_series = ref_df_after_stand_series_street_checked.copy(),
|
990 |
+
search_limit = fuzzy_search_addr_limit,
|
991 |
+
scorer_name = fuzzy_scorer_used)
|
992 |
+
|
993 |
+
toc = time.perf_counter()
|
994 |
+
|
995 |
+
print(f"Performed the fuzzy match in {toc - tic:0.1f} seconds")
|
996 |
+
|
997 |
+
match_results_output_st, diag_shortlist_st, diag_best_match_st = _create_fuzzy_match_results_output(results_st, search_df_after_stand_street, ref_df_cleaned, ref_df_after_stand,\
|
998 |
+
fuzzy_match_limit, search_df_cleaned, search_df_key_field, new_join_col, standardise, blocker_col = "Street")
|
999 |
+
match_results_output_st['match_method'] = "Fuzzy match - street"
|
1000 |
+
|
1001 |
+
match_results_output_st_out = combine_std_df_remove_dups(match_results_output, match_results_output_st, orig_addr_col = search_df_key_field)
|
1002 |
+
|
1003 |
+
match_results_output = match_results_output_st_out
|
1004 |
+
|
1005 |
+
summary = create_match_summary(match_results_output, df_name)
|
1006 |
+
|
1007 |
+
### Join UPRN back onto orig df
|
1008 |
+
|
1009 |
+
if type(search_df) != str:
|
1010 |
+
results_on_orig_df = join_to_orig_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
|
1011 |
+
else: results_on_orig_df = match_results_output
|
1012 |
+
|
1013 |
+
return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols
|
1014 |
+
|
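full_fuzzy_match compares addresses only within postcode (and then street) blocks rather than all-against-all. A minimal sketch of postcode blocking using rapidfuzz directly, with made-up addresses; the committed code routes this through string_match_by_post_code_multiple instead:

from rapidfuzz import fuzz, process

search = {"SW1A1AA": ["10 DOWNING STREET LONDON"]}
reference = {
    "SW1A1AA": ["10 DOWNING STREET LONDON", "11 DOWNING STREET LONDON"],
    "E19GY": ["1 OTHER ROAD LONDON"],
}

for postcode, addresses in search.items():
    candidates = reference.get(postcode, [])  # blocking: same-postcode candidates only
    for addr in addresses:
        best = process.extractOne(addr, candidates, scorer=fuzz.token_sort_ratio)
        if best is not None:
            match_text, score, _ = best
            print(postcode, addr, "->", match_text, round(score, 1))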
1015 |
+
# Overarching NN function
|
1016 |
+
def full_nn_match(ref_address_cols:List[str],
|
1017 |
+
search_df:PandasDataFrame,
|
1018 |
+
search_address_cols:List[str],
|
1019 |
+
search_df_key_field:str,
|
1020 |
+
standardise:bool,
|
1021 |
+
exported_model:list,
|
1022 |
+
matching_variables:List[str],
|
1023 |
+
text_columns:List[str],
|
1024 |
+
weights:dict,
|
1025 |
+
fuzzy_method:str,
|
1026 |
+
score_cut_off:float,
|
1027 |
+
match_results:PandasDataFrame,
|
1028 |
+
filter_to_lambeth_pcodes:bool,
|
1029 |
+
model_type:str,
|
1030 |
+
word_to_index:dict,
|
1031 |
+
cat_to_idx:dict,
|
1032 |
+
device:str,
|
1033 |
+
vocab:List[str],
|
1034 |
+
labels_list:List[str],
|
1035 |
+
search_df_cleaned:PandasDataFrame,
|
1036 |
+
ref_df_after_stand:PandasDataFrame,
|
1037 |
+
search_df_after_stand:PandasDataFrame,
|
1038 |
+
search_df_after_full_stand:PandasDataFrame,
|
1039 |
+
new_join_col:List=["UPRN"]):
|
1040 |
+
'''
|
1041 |
+
Use a neural network model to partition 'search addresses' into constituent parts in the format of UK Ordnance Survey Land Property Identifier (LPI) addresses. These address components are compared individually against reference addresses in the same format to give an overall match score using the recordlinkage package.
|
1042 |
+
'''
|
1043 |
+
|
1044 |
+
# Break if search item has length 0
|
1045 |
+
if search_df.empty:
|
1046 |
+
out_error = "Nothing to match!"
|
1047 |
+
print(out_error)
|
1048 |
+
return pd.DataFrame(), pd.DataFrame(), out_error, pd.DataFrame()
|
1049 |
+
|
1050 |
+
# If it is the standardisation step, or you have come from the fuzzy match area
|
1051 |
+
if (standardise == True): # | (run_fuzzy_match == True & standardise == False):
|
1052 |
+
df_name = "standardised address"
|
1053 |
+
|
1054 |
+
search_df_after_stand = search_df_after_full_stand
|
1055 |
+
|
1056 |
+
else:
|
1057 |
+
df_name = "non-standardised address"
|
1058 |
+
|
1059 |
+
print(search_df_after_stand.shape[0])
|
1060 |
+
print(ref_df_after_stand.shape[0])
|
1061 |
+
|
1062 |
+
# Predict on search data to extract LPI address components
|
1063 |
+
|
1064 |
+
#predict_len = len(search_df_cleaned["full_address"])
|
1065 |
+
all_columns = list(search_df_cleaned) # Creates list of all column headers
|
1066 |
+
search_df_cleaned[all_columns] = search_df_cleaned[all_columns].astype(str)
|
1067 |
+
predict_data = list(search_df_after_stand['search_address_stand'])
|
1068 |
+
|
1069 |
+
### Run predict function
|
1070 |
+
print("Starting neural net prediction for " + str(len(predict_data)) + " addresses")
|
1071 |
+
|
1072 |
+
tic = time.perf_counter()
|
1073 |
+
|
1074 |
+
# Determine the number of chunks
|
1075 |
+
num_chunks = math.ceil(len(predict_data) / max_predict_len)
|
1076 |
+
list_out_all = []
|
1077 |
+
predict_df_all = []
|
1078 |
+
|
1079 |
+
for i in range(num_chunks):
|
1080 |
+
print("Starting to predict batch " + str(i+ 1) + " of " + str(num_chunks) + " batches.")
|
1081 |
+
|
1082 |
+
start_idx = i * max_predict_len
|
1083 |
+
end_idx = start_idx + max_predict_len
|
1084 |
+
|
1085 |
+
# Extract the current chunk of data
|
1086 |
+
chunk_data = predict_data[start_idx:end_idx]
|
1087 |
+
|
1088 |
+
# Replace blank strings with a single space
|
1089 |
+
chunk_data = [" " if s in ("") else s for s in chunk_data]
|
1090 |
+
|
1091 |
+
if (model_type == "gru") | (model_type == "lstm"):
|
1092 |
+
list_out, predict_df = full_predict_torch(model=exported_model, model_type=model_type,
|
1093 |
+
input_text=chunk_data, word_to_index=word_to_index,
|
1094 |
+
cat_to_idx=cat_to_idx, device=device)
|
1095 |
+
else:
|
1096 |
+
list_out, predict_df = full_predict_func(chunk_data, exported_model, vocab, labels_list)
|
1097 |
+
|
1098 |
+
# Append the results
|
1099 |
+
list_out_all.extend(list_out)
|
1100 |
+
predict_df_all.append(predict_df)
|
1101 |
+
|
1102 |
+
# Concatenate all the results dataframes
|
1103 |
+
predict_df_all = pd.concat(predict_df_all, ignore_index=True)
|
1104 |
+
|
1105 |
+
toc = time.perf_counter()
|
1106 |
+
|
1107 |
+
print(f"Performed the NN prediction in {toc - tic:0.1f} seconds")
|
1108 |
+
|
1109 |
+
predict_df = post_predict_clean(predict_df=predict_df_all, orig_search_df=search_df_cleaned,
|
1110 |
+
ref_address_cols=ref_address_cols, search_df_key_field=search_df_key_field)
|
1111 |
+
|
1112 |
+
# Score-based matching between neural net predictions and fuzzy match results
|
1113 |
+
|
1114 |
+
# Example of recordlinkage package in use: https://towardsdatascience.com/how-to-perform-fuzzy-dataframe-row-matching-with-recordlinkage-b53ca0cb944c
|
1115 |
+
|
1116 |
+
## Run with Postcode as blocker column
|
1117 |
+
|
1118 |
+
blocker_column = ["Postcode"]
|
1119 |
+
|
1120 |
+
scoresSBM_best_pc, matched_output_SBM_pc = score_based_match(predict_df_search = predict_df.copy(), ref_search = ref_df_after_stand.copy(),
|
1121 |
+
orig_search_df = search_df_after_stand, matching_variables = matching_variables,
|
1122 |
+
text_columns = text_columns, blocker_column = blocker_column, weights = weights, fuzzy_method = fuzzy_method, score_cut_off = score_cut_off, search_df_key_field=search_df_key_field, standardise=standardise, new_join_col=new_join_col)
|
1123 |
+
|
1124 |
+
if matched_output_SBM_pc.empty:
|
1125 |
+
error_message = "Match results empty. Exiting neural net match."
|
1126 |
+
print(error_message)
|
1127 |
+
|
1128 |
+
return pd.DataFrame(),pd.DataFrame(), error_message, predict_df
|
1129 |
+
|
1130 |
+
else:
|
1131 |
+
matched_output_SBM_pc["match_method"] = "Neural net - Postcode"
|
1132 |
+
|
1133 |
+
match_results_output_final_pc = combine_std_df_remove_dups(match_results, matched_output_SBM_pc, orig_addr_col = search_df_key_field)
|
1134 |
+
|
1135 |
+
summary_pc = create_match_summary(match_results_output_final_pc, df_name = "NNet blocked by Postcode " + df_name)
|
1136 |
+
print(summary_pc)
|
1137 |
+
|
1138 |
+
## Run with Street as blocker column
|
1139 |
+
|
1140 |
+
blocker_column = ["Street"]
|
1141 |
+
|
1142 |
+
scoresSBM_best_st, matched_output_SBM_st = score_based_match(predict_df_search = predict_df.copy(), ref_search = ref_df_after_stand.copy(),
|
1143 |
+
orig_search_df = search_df_after_stand, matching_variables = matching_variables,
|
1144 |
+
text_columns = text_columns, blocker_column = blocker_column, weights = weights, fuzzy_method = fuzzy_method, score_cut_off = score_cut_off, search_df_key_field=search_df_key_field, standardise=standardise, new_join_col=new_join_col)
|
1145 |
+
|
1146 |
+
# If no matching pairs are found in the function above it returns 0. In that case, fall back to the postcode-blocked results, which should almost always contain at least one pair.
|
1147 |
+
if isinstance(matched_output_SBM_st, int) or matched_output_SBM_st.empty:
|
1148 |
+
print("Nothing to match for street block")
|
1149 |
+
|
1150 |
+
matched_output_SBM_st = matched_output_SBM_pc
|
1151 |
+
matched_output_SBM_st["match_method"] = "Neural net - Postcode" #+ standard_label
|
1152 |
+
else: matched_output_SBM_st["match_method"] = "Neural net - Street" #+ standard_label
|
1153 |
+
|
1154 |
+
### Join together old match df with new (model) match df
|
1155 |
+
|
1156 |
+
match_results_output_final_st = combine_std_df_remove_dups(match_results_output_final_pc,matched_output_SBM_st, orig_addr_col = search_df_key_field)
|
1157 |
+
|
1158 |
+
summary_street = create_match_summary(match_results_output_final_st, df_name = "NNet blocked by Street " + df_name)
|
1159 |
+
print(summary_street)
|
1160 |
+
|
1161 |
+
# I decided in the end not to use PaoStartNumber as a blocker column. I get only a couple more matches in general for a big increase in processing time
|
1162 |
+
|
1163 |
+
matched_output_SBM_po = matched_output_SBM_st
|
1164 |
+
matched_output_SBM_po["match_method"] = "Neural net - Street" #+ standard_label
|
1165 |
+
|
1166 |
+
match_results_output_final_po = match_results_output_final_st
|
1167 |
+
match_results_output_final_three = match_results_output_final_po
|
1168 |
+
|
1169 |
+
summary_three = create_match_summary(match_results_output_final_three, df_name = "fuzzy and nn model street + postcode " + df_name)
|
1170 |
+
|
1171 |
+
### Join UPRN back onto orig df
|
1172 |
+
|
1173 |
+
if type(search_df) != str:
|
1174 |
+
results_on_orig_df = join_to_orig_df(match_results_output_final_three, search_df_after_stand, search_df_key_field, new_join_col)
|
1175 |
+
else: results_on_orig_df = match_results_output_final_three
|
1176 |
+
|
1177 |
+
return match_results_output_final_three, results_on_orig_df, summary_three, predict_df
|
1178 |
+
|
1179 |
+
|
1180 |
+
# Combiner/summary functions
|
1181 |
+
def combine_std_df_remove_dups(df_not_std, df_std, orig_addr_col = "search_orig_address", match_address_series = "full_match", keep_only_duplicated = False):
|
1182 |
+
|
1183 |
+
if (df_not_std.empty) & (df_std.empty):
|
1184 |
+
return df_not_std
|
1185 |
+
|
1186 |
+
combined_std_not_matches = pd.concat([df_not_std, df_std])#, ignore_index=True)
|
1187 |
+
|
1188 |
+
if combined_std_not_matches.empty: #| ~(match_address_series in combined_std_not_matches.columns) | ~(orig_addr_col in combined_std_not_matches.columns):
|
1189 |
+
combined_std_not_matches[match_address_series] = False
|
1190 |
+
|
1191 |
+
if "full_address" in combined_std_not_matches.columns:
|
1192 |
+
combined_std_not_matches[orig_addr_col] = combined_std_not_matches["full_address"]
|
1193 |
+
combined_std_not_matches["fuzzy_score"] = 0
|
1194 |
+
return combined_std_not_matches
|
1195 |
+
|
1196 |
+
combined_std_not_matches = combined_std_not_matches.sort_values([orig_addr_col, match_address_series], ascending=False)
|
1197 |
+
|
1198 |
+
if keep_only_duplicated == True:
|
1199 |
+
combined_std_not_matches = combined_std_not_matches[combined_std_not_matches.duplicated(orig_addr_col)]
|
1200 |
+
|
1201 |
+
combined_std_not_matches_no_dups = combined_std_not_matches.drop_duplicates(orig_addr_col).sort_index()
|
1202 |
+
|
1203 |
+
return combined_std_not_matches_no_dups
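
# Illustrative sketch (not part of the original commit): when the same search address appears in
# both input frames, combine_std_df_remove_dups keeps the row where 'full_match' is True because
# rows are sorted descending before duplicates are dropped. The address value below is made up.
def _example_combine_std_df_remove_dups():
    first_pass = pd.DataFrame({"search_orig_address": ["1 HIGH STREET"], "full_match": [False]})
    second_pass = pd.DataFrame({"search_orig_address": ["1 HIGH STREET"], "full_match": [True]})
    combined = combine_std_df_remove_dups(first_pass, second_pass)
    print(combined)  # expect a single row for "1 HIGH STREET" with full_match == True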
|
1204 |
+
|
1205 |
+
def combine_two_matches(OrigMatchClass, NewMatchClass, df_name):
|
1206 |
+
|
1207 |
+
today_rev = datetime.now().strftime("%Y%m%d")
|
1208 |
+
|
1209 |
+
NewMatchClass.match_results_output = combine_std_df_remove_dups(OrigMatchClass.match_results_output, NewMatchClass.match_results_output, orig_addr_col = NewMatchClass.search_df_key_field)
|
1210 |
+
|
1211 |
+
NewMatchClass.results_on_orig_df = combine_std_df_remove_dups(OrigMatchClass.pre_filter_search_df, NewMatchClass.results_on_orig_df, orig_addr_col = NewMatchClass.search_df_key_field, match_address_series = 'Matched with reference address')
|
1212 |
+
|
1213 |
+
|
1214 |
+
# Filter out search results where a match was found
|
1215 |
+
NewMatchClass.pre_filter_search_df = NewMatchClass.results_on_orig_df
|
1216 |
+
|
1217 |
+
found_index = NewMatchClass.results_on_orig_df.loc[NewMatchClass.results_on_orig_df["Matched with reference address"] == True, NewMatchClass.search_df_key_field].astype(int)
|
1218 |
+
#print(found_index)[NewMatchClass.search_df_key_field]
|
1219 |
+
|
1220 |
+
key_field_values = NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].astype(int) # Assuming list conversion is suitable
|
1221 |
+
rows_to_drop = key_field_values[key_field_values.isin(found_index)].tolist()
|
1222 |
+
NewMatchClass.search_df_not_matched = NewMatchClass.search_df_not_matched.loc[~NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].isin(rows_to_drop),:]#.drop(rows_to_drop, axis = 0)
|
1223 |
+
|
1224 |
+
# Filter out rows from NewMatchClass.search_df_cleaned
|
1225 |
+
|
1226 |
+
filtered_rows_to_keep = NewMatchClass.search_df_cleaned[NewMatchClass.search_df_key_field].astype(int).isin(NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].astype(int)).to_list()
|
1227 |
+
|
1228 |
+
NewMatchClass.search_df_cleaned = NewMatchClass.search_df_cleaned.loc[filtered_rows_to_keep,:]#.drop(rows_to_drop, axis = 0)
|
1229 |
+
NewMatchClass.search_df_after_stand = NewMatchClass.search_df_after_stand.loc[filtered_rows_to_keep,:]#.drop(rows_to_drop)
|
1230 |
+
NewMatchClass.search_df_after_full_stand = NewMatchClass.search_df_after_full_stand.loc[filtered_rows_to_keep,:]#.drop(rows_to_drop)
|
1231 |
+
|
1232 |
+
### Create lookup lists
|
1233 |
+
NewMatchClass.search_df_after_stand_series = NewMatchClass.search_df_after_stand.copy().set_index('postcode_search')['search_address_stand'].str.lower().str.strip()
|
1234 |
+
NewMatchClass.search_df_after_stand_series_full_stand = NewMatchClass.search_df_after_full_stand.copy().set_index('postcode_search')['search_address_stand'].str.lower().str.strip()
|
1235 |
+
|
1236 |
+
|
1237 |
+
match_results_output_match_score_is_0 = NewMatchClass.match_results_output[NewMatchClass.match_results_output['fuzzy_score']==0.0][["index", "fuzzy_score"]].drop_duplicates(subset='index')
|
1238 |
+
match_results_output_match_score_is_0["index"] = match_results_output_match_score_is_0["index"].astype(str)
|
1239 |
+
#NewMatchClass.results_on_orig_df["index"] = NewMatchClass.results_on_orig_df["index"].astype(str)
|
1240 |
+
NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.merge(match_results_output_match_score_is_0, on = "index", how = "left")
|
1241 |
+
|
1242 |
+
NewMatchClass.results_on_orig_df.loc[NewMatchClass.results_on_orig_df["fuzzy_score"] == 0.0, "Excluded from search"] = "Match score is 0"
|
1243 |
+
NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.drop("fuzzy_score", axis = 1)
|
1244 |
+
|
1245 |
+
# Drop any duplicates, prioritise any matches
|
1246 |
+
NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
|
1247 |
+
|
1248 |
+
NewMatchClass.output_summary = create_match_summary(NewMatchClass.match_results_output, df_name = df_name)
|
1249 |
+
print(NewMatchClass.output_summary)
|
1250 |
+
|
1251 |
+
|
1252 |
+
NewMatchClass.search_df_not_matched = filter_not_matched(NewMatchClass.match_results_output, NewMatchClass.search_df, NewMatchClass.search_df_key_field)
|
1253 |
+
|
1254 |
+
### Rejoin the excluded matches onto the output file
|
1255 |
+
#NewMatchClass.results_on_orig_df = pd.concat([NewMatchClass.results_on_orig_df, NewMatchClass.excluded_df])
|
1256 |
+
|
1257 |
+
NewMatchClass.match_outputs_name = "match_results_output_std_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
|
1258 |
+
NewMatchClass.results_orig_df_name = "results_on_orig_df_std_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
|
1259 |
+
|
1260 |
+
# Only keep essential columns
|
1261 |
+
essential_results_cols = [NewMatchClass.search_df_key_field, "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
|
1262 |
+
essential_results_cols.extend(NewMatchClass.new_join_col)
|
1263 |
+
|
1264 |
+
NewMatchClass.match_results_output.to_csv(NewMatchClass.match_outputs_name, index = None)
|
1265 |
+
NewMatchClass.results_on_orig_df[essential_results_cols].to_csv(NewMatchClass.results_orig_df_name, index = None)
|
1266 |
+
|
1267 |
+
return NewMatchClass
|
1268 |
+
|
1269 |
+
def create_match_summary(match_results_output:PandasDataFrame, df_name:str):
    ''' Create a summary paragraph describing the proportion of records matched '''

    # Check that match_results_output is a dataframe with a 'full_match' column and at least one row
    if not isinstance(match_results_output, pd.DataFrame) or ('full_match' not in match_results_output.columns) or (len(match_results_output) == 0):
        print("Nothing in match_results_output")
        return "For the " + df_name + " dataset, there were no records available to match."

    full_match_count = match_results_output['full_match'][match_results_output['full_match'] == True].count()
    match_fail_count = match_results_output['full_match'][match_results_output['full_match'] == False].count()
    records_attempted = int(sum((match_results_output['fuzzy_score']!=0.0) & ~(match_results_output['fuzzy_score'].isna())))
    dataset_length = len(match_results_output["full_match"])
    records_not_attempted = int(dataset_length - records_attempted)
    match_rate = str(round((full_match_count / dataset_length) * 100,1))
    match_fail_count_without_excluded = match_fail_count - records_not_attempted
    match_fail_rate = str(round(((match_fail_count_without_excluded) / dataset_length) * 100,1))
    not_attempted_rate = str(round((records_not_attempted / dataset_length) * 100,1))

    summary = ("For the " + df_name + " dataset (" + str(dataset_length) + " records), the fuzzy matching algorithm successfully matched " + str(full_match_count) +
               " records (" + match_rate + "%). The algorithm could not attempt to match " + str(records_not_attempted) +
               " records (" + not_attempted_rate + "%). There are " + str(match_fail_count_without_excluded) + " records left to potentially match.")

    return summary
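
# Illustrative usage sketch (not part of the original commit) with made-up values, showing the
# shape of input that create_match_summary expects: a 'full_match' flag and a 'fuzzy_score' per record.
def _example_create_match_summary():
    example_results = pd.DataFrame({
        "full_match": [True, False, False],
        "fuzzy_score": [95.0, 40.0, 0.0],
    })
    print(create_match_summary(example_results, df_name="example"))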
|
tools/model_predict.py
ADDED
@@ -0,0 +1,318 @@
#import tensorflow as tf # Tensorflow use deprecated
|
2 |
+
import torch
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
from typing import Type, Dict, List, Tuple
|
6 |
+
from datetime import datetime
|
7 |
+
|
8 |
+
PandasDataFrame = Type[pd.DataFrame]
|
9 |
+
PandasSeries = Type[pd.Series]
|
10 |
+
MatchedResults = Dict[str,Tuple[str,int]]
|
11 |
+
array = List[str]
|
12 |
+
|
13 |
+
today = datetime.now().strftime("%d%m%Y")
|
14 |
+
today_rev = datetime.now().strftime("%Y%m%d")
|
15 |
+
|
16 |
+
# # Neural net functions
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
def vocab_lookup(characters: str, vocab) -> Tuple[int, np.ndarray]:
    """
    Adapted from the addressnet package by Jason Rigby

    Converts a string into a list of vocab indices
    :param characters: the string to convert
    :param vocab: the list of characters used as the lookup vocabulary
    :return: the string length and an array of vocab indices
    """
|
31 |
+
result = list()
|
32 |
+
for c in characters.lower():
|
33 |
+
try:
|
34 |
+
result.append(vocab.index(c) + 1)
|
35 |
+
except ValueError:
|
36 |
+
result.append(0)
|
37 |
+
return len(characters), np.array(result, dtype=np.int64)
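
# Illustrative sketch (not part of the original commit): vocab_lookup returns the string length
# plus one index per character, with 0 for characters missing from the vocab. The small demo
# vocabulary below is an assumption for demonstration only.
def _example_vocab_lookup():
    demo_vocab = list("abcdefghijklmnopqrstuvwxyz0123456789 ")
    length, indices = vocab_lookup("Flat 1a", demo_vocab)
    print(length)    # 7
    print(indices)   # lowercased lookup: 'f' -> 6, 'l' -> 12, 'a' -> 1, 't' -> 20, ' ' -> 37, '1' -> 28, 'a' -> 1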
|
38 |
+
|
39 |
+
|
40 |
+
# ## Neural net predictor functions
|
41 |
+
|
42 |
+
def text_to_model_input_local(in_text, vocab, model_type = "estimator"):
|
43 |
+
addresses_out = []
|
44 |
+
model_input_out = []
|
45 |
+
encoded_text = []
|
46 |
+
|
47 |
+
# Calculate longest string length
|
48 |
+
import heapq
|
49 |
+
|
50 |
+
# get the index of the largest element in the list
|
51 |
+
index = heapq.nlargest(1, range(len(in_text)), key=lambda x: len(in_text[x]))[0]
|
52 |
+
|
53 |
+
# use the index to get the corresponding string
|
54 |
+
longest_string = len(in_text[index])
|
55 |
+
|
56 |
+
#print("Longest string is: " + str(longest_string))
|
57 |
+
|
58 |
+
for x in range(0, len(in_text)):
|
59 |
+
|
60 |
+
out = vocab_lookup(in_text[x], vocab)
|
61 |
+
addresses_out.append(out)
|
62 |
+
|
63 |
+
#print(out)
|
64 |
+
|
65 |
+
# Tensorflow model use deprecated
|
66 |
+
# if model_type == "estimator":
|
67 |
+
# model_input_add= tf.train.Example(features=tf.train.Features(feature={
|
68 |
+
# 'lengths': tf.train.Feature(int64_list=tf.train.Int64List(value=[out[0]])),
|
69 |
+
# 'encoded_text': tf.train.Feature(int64_list=tf.train.Int64List(value=out[1].tolist()))
|
70 |
+
# })).SerializeToString()
|
71 |
+
|
72 |
+
# model_input_out.append(model_input_add)
|
73 |
+
|
74 |
+
if model_type == "keras":
|
75 |
+
encoded_text.append(out[1])
|
76 |
+
|
77 |
+
# Tensorflow model use deprecated
|
78 |
+
# if model_type == "keras":
|
79 |
+
# # Pad out the strings so they're all the same length. 69 seems to be the value for spaces
|
80 |
+
# model_input_out = tf.keras.utils.pad_sequences(encoded_text, maxlen=longest_string, padding="post", truncating="post", value=0)#69)
|
81 |
+
|
82 |
+
|
83 |
+
return addresses_out, model_input_out
|
84 |
+
|
85 |
+
|
86 |
+
def reformat_predictions_local(predict_out):
|
87 |
+
|
88 |
+
predictions_list_reformat = []
|
89 |
+
|
90 |
+
for x in range(0,len(predict_out['pred_output_classes'])):
|
91 |
+
|
92 |
+
new_entry = {'class_ids': predict_out['pred_output_classes'][x], 'probabilities': predict_out['probabilities'][x]}
|
93 |
+
predictions_list_reformat.append(new_entry)
|
94 |
+
|
95 |
+
return predictions_list_reformat
|
96 |
+
|
97 |
+
|
98 |
+
def predict_serve_conv_local(in_text:List[str], labels_list, predictions) -> List[Dict[str, str]]:
|
99 |
+
|
100 |
+
class_names = [l.replace("_code", "") for l in labels_list]
|
101 |
+
class_names = [l.replace("_abbreviation", "") for l in class_names]
|
102 |
+
|
103 |
+
#print(input_text)
|
104 |
+
|
105 |
+
#print(list(zip(input_text, predictions)))
|
106 |
+
|
107 |
+
for addr, res in zip(in_text, predictions):
|
108 |
+
|
109 |
+
#print(zip(input_text, predictions))
|
110 |
+
|
111 |
+
mappings = dict()
|
112 |
+
|
113 |
+
|
114 |
+
#print(addr.upper())
|
115 |
+
#print(res['class_ids'])
|
116 |
+
|
117 |
+
for char, class_id in zip(addr.upper(), res['class_ids']):
|
118 |
+
#print(char)
|
119 |
+
if class_id == 0:
|
120 |
+
continue
|
121 |
+
cls = class_names[class_id - 1]
|
122 |
+
mappings[cls] = mappings.get(cls, "") + char
|
123 |
+
|
124 |
+
|
125 |
+
#print(mappings)
|
126 |
+
yield mappings
|
127 |
+
#return mappings
|
128 |
+
|
129 |
+
|
130 |
+
def prep_predict_export(prediction_outputs, in_text):
|
131 |
+
|
132 |
+
out_list = list(prediction_outputs)
|
133 |
+
|
134 |
+
df_out = pd.DataFrame(out_list)
|
135 |
+
|
136 |
+
#print(in_text)
|
137 |
+
#print(df_out)
|
138 |
+
|
139 |
+
df_out["address"] = in_text
|
140 |
+
|
141 |
+
return out_list, df_out
|
142 |
+
|
143 |
+
|
144 |
+
|
145 |
+
def full_predict_func(list_to_predict, model, vocab, labels_list):
|
146 |
+
|
147 |
+
if hasattr(model, "summary"): # Indicates this is a keras model rather than an estimator
|
148 |
+
model_type = "keras"
|
149 |
+
else: model_type = "estimator"
|
150 |
+
|
151 |
+
list_to_predict = [x.upper() for x in list_to_predict]
|
152 |
+
|
153 |
+
addresses_out, model_input = text_to_model_input_local(list_to_predict, vocab, model_type)
|
154 |
+
|
155 |
+
if hasattr(model, "summary"):
|
156 |
+
probs = model.predict(model_input, use_multiprocessing=True)
|
157 |
+
|
158 |
+
classes = probs.argmax(axis=-1)
|
159 |
+
|
160 |
+
predictions = {'pred_output_classes':classes, 'probabilities':probs}
|
161 |
+
|
162 |
+
else:
|
163 |
+
print("Tensorflow use deprecated")
|
164 |
+
#predictions = model.signatures["predict_output"](predictor_inputs=tf.constant(model_input)) # This was for when using the contrib module
|
165 |
+
#predictions = model.signatures["serving_default"](predictor_inputs=tf.constant(model_input))
|
166 |
+
|
167 |
+
predictions_list_reformat = reformat_predictions_local(predictions)
|
168 |
+
|
169 |
+
|
170 |
+
#### Final output as list or dataframe
|
171 |
+
|
172 |
+
output = predict_serve_conv_local(list(list_to_predict), labels_list, predictions_list_reformat)
|
173 |
+
|
174 |
+
list_out, predict_df = prep_predict_export(output, list_to_predict)
|
175 |
+
|
176 |
+
# Add organisation as a column if it doesn't already exist
|
177 |
+
if 'Organisation' not in predict_df.columns:
|
178 |
+
predict_df['Organisation'] = ""
|
179 |
+
|
180 |
+
return list_out, predict_df
|
181 |
+
|
182 |
+
# -
|
183 |
+
|
184 |
+
def predict_torch(model, model_type, input_text, word_to_index, device):
|
185 |
+
#print(device)
|
186 |
+
|
187 |
+
# Convert input_text to tensor of character indices
|
188 |
+
indexed_texts = [[word_to_index.get(char, word_to_index['<UNK>']) for char in text] for text in input_text]
|
189 |
+
|
190 |
+
# Calculate max_len based on indexed_texts
|
191 |
+
max_len = max(len(text) for text in indexed_texts)
|
192 |
+
|
193 |
+
# Pad sequences and convert to tensor
|
194 |
+
padded_texts = torch.tensor([text + [word_to_index['<pad>']] * (max_len - len(text)) for text in indexed_texts])
|
195 |
+
|
196 |
+
with torch.no_grad():
|
197 |
+
texts = padded_texts.to(device)
|
198 |
+
|
199 |
+
if (model_type == "lstm") | (model_type == "gru"):
|
200 |
+
text_lengths = texts.ne(word_to_index['<pad>']).sum(dim=1)
|
201 |
+
predictions = model(texts, text_lengths)
|
202 |
+
|
203 |
+
if model_type == "transformer":
|
204 |
+
# Call model with texts and pad_idx
|
205 |
+
predictions = model(texts, word_to_index['<pad>'])
|
206 |
+
|
207 |
+
# Convert predictions to most likely category indices
|
208 |
+
_, predicted_indices = predictions.max(2)
|
209 |
+
return predicted_indices
|
210 |
+
|
211 |
+
|
212 |
+
def torch_predictions_to_dicts(input_text, predicted_indices, index_to_category):
|
213 |
+
results = []
|
214 |
+
for i, text in enumerate(input_text):
|
215 |
+
# Treat each character in the input text as a "token"
|
216 |
+
tokens = list(text) # Convert string to a list of characters
|
217 |
+
|
218 |
+
# Create a dictionary for the current text
|
219 |
+
curr_dict = {}
|
220 |
+
|
221 |
+
# Iterate over the predicted categories and the tokens together
|
222 |
+
for category_index, token in zip(predicted_indices[i], tokens):
|
223 |
+
# Convert the category index to its name
|
224 |
+
category_name = index_to_category[category_index.item()]
|
225 |
+
|
226 |
+
# Append the token to the category in the dictionary (or create the category if it doesn't exist)
|
227 |
+
if category_name in curr_dict:
|
228 |
+
curr_dict[category_name] += token # No space needed between characters
|
229 |
+
else:
|
230 |
+
curr_dict[category_name] = token
|
231 |
+
|
232 |
+
results.append(curr_dict)
|
233 |
+
|
234 |
+
return results
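
# Illustrative sketch (not part of the original commit): each character of the input gets a
# predicted class id, and torch_predictions_to_dicts regroups the characters by class. The two
# category names in the mapping below are assumptions for demonstration.
def _example_torch_predictions_to_dicts():
    index_to_category = {0: "PaoStartNumber", 1: "Street"}
    predicted = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])  # one class id per character
    print(torch_predictions_to_dicts(["1 HIGH ROAD"], predicted, index_to_category))
    # expected: [{'PaoStartNumber': '1', 'Street': ' HIGH ROAD'}]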
|
235 |
+
|
236 |
+
|
237 |
+
def torch_prep_predict_export(prediction_outputs, in_text):
|
238 |
+
|
239 |
+
#out_list = list(prediction_outputs)
|
240 |
+
|
241 |
+
df_out = pd.DataFrame(prediction_outputs).drop("IGNORE", axis = 1)
|
242 |
+
|
243 |
+
#print(in_text)
|
244 |
+
#print(df_out)
|
245 |
+
|
246 |
+
df_out["address"] = in_text
|
247 |
+
|
248 |
+
return df_out
|
249 |
+
|
250 |
+
|
251 |
+
def full_predict_torch(model, model_type, input_text, word_to_index, cat_to_idx, device):
|
252 |
+
|
253 |
+
input_text = [x.upper() for x in input_text]
|
254 |
+
|
255 |
+
predicted_indices = predict_torch(model, model_type, input_text, word_to_index, device)
|
256 |
+
|
257 |
+
index_to_category = {v: k for k, v in cat_to_idx.items()}
|
258 |
+
|
259 |
+
results_dict = torch_predictions_to_dicts(input_text, predicted_indices, index_to_category)
|
260 |
+
|
261 |
+
df_out = torch_prep_predict_export(results_dict, input_text)
|
262 |
+
|
263 |
+
return results_dict, df_out
|
264 |
+
|
265 |
+
|
266 |
+
def post_predict_clean(predict_df, orig_search_df, ref_address_cols, search_df_key_field):
|
267 |
+
|
268 |
+
|
269 |
+
# Add address to ref_address_cols
|
270 |
+
ref_address_cols_add = ref_address_cols.copy()
|
271 |
+
ref_address_cols_add.extend(['address'])
|
272 |
+
|
273 |
+
# Create column if it doesn't exist
|
274 |
+
for x in ref_address_cols:
|
275 |
+
|
276 |
+
predict_df[x] = predict_df.get(x, np.nan)
|
277 |
+
|
278 |
+
predict_df = predict_df[ref_address_cols_add]
|
279 |
+
|
280 |
+
#Columns that are in the ref and model, but are not matched in this instance, need to be filled in with blanks
|
281 |
+
|
282 |
+
predict_cols_match = list(predict_df.drop(["address"],axis=1).columns)
|
283 |
+
predict_cols_match_uprn = predict_cols_match.copy()
|
284 |
+
predict_cols_match_uprn.append("UPRN")
|
285 |
+
|
286 |
+
pred_output_missing_cols = list(set(ref_address_cols) - set(predict_cols_match))
|
287 |
+
predict_df[pred_output_missing_cols] = np.nan
|
288 |
+
predict_df = predict_df.fillna("").infer_objects(copy=False)
|
289 |
+
|
290 |
+
#Convert all columns to string
|
291 |
+
|
292 |
+
all_columns = list(predict_df) # Creates list of all column headers
|
293 |
+
predict_df[all_columns] = predict_df[all_columns].astype(str)
|
294 |
+
|
295 |
+
predict_df = predict_df.replace("\.0","",regex=True)
|
296 |
+
|
297 |
+
#When comparing with ref, the postcode existing in the data will be used to compare rather than the postcode predicted by the model. This is to minimise errors in matching
|
298 |
+
|
299 |
+
predict_df = predict_df.rename(columns={"Postcode":"Postcode_predict"})
|
300 |
+
|
301 |
+
#orig_search_df.to_csv("orig_search_df_pre_predict.csv")
|
302 |
+
|
303 |
+
orig_search_df_pc = orig_search_df[[search_df_key_field, "postcode"]].rename(columns={"postcode":"Postcode"}).reset_index(drop=True)
|
304 |
+
predict_df = predict_df.merge(orig_search_df_pc, left_index=True, right_index=True, how = "left")
|
305 |
+
|
306 |
+
#predict_df = pd.concat([predict_df, orig_search_df_pc], axis = 1)
|
307 |
+
|
308 |
+
#predict_df[search_df_key_field] = orig_search_df[search_df_key_field]
|
309 |
+
|
310 |
+
#predict_df = predict_df.drop("index", axis=1)
|
311 |
+
|
312 |
+
#predict_df['index'] = predict_df.index
|
313 |
+
predict_df[search_df_key_field] = predict_df[search_df_key_field].astype(str)
|
314 |
+
|
315 |
+
#predict_df.to_csv("predict_end_of_clean.csv")
|
316 |
+
|
317 |
+
return predict_df
|
318 |
+
|
tools/preparation.py
ADDED
@@ -0,0 +1,456 @@
import pandas as pd
|
2 |
+
from typing import Type, Dict, List, Tuple
|
3 |
+
from datetime import datetime
|
4 |
+
#import polars as pl
|
5 |
+
import re
|
6 |
+
|
7 |
+
PandasDataFrame = Type[pd.DataFrame]
|
8 |
+
PandasSeries = Type[pd.Series]
|
9 |
+
MatchedResults = Dict[str,Tuple[str,int]]
|
10 |
+
array = List[str]
|
11 |
+
|
12 |
+
today = datetime.now().strftime("%d%m%Y")
|
13 |
+
today_rev = datetime.now().strftime("%Y%m%d")
|
14 |
+
|
15 |
+
|
16 |
+
def prepare_search_address_string(
|
17 |
+
search_str: str
|
18 |
+
) -> Tuple[pd.DataFrame, str, List[str], List[str]]:
|
19 |
+
"""Extracts address and postcode from search_str into new DataFrame"""
|
20 |
+
|
21 |
+
# Validate input
|
22 |
+
if not isinstance(search_str, str):
|
23 |
+
raise TypeError("search_str must be a string")
|
24 |
+
|
25 |
+
search_df = pd.DataFrame(data={"full_address":[search_str]})
|
26 |
+
|
27 |
+
#print(search_df)
|
28 |
+
|
29 |
+
# Extract postcode
|
30 |
+
postcode_series = extract_postcode(search_df, "full_address").dropna(axis=1)[0]
|
31 |
+
|
32 |
+
# Remove postcode from address
|
33 |
+
address_series = remove_postcode(search_df, "full_address")
|
34 |
+
|
35 |
+
# Construct output DataFrame
|
36 |
+
search_df_out = pd.DataFrame()
|
37 |
+
search_df_out["full_address"] = address_series
|
38 |
+
search_df_out["postcode"] = postcode_series
|
39 |
+
|
40 |
+
# Set key field for joining
|
41 |
+
key_field = "index"
|
42 |
+
|
43 |
+
# Reset index to use as key field
|
44 |
+
search_df_out = search_df_out.reset_index()
|
45 |
+
|
46 |
+
# Define column names to return
|
47 |
+
address_cols = ["full_address"]
|
48 |
+
postcode_col = ["postcode"]
|
49 |
+
|
50 |
+
return search_df_out, key_field, address_cols, postcode_col
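
# Illustrative usage sketch (not part of the original commit); the sample address is made up.
def _example_prepare_search_address_string():
    search_df_out, key_field, address_cols, postcode_col = prepare_search_address_string(
        "1 Ash Park Road SE54 3HB")
    # search_df_out should contain 'index', 'full_address' (postcode stripped) and 'postcode' columns
    print(search_df_out)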
|
51 |
+
|
52 |
+
# def prepare_search_address(
|
53 |
+
# search_df: pd.DataFrame,
|
54 |
+
# address_cols: list,
|
55 |
+
# postcode_col: list,
|
56 |
+
# key_col: str
|
57 |
+
# ) -> Tuple[pd.DataFrame, str]:
|
58 |
+
|
59 |
+
# # Validate inputs
|
60 |
+
# if not isinstance(search_df, pd.DataFrame):
|
61 |
+
# raise TypeError("search_df must be a Pandas DataFrame")
|
62 |
+
|
63 |
+
# if not isinstance(address_cols, list):
|
64 |
+
# raise TypeError("address_cols must be a list")
|
65 |
+
|
66 |
+
# if not isinstance(postcode_col, list):
|
67 |
+
# raise TypeError("postcode_col must be a list")
|
68 |
+
|
69 |
+
# if not isinstance(key_col, str):
|
70 |
+
# raise TypeError("key_col must be a string")
|
71 |
+
|
72 |
+
# # Clean address columns
|
73 |
+
# clean_addresses = _clean_columns(search_df, address_cols)
|
74 |
+
|
75 |
+
# # Join address columns into one
|
76 |
+
# full_addresses = _join_address(clean_addresses, address_cols)
|
77 |
+
|
78 |
+
# # Add postcode column
|
79 |
+
# full_df = _add_postcode_column(full_addresses, postcode_col)
|
80 |
+
|
81 |
+
# # Remove postcode from main address if there was only one column in the input
|
82 |
+
# if postcode_col == "full_address_postcode":
|
83 |
+
# # Remove postcode from address
|
84 |
+
# address_series = remove_postcode(search_df, "full_address")
|
85 |
+
# search_df["full_address"] == address_series
|
86 |
+
|
87 |
+
# # Ensure index column
|
88 |
+
# final_df = _ensure_index(full_df, key_col)
|
89 |
+
|
90 |
+
# #print(final_df)
|
91 |
+
|
92 |
+
|
93 |
+
# return final_df, key_col
|
94 |
+
|
95 |
+
def prepare_search_address(
|
96 |
+
search_df: pd.DataFrame,
|
97 |
+
address_cols: list,
|
98 |
+
postcode_col: list,
|
99 |
+
key_col: str
|
100 |
+
) -> Tuple[pd.DataFrame, str]:
|
101 |
+
|
102 |
+
# Validate inputs
|
103 |
+
if not isinstance(search_df, pd.DataFrame):
|
104 |
+
raise TypeError("search_df must be a Pandas DataFrame")
|
105 |
+
|
106 |
+
if not isinstance(address_cols, list):
|
107 |
+
raise TypeError("address_cols must be a list")
|
108 |
+
|
109 |
+
if not isinstance(postcode_col, list):
|
110 |
+
raise TypeError("postcode_col must be a list")
|
111 |
+
|
112 |
+
if not isinstance(key_col, str):
|
113 |
+
raise TypeError("key_col must be a string")
|
114 |
+
|
115 |
+
# Clean address columns
|
116 |
+
#search_df_polars = pl.from_dataframe(search_df)
|
117 |
+
clean_addresses = _clean_columns(search_df, address_cols)
|
118 |
+
|
119 |
+
# Join address columns into one
|
120 |
+
full_addresses = _join_address(clean_addresses, address_cols)
|
121 |
+
|
122 |
+
# Add postcode column
|
123 |
+
full_df = _add_postcode_column(full_addresses, postcode_col)
|
124 |
+
|
125 |
+
# Remove postcode from main address if there was only one column in the input
|
126 |
+
if postcode_col == "full_address_postcode":
|
127 |
+
# Remove postcode from address
|
128 |
+
address_series = remove_postcode(search_df, "full_address")
|
129 |
+
search_df["full_address"] == address_series
|
130 |
+
|
131 |
+
# Ensure index column
|
132 |
+
final_df = _ensure_index(full_df, key_col)
|
133 |
+
|
134 |
+
#print(final_df)
|
135 |
+
|
136 |
+
|
137 |
+
return final_df
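
# Illustrative usage sketch (not part of the original commit): the column names 'addr1' and
# 'pcode' are assumptions; any address and postcode columns from the search file can be passed.
def _example_prepare_search_address():
    df = pd.DataFrame({"addr1": ["FLAT 2, 1 Ash Park Road"], "pcode": ["SE54 3HB"]})
    prepared = prepare_search_address(df, address_cols=["addr1"], postcode_col=["pcode"], key_col="index")
    # 'full_address' holds the cleaned, joined address and 'postcode' the renamed postcode column
    print(prepared[["index", "full_address", "postcode"]])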
|
138 |
+
|
139 |
+
# Helper functions
|
140 |
+
def _clean_columns(df, cols):
|
141 |
+
# Cleaning logic
|
142 |
+
def clean_col(col):
|
143 |
+
return col.astype(str).fillna("").infer_objects(copy=False).str.replace("nan","").str.replace("\s{2,}", " ", regex=True).str.replace(","," ").str.strip()
|
144 |
+
|
145 |
+
df[cols] = df[cols].apply(clean_col)
|
146 |
+
|
147 |
+
return df
|
148 |
+
|
149 |
+
# def _clean_columns(df, cols):
|
150 |
+
# # Cleaning logic
|
151 |
+
# #print(df)
|
152 |
+
|
153 |
+
# #if isinstance(df, pl.DataFrame):
|
154 |
+
# # print("It's a Polars DataFrame")
|
155 |
+
|
156 |
+
# def clean_col(col):
|
157 |
+
# col = col.str.replace("nan", "")
|
158 |
+
# col = col.apply(lambda x: re.sub(r'\s{2,}', ' ', str(x)), skip_nulls=False, return_dtype=str) # replace any spaces greater than one with one
|
159 |
+
# return col.str.replace(",", " ").str.strip() # replace commas with a space
|
160 |
+
|
161 |
+
# for col in cols:
|
162 |
+
# df = df.with_columns(clean_col(df[col]).alias(col))
|
163 |
+
|
164 |
+
# return df
|
165 |
+
|
166 |
+
|
167 |
+
def _join_address(df, cols):
|
168 |
+
# Joining logic
|
169 |
+
full_address = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
|
170 |
+
df["full_address"] = full_address.str.replace("\s{2,}", " ", regex=True).str.strip()
|
171 |
+
|
172 |
+
return df
|
173 |
+
|
174 |
+
def _add_postcode_column(df, postcodes):
|
175 |
+
# Add postcode column
|
176 |
+
if isinstance(postcodes, list):
|
177 |
+
postcodes = postcodes[0]
|
178 |
+
|
179 |
+
if postcodes != "full_address_postcode":
|
180 |
+
df = df.rename(columns={postcodes:"postcode"})
|
181 |
+
else:
|
182 |
+
#print(df["full_address_postcode"])
|
183 |
+
#print(extract_postcode(df,"full_address_postcode"))
|
184 |
+
df["full_address_postcode"] = extract_postcode(df,"full_address_postcode")[0] #
|
185 |
+
df = df.rename(columns={postcodes:"postcode"})
|
186 |
+
#print(df)
|
187 |
+
|
188 |
+
return df
|
189 |
+
|
190 |
+
def _ensure_index(df, index_col):
|
191 |
+
# Ensure index column exists
|
192 |
+
if ((index_col == "index") & ~("index" in df.columns)):
|
193 |
+
print("Resetting index in _ensure_index function")
|
194 |
+
df = df.reset_index()
|
195 |
+
|
196 |
+
df[index_col] = df[index_col].astype(str)
|
197 |
+
|
198 |
+
return df
|
199 |
+
|
200 |
+
def create_full_address(df):
|
201 |
+
|
202 |
+
df = df.fillna("").infer_objects(copy=False)
|
203 |
+
|
204 |
+
if "Organisation" not in df.columns:
|
205 |
+
df["Organisation"] = ""
|
206 |
+
|
207 |
+
df["full_address"] = df['Organisation'] + " " + df['SaoText'].str.replace(" - ", " REPL ").str.replace("- ", " REPLEFT ").str.replace(" -", " REPLRIGHT ") + " " + df["SaoStartNumber"].astype(str) + df["SaoStartSuffix"] + "-" + df["SaoEndNumber"].astype(str) + df["SaoEndSuffix"] + " " + df["PaoText"].str.replace(" - ", " REPL ").str.replace("- ", " REPLEFT ").str.replace(" -", " REPLRIGHT ") + " " + df["PaoStartNumber"].astype(str) + df["PaoStartSuffix"] + "-" + df["PaoEndNumber"].astype(str) + df["PaoEndSuffix"] + " " + df["Street"] + " " + df["PostTown"] + " " + df["Postcode"]
|
208 |
+
|
209 |
+
#.str.replace(r'(?<=[a-zA-Z])-(?![a-zA-Z])|(?<![a-zA-Z])-(?=[a-zA-Z])', ' ', regex=True)\
|
210 |
+
|
211 |
+
#.str.replace(".0","", regex=False)\
|
212 |
+
|
213 |
+
df["full_address"] = df["full_address"]\
|
214 |
+
.str.replace("-999","")\
|
215 |
+
.str.replace(" -"," ")\
|
216 |
+
.str.replace("- "," ")\
|
217 |
+
.str.replace(" REPL "," - ")\
|
218 |
+
.str.replace(" REPLEFT ","- ")\
|
219 |
+
.str.replace(" REPLRIGHT "," -")\
|
220 |
+
.str.replace("\s+"," ", regex=True)\
|
221 |
+
.str.strip()
|
222 |
+
#.str.replace(" "," ")\
|
223 |
+
|
224 |
+
return df["full_address"]
|
225 |
+
|
226 |
+
def prepare_ref_address(ref_df, ref_address_cols, new_join_col = ['UPRN'], standard_cols = True):
|
227 |
+
|
228 |
+
if ('SaoText' in ref_df.columns) | ("Secondary_Name_LPI" in ref_df.columns): standard_cols = True
|
229 |
+
else: standard_cols = False
|
230 |
+
|
231 |
+
ref_address_cols_uprn = ref_address_cols.copy()
|
232 |
+
|
233 |
+
ref_address_cols_uprn.extend(new_join_col)
|
234 |
+
ref_address_cols_uprn_w_ref = ref_address_cols_uprn.copy()
|
235 |
+
ref_address_cols_uprn_w_ref.extend(["Reference file"])
|
236 |
+
|
237 |
+
ref_df_cleaned = ref_df.copy()
|
238 |
+
|
239 |
+
# In on-prem LPI db street has been excluded, so put this back in
|
240 |
+
if ('Street' not in ref_df_cleaned.columns) & ('Address_LPI' in ref_df_cleaned.columns):
|
241 |
+
ref_df_cleaned['Street'] = ref_df_cleaned['Address_LPI'].str.replace("\\n", " ", regex = True).apply(extract_street_name)#
|
242 |
+
|
243 |
+
if ('Organisation' not in ref_df_cleaned.columns) & ('SaoText' in ref_df_cleaned.columns):
|
244 |
+
ref_df_cleaned['Organisation'] = ""
|
245 |
+
|
246 |
+
ref_df_cleaned = ref_df_cleaned[ref_address_cols_uprn_w_ref]
|
247 |
+
|
248 |
+
ref_df_cleaned = ref_df_cleaned.fillna("").infer_objects(copy=False)
|
249 |
+
|
250 |
+
all_columns = list(ref_df_cleaned) # Creates list of all column headers
|
251 |
+
ref_df_cleaned[all_columns] = ref_df_cleaned[all_columns].astype(str).fillna('').infer_objects(copy=False).replace('nan','')
|
252 |
+
|
253 |
+
ref_df_cleaned = ref_df_cleaned.replace("\.0","",regex=True)
|
254 |
+
|
255 |
+
# Create full address
|
256 |
+
|
257 |
+
all_columns = list(ref_df_cleaned) # Creates list of all column headers
|
258 |
+
ref_df_cleaned[all_columns] = ref_df_cleaned[all_columns].astype(str)
|
259 |
+
|
260 |
+
ref_df_cleaned = ref_df_cleaned.replace("nan","")
|
261 |
+
ref_df_cleaned = ref_df_cleaned.replace("\.0","",regex=True)
|
262 |
+
|
263 |
+
if standard_cols == True:
|
264 |
+
ref_df_cleaned= ref_df_cleaned[ref_address_cols_uprn_w_ref].fillna('').infer_objects(copy=False)
|
265 |
+
|
266 |
+
ref_df_cleaned["fulladdress"] = create_full_address(ref_df_cleaned[ref_address_cols_uprn_w_ref])
|
267 |
+
|
268 |
+
else:
|
269 |
+
ref_df_cleaned= ref_df_cleaned[ref_address_cols_uprn_w_ref].fillna('').infer_objects(copy=False)
|
270 |
+
|
271 |
+
full_address = ref_df_cleaned[ref_address_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
|
272 |
+
ref_df_cleaned["fulladdress"] = full_address
|
273 |
+
|
274 |
+
ref_df_cleaned["fulladdress"] = ref_df_cleaned["fulladdress"]\
|
275 |
+
.str.replace("-999","")\
|
276 |
+
.str.replace(" -"," ")\
|
277 |
+
.str.replace("- "," ")\
|
278 |
+
.str.replace(".0","", regex=False)\
|
279 |
+
.str.replace("\s{2,}", " ", regex=True)\
|
280 |
+
.str.strip()
|
281 |
+
|
282 |
+
# Create a street column if it doesn't exist by extracting street from the full address
|
283 |
+
|
284 |
+
if 'Street' not in ref_df_cleaned.columns:
|
285 |
+
ref_df_cleaned['Street'] = ref_df_cleaned["fulladdress"].apply(extract_street_name)
|
286 |
+
|
287 |
+
# Add index column
|
288 |
+
ref_df_cleaned['ref_index'] = ref_df_cleaned.index
|
289 |
+
|
290 |
+
return ref_df_cleaned
|
291 |
+
|
292 |
+
# def prepare_ref_address(ref_df:pl.DataFrame, ref_address_cols, new_join_col = ['UPRN'], standard_cols = True):
|
293 |
+
|
294 |
+
# if ('SaoText' in ref_df.columns) | ("Secondary_Name_LPI" in ref_df.columns):
|
295 |
+
# standard_cols = True
|
296 |
+
# else:
|
297 |
+
# standard_cols = False
|
298 |
+
|
299 |
+
# ref_address_cols_uprn = list(ref_address_cols) + new_join_col
|
300 |
+
# ref_df_cleaned = ref_df[ref_address_cols_uprn].fill_null("")
|
301 |
+
|
302 |
+
# # In on-prem LPI db street has been excluded, so put this back in
|
303 |
+
# if ('Street' not in ref_df_cleaned.columns) & ('Address_LPI' in ref_df_cleaned.columns):
|
304 |
+
# ref_df_cleaned = ref_df_cleaned.with_column(pl.col('Address_LPI').apply(lambda x: extract_street_name(x)).alias('Street'))
|
305 |
+
|
306 |
+
# if ('Organisation' not in ref_df_cleaned.columns) & ('SaoText' in ref_df_cleaned.columns):
|
307 |
+
# ref_df_cleaned = ref_df_cleaned.with_column(pl.lit("").alias('Organisation'))
|
308 |
+
|
309 |
+
# #ref_df_cleaned['fulladdress'] =
|
310 |
+
|
311 |
+
# if standard_cols:
|
312 |
+
# pass
|
313 |
+
# # I can not write the full address code here as it depends on your extract_street_name and create_full_address function implementations.
|
314 |
+
# # However, you might need to convert string types to object type for full address creation which may require more than just a few lines of codes.
|
315 |
+
# else:
|
316 |
+
# pass
|
317 |
+
|
318 |
+
# # I can not write the full address code here as it depends on your extract_street_name and create_full_address function implementations.
|
319 |
+
|
320 |
+
# if 'Street' not in ref_df_cleaned.columns:
|
321 |
+
# ref_df_cleaned = ref_df_cleaned.with_column(pl.col('fulladdress').apply(extract_street_name).alias("Street"))
|
322 |
+
|
323 |
+
# # Add index column
|
324 |
+
# ref_df_cleaned = ref_df_cleaned.with_column(pl.lit('').alias('ref_index'))
|
325 |
+
|
326 |
+
# return ref_df_cleaned
|
327 |
+
|
328 |
+
|
329 |
+
def extract_postcode(df, col:str) -> PandasSeries:
|
330 |
+
'''
|
331 |
+
Extract a postcode from a string column in a dataframe
|
332 |
+
'''
|
333 |
+
postcode_series = df[col].str.upper().str.extract(pat = \
|
334 |
+
"(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$)")
|
335 |
+
|
336 |
+
return postcode_series
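
# Illustrative sketch (not part of the original commit): str.extract with several capture groups
# returns a DataFrame, so the postcode for each row sits in the first column where it matched.
def _example_extract_postcode():
    df = pd.DataFrame({"full_address": ["1 Ash Park Road SE54 3HB"]})
    print(extract_postcode(df, "full_address")[0].iloc[0])  # expected: "SE54 3HB"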
|
337 |
+
|
338 |
+
|
339 |
+
# Remove addresses with no numbers in at all - too high a risk of badly assigning an address
|
340 |
+
def check_no_number_addresses(df, in_address_series) -> PandasSeries:
|
341 |
+
'''
|
342 |
+
Highlight addresses from a pandas df where there are no numbers in the address.
|
343 |
+
'''
|
344 |
+
df["in_address_series_temp"] = df[in_address_series].str.lower()
|
345 |
+
|
346 |
+
no_numbers_series = df["in_address_series_temp"].str.contains("^(?!.*\d+).*$", regex=True)
|
347 |
+
|
348 |
+
df.loc[no_numbers_series == True, 'Excluded from search'] = "Excluded - no numbers in address"
|
349 |
+
|
350 |
+
df = df.drop("in_address_series_temp", axis = 1)
|
351 |
+
|
352 |
+
#print(df[["full_address", "Excluded from search"]])
|
353 |
+
|
354 |
+
return df
|
355 |
+
|
356 |
+
|
357 |
+
def remove_postcode(df, col:str) -> PandasSeries:
|
358 |
+
'''
|
359 |
+
Remove a postcode from a string column in a dataframe
|
360 |
+
'''
|
361 |
+
address_series_no_pcode = df[col].str.upper().str.replace(\
|
362 |
+
"\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","", regex=True).str.lower()
|
363 |
+
|
364 |
+
return address_series_no_pcode
|
365 |
+
|
366 |
+
def extract_street_name(address:str) -> str:
|
367 |
+
"""
|
368 |
+
Extracts the street name from the given address.
|
369 |
+
|
370 |
+
Args:
|
371 |
+
address (str): The input address string.
|
372 |
+
|
373 |
+
Returns:
|
374 |
+
str: The extracted street name, or an empty string if no match is found.
|
375 |
+
|
376 |
+
Examples:
|
377 |
+
>>> address1 = "1 Ash Park Road SE54 3HB"
|
378 |
+
>>> extract_street_name(address1)
|
379 |
+
'Ash Park Road'
|
380 |
+
|
381 |
+
>>> address2 = "Flat 14 1 Ash Park Road SE54 3HB"
|
382 |
+
>>> extract_street_name(address2)
|
383 |
+
'Ash Park Road'
|
384 |
+
|
385 |
+
>>> address3 = "123 Main Blvd"
|
386 |
+
>>> extract_street_name(address3)
|
387 |
+
'Main Blvd'
|
388 |
+
|
389 |
+
>>> address4 = "456 Maple AvEnUe"
|
390 |
+
>>> extract_street_name(address4)
|
391 |
+
'Maple AvEnUe'
|
392 |
+
|
393 |
+
>>> address5 = "789 Oak Street"
|
394 |
+
>>> extract_street_name(address5)
|
395 |
+
'Oak Street'
|
396 |
+
"""
|
397 |
+
|
398 |
+
|
399 |
+
street_types = [
|
400 |
+
'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
|
401 |
+
'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
|
402 |
+
'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
|
403 |
+
'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
|
404 |
+
'Alley', 'Arcade','Avenue', 'Ave','Bay','Bend','Brae','Byway','Close','Corner','Cove',
|
405 |
+
'Crescent', 'Cres','Cul-de-sac','Dell','Drive', 'Dr','Esplanade','Glen','Green','Grove','Heights', 'Hts',
|
406 |
+
'Mews','Parade','Path','Piazza','Promenade','Quay','Ridge','Row','Terrace', 'Ter','Track','Trail','View','Villas',
|
407 |
+
'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
|
408 |
+
]
|
409 |
+
|
410 |
+
# Dynamically construct the regex pattern with all possible street types
|
411 |
+
street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)
|
412 |
+
|
413 |
+
# The overall regex pattern to capture the street name
|
414 |
+
pattern = rf'(?:\d+\s+|\w+\s+\d+\s+|.*\d+[a-z]+\s+|.*\d+\s+)*(?P<street_name>[\w\s]+(?:{street_types_pattern}))'
|
415 |
+
|
416 |
+
def replace_postcode(address):
|
417 |
+
pattern = r'\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$'
|
418 |
+
return re.sub(pattern, "", address)
|
419 |
+
|
420 |
+
|
421 |
+
modified_address = replace_postcode(address.upper())
|
422 |
+
#print(modified_address)
|
423 |
+
#print(address)
|
424 |
+
|
425 |
+
# Perform a case-insensitive search
|
426 |
+
match = re.search(pattern, modified_address, re.IGNORECASE)
|
427 |
+
|
428 |
+
if match:
|
429 |
+
street_name = match.group('street_name')
|
430 |
+
return street_name.strip()
|
431 |
+
else:
|
432 |
+
return ""
|
433 |
+
|
434 |
+
|
435 |
+
# Exclude non-postal addresses
|
436 |
+
|
437 |
+
def remove_non_postal(df, in_address_series):
|
438 |
+
'''
|
439 |
+
Highlight non-postal addresses from a pandas df where the address string contains specific substrings
|
440 |
+
indicating non-postal addresses like 'garage', 'parking', 'shed', etc.
|
441 |
+
'''
|
442 |
+
df["in_address_series_temp"] = df[in_address_series].str.lower()
|
443 |
+
|
444 |
+
garage_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bgarage\\b|\\bgarages\\b)", regex=True)
|
445 |
+
parking_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bparking\\b)", regex=True)
|
446 |
+
shed_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bshed\\b|\\bsheds\\b)", regex=True)
|
447 |
+
bike_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbike\\b|\\bbikes\\b)", regex=True)
|
448 |
+
bicycle_store_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbicycle store\\b|\\bbicycle store\\b)", regex=True)
|
449 |
+
|
450 |
+
non_postal_series = (garage_address_series | parking_address_series | shed_address_series | bike_address_series | bicycle_store_address_series)
|
451 |
+
|
452 |
+
df.loc[non_postal_series == True, 'Excluded from search'] = "Excluded - non-postal address"
|
453 |
+
|
454 |
+
df = df.drop("in_address_series_temp", axis = 1)
|
455 |
+
|
456 |
+
return df
|
tools/pytorch_models.py
ADDED
@@ -0,0 +1,155 @@
import math

import torch
import torch.nn as nn
import torch.nn.init as init
|
2 |
+
|
3 |
+
class TextClassifier(nn.Module):
|
4 |
+
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
|
5 |
+
dropout, pad_idx):
|
6 |
+
super(TextClassifier, self).__init__()
|
7 |
+
|
8 |
+
# Embedding layer
|
9 |
+
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
|
10 |
+
|
11 |
+
# GRU layers
|
12 |
+
self.rnn = nn.GRU(embedding_dim,
|
13 |
+
hidden_dim,
|
14 |
+
num_layers=n_layers,
|
15 |
+
bidirectional=True,
|
16 |
+
dropout=dropout,
|
17 |
+
batch_first=True)
|
18 |
+
|
19 |
+
# Fully connected layer
|
20 |
+
self.fc = nn.Linear(hidden_dim * 2, output_dim) # Multiply by 2 for bidirection
|
21 |
+
|
22 |
+
# Dropout layer
|
23 |
+
self.dropout = nn.Dropout(dropout)
|
24 |
+
|
25 |
+
def forward(self, text, text_lengths):
|
26 |
+
embedded = self.dropout(self.embedding(text))
|
27 |
+
|
28 |
+
# Pack sequence
|
29 |
+
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
|
30 |
+
packed_output, _ = self.rnn(packed_embedded)
|
31 |
+
|
32 |
+
# Unpack sequence
|
33 |
+
output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
|
34 |
+
|
35 |
+
# Pass the entire output tensor to the FC layer for token-level classification
|
36 |
+
return self.fc(output)
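
# Illustrative sketch (not part of the original commit): the hyperparameter values below are
# assumptions, chosen only to show the expected tensor shapes for token-level classification.
def _example_text_classifier():
    model = TextClassifier(vocab_size=70, embedding_dim=48, hidden_dim=128,
                           output_dim=22, n_layers=2, dropout=0.2, pad_idx=0)
    texts = torch.randint(1, 70, (4, 30))      # batch of 4 addresses, 30 characters each
    lengths = torch.full((4,), 30)             # true (unpadded) length of each sequence
    logits = model(texts, lengths)             # shape (4, 30, 22): one class score vector per character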
|
37 |
+
|
38 |
+
class LSTMTextClassifier(nn.Module):
|
39 |
+
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
|
40 |
+
dropout, pad_idx):
|
41 |
+
super(LSTMTextClassifier, self).__init__()
|
42 |
+
|
43 |
+
# Embedding layer
|
44 |
+
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
|
45 |
+
|
46 |
+
# LSTM layers
|
47 |
+
self.rnn = nn.LSTM(embedding_dim,
|
48 |
+
hidden_dim,
|
49 |
+
num_layers=n_layers,
|
50 |
+
bidirectional=True,
|
51 |
+
dropout=dropout,
|
52 |
+
batch_first=True)
|
53 |
+
|
54 |
+
# Fully connected layer
|
55 |
+
self.fc = nn.Linear(hidden_dim * 2, output_dim) # Multiply by 2 for bidirection
|
56 |
+
|
57 |
+
# Dropout layer
|
58 |
+
self.dropout = nn.Dropout(dropout)
|
59 |
+
|
60 |
+
def forward(self, text, text_lengths):
|
61 |
+
embedded = self.dropout(self.embedding(text))
|
62 |
+
|
63 |
+
# Pack sequence
|
64 |
+
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
|
65 |
+
|
66 |
+
# Note: LSTM returns both the output and a tuple of (hidden state, cell state)
|
67 |
+
packed_output, (hidden, cell) = self.rnn(packed_embedded)
|
68 |
+
|
69 |
+
# Unpack sequence
|
70 |
+
output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
|
71 |
+
|
72 |
+
# Pass the entire output tensor to the FC layer for token-level classification
|
73 |
+
return self.fc(output)
|
74 |
+
|
75 |
+
class PositionalEncoding(nn.Module):
|
76 |
+
def __init__(self, d_model, max_len=120):
|
77 |
+
super(PositionalEncoding, self).__init__()
|
78 |
+
self.d_model = d_model
|
79 |
+
|
80 |
+
def forward(self, x):
|
81 |
+
# If pe doesn't exist or its sequence length is different from x's sequence length
|
82 |
+
if not hasattr(self, 'pe') or self.pe.size(0) != x.size(1):
|
83 |
+
max_len = x.size(1)
|
84 |
+
pe = torch.zeros(max_len, self.d_model)
|
85 |
+
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
|
86 |
+
div_term = torch.exp(torch.arange(0, self.d_model, 2).float() * (-math.log(10000.0) / self.d_model))
|
87 |
+
pe[:, 0::2] = torch.sin(position * div_term)
|
88 |
+
pe[:, 1::2] = torch.cos(position * div_term)
|
89 |
+
pe = pe.unsqueeze(0)
|
90 |
+
self.register_buffer('pe', pe.to(x.device))
|
91 |
+
|
92 |
+
return x + self.pe[:, :x.size(1), :]
|
93 |
+
|
94 |
+
|
96 |
+
|
97 |
+
def weights_init_kaiming(m):
|
98 |
+
if isinstance(m, nn.Linear):
|
99 |
+
init.kaiming_uniform_(m.weight, nonlinearity='relu')
|
100 |
+
if m.bias is not None:
|
101 |
+
init.zeros_(m.bias)
|
102 |
+
elif isinstance(m, nn.Embedding):
|
103 |
+
init.kaiming_uniform_(m.weight, nonlinearity='relu')
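
# Illustrative sketch (not part of the original commit): the initialiser is written to be passed
# to nn.Module.apply, which visits every submodule in a model.
def _example_weights_init():
    layer = nn.Linear(16, 4)
    weights_init_kaiming(layer)  # direct call on a single module
    # more typically: model.apply(weights_init_kaiming) after constructing a classifier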
|
104 |
+
|
105 |
+
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, nhead, num_encoder_layers,
                 num_classes, dropout, pad_idx):
        super(TransformerClassifier, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # Positional encoding
        self.pos_encoder = PositionalEncoding(embedding_dim)

        # Transformer with dropout
        transformer_encoder = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=nhead, dropout=dropout, activation="gelu")
        self.transformer = nn.TransformerEncoder(transformer_encoder, num_layers=num_encoder_layers)

        # Classifier with dropout
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(embedding_dim, num_classes)
        )

    def create_attention_mask(self, src, pad_idx):
        return (src == pad_idx)

    def forward(self, src, pad_idx):

        # Check pad_idx
        if isinstance(pad_idx, torch.Tensor) and torch.numel(pad_idx) > 1:
            raise ValueError("Expected pad_idx to be a scalar value, but got a tensor with multiple elements.")

        # Transpose src to have shape (seq_len, batch_size)
        src = src.transpose(0, 1)

        # Embedding
        x = self.embedding(src)

        # Positional Encoding
        x = self.pos_encoder(x.to(self.device))

        # Create attention mask
        src_key_padding_mask = self.create_attention_mask(src.transpose(0, 1), pad_idx) # Transpose back to (batch_size, sequence_length)

        # Transformer
        x = self.transformer(x, src_key_padding_mask=src_key_padding_mask)

        #print(model.state_dict())
        # Classification
        return self.classifier(x)
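A usage sketch for the classifier above; the hyperparameters and batch shapes are invented for illustration and are not the values used elsewhere in this repository.

import torch
from tools.pytorch_models import TransformerClassifier

model = TransformerClassifier(vocab_size=500, embedding_dim=64, nhead=4,
                              num_encoder_layers=2, num_classes=30, dropout=0.1, pad_idx=0)

tokens = torch.randint(1, 500, (8, 25))   # (batch_size, seq_len) of token ids
logits = model(tokens, pad_idx=0)         # forward() transposes to (seq_len, batch_size) internally
print(logits.shape)                       # torch.Size([25, 8, 30]) - (seq_len, batch_size, num_classes)
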
tools/recordlinkage_funcs.py
ADDED
@@ -0,0 +1,384 @@
import pandas as pd
from typing import Type, Dict, List, Tuple
import recordlinkage
from datetime import datetime

PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str,Tuple[str,int]]
array = List[str]

today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")

from tools.constants import score_cut_off_nnet_street

# ## Recordlinkage matching functions
def compute_match(predict_df_search, ref_search, orig_search_df, matching_variables,
                  text_columns, blocker_column, weights, fuzzy_method):
    # Use the merge command to match group1 and group2
    predict_df_search[matching_variables] = predict_df_search[matching_variables].astype(str)
    ref_search[matching_variables] = ref_search[matching_variables].astype(str).replace("-999","")

    # SaoText needs to be exactly the same to get a 'full' match, so it is moved to the exact match group
    exact_columns = list(set(matching_variables) - set(text_columns))

    # Replace all blanks with a space, so they can be included in the fuzzy match searches
    for column in text_columns:
        predict_df_search.loc[predict_df_search[column] == '', column] = ' '
        ref_search.loc[ref_search[column] == '', column] = ' '

    # Score based match functions

    # Create an index of all pairs
    indexer = recordlinkage.Index()

    # Block on selected blocker column

    ## Remove all NAs from predict_df blocker column
    if blocker_column[0] == "PaoStartNumber":
        predict_df_search = predict_df_search[~(predict_df_search[blocker_column[0]].isna()) & ~(predict_df_search[blocker_column[0]] == '') & ~(predict_df_search[blocker_column[0]].str.contains(r'^\s*$', na=False))]

    indexer.block(blocker_column) #matchkey.block(["Postcode", "PaoStartNumber"])

    # Generate candidate pairs
    pairsSBM = indexer.index(predict_df_search, ref_search)

    print('Running with ' + blocker_column[0] + ' as blocker has created', len(pairsSBM), 'pairs.')

    # If no pairs are found, return an empty DataFrame
    if len(pairsSBM) == 0: return pd.DataFrame()

    # Call the compare class from the toolkit
    compareSBM = recordlinkage.Compare()

    # Assign variables to matching technique - exact
    for columns in exact_columns:
        compareSBM.exact(columns, columns, label = columns, missing_value = 0)

    # Assign variables to matching technique - fuzzy
    for columns in text_columns:
        if columns == "Postcode":
            compareSBM.string(columns, columns, label = columns, missing_value = 0, method = "levenshtein")
        else:
            compareSBM.string(columns, columns, label = columns, missing_value = 0, method = fuzzy_method)

    ## Run the match - compare each column within the blocks according to exact or fuzzy matching (defined above)
    scoresSBM = compareSBM.compute(pairs = pairsSBM, x = predict_df_search, x_link = ref_search)

    return scoresSBM

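A toy illustration (invented data) of the block-then-compare pattern that compute_match builds with the recordlinkage toolkit:

import pandas as pd
import recordlinkage

search = pd.DataFrame({"Postcode": ["se1 1aa", "se1 2bb"], "Street": ["ash road", "oak lane"]})
ref    = pd.DataFrame({"Postcode": ["se1 1aa", "se1 3cc"], "Street": ["ash rd",   "elm walk"]})

indexer = recordlinkage.Index()
indexer.block("Postcode")                     # candidate pairs must share a postcode
pairs = indexer.index(search, ref)

compare = recordlinkage.Compare()
compare.exact("Postcode", "Postcode", label="Postcode")
compare.string("Street", "Street", method="jarowinkler", label="Street")
scores = compare.compute(pairs, search, ref)  # one similarity column per comparison rule
print(scores)
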
def calc_final_nnet_scores(scoresSBM, weights, matching_variables):
    # Modify the output scores by the weights set at the start of the code
    scoresSBM_w = scoresSBM*weights

    ### Determine matched rows that score above a threshold

    # Sum all columns
    scoresSBM_r = scoresSBM_w

    scoresSBM_r['score'] = scoresSBM_r[matching_variables].sum(axis = 1)
    scoresSBM_r['score_max'] = sum(weights.values()) # maximum possible weighted score
    scoresSBM_r['score_perc'] = (scoresSBM_r['score'] / scoresSBM_r['score_max'])*100

    scoresSBM_r = scoresSBM_r.reset_index()

    # Rename the index if misnamed
    scoresSBM_r = scoresSBM_r.rename(columns={"index":"level_1"}, errors = "ignore")

    # Sort all comparisons by score in descending order
    scoresSBM_r = scoresSBM_r.sort_values(by=["level_0","score_perc"], ascending = False)

    # Within each search address, remove anything below the max
    #scoresSBM_r.to_csv("scoresSBM_r.csv")
    scoresSBM_g = scoresSBM_r.reset_index()

    # Get maximum score to join on
    scoresSBM_g = scoresSBM_g.groupby("level_0").max("score_perc").reset_index()[["level_0", "score_perc"]]
    scoresSBM_g = scoresSBM_g.rename(columns={"score_perc":"score_perc_max"})
    scoresSBM_search = scoresSBM_r.merge(scoresSBM_g, on = "level_0", how="left")

    scoresSBM_search['score_perc'] = round(scoresSBM_search['score_perc'],1).astype(float)
    scoresSBM_search['score_perc_max'] = round(scoresSBM_search['score_perc_max'],1).astype(float)

    return scoresSBM_search

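As a worked illustration of the score_perc calculation above (weights and per-column similarities invented):

weights = {"Postcode": 2, "Street": 1, "PaoStartNumber": 1}          # illustrative weights
col_scores = {"Postcode": 1.0, "Street": 0.8, "PaoStartNumber": 1.0}

score = sum(weights[c] * col_scores[c] for c in weights)             # 3.8
score_max = sum(weights.values())                                    # 4
score_perc = round(score / score_max * 100, 1)                       # 95.0
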
def join_on_pred_ref_details(scoresSBM_search_m, ref_search, predict_df_search):
    ## Join back search and ref_df address details onto matching df
    scoresSBM_search_m_j = scoresSBM_search_m.merge(ref_search, left_on="level_1", right_index=True, how = "left", suffixes=("", "_ref"))

    scoresSBM_search_m_j = scoresSBM_search_m_j.merge(predict_df_search, left_on="level_0", right_index=True, how="left", suffixes=("", "_pred"))

    scoresSBM_search_m_j = scoresSBM_search_m_j.reindex(sorted(scoresSBM_search_m_j.columns), axis=1)

    #scoresSBM_search_m_j.to_csv("scoresSBM_search_m_j.csv")

    return scoresSBM_search_m_j

def rearrange_columns(scoresSBM_search_m_j, new_join_col, search_df_key_field, blocker_column, standardise):

    start_columns = new_join_col.copy()

    start_columns.extend(["address", "fulladdress", "level_0", "level_1", "score", "score_max", "score_perc", "score_perc_max"])

    other_columns = list(set(scoresSBM_search_m_j.columns) - set(start_columns))

    all_columns_order = start_columns.copy()
    all_columns_order.extend(sorted(other_columns))

    # Place important columns at start
    scoresSBM_search_m_j = scoresSBM_search_m_j.reindex(all_columns_order, axis=1)

    scoresSBM_search_m_j = scoresSBM_search_m_j.rename(columns={'address':'address_pred',
                            'fulladdress':'address_ref',
                            'level_0':'index_pred',
                            'level_1':'index_ref',
                            'score':'match_score',
                            'score_max':'max_possible_score',
                            'score_perc':'perc_weighted_columns_matched',
                            'score_perc_max':'perc_weighted_columns_matched_max_for_pred_address'})

    scoresSBM_search_m_j = scoresSBM_search_m_j.sort_values("index_pred", ascending = True)

    # ref_index is just a duplicate of index_ref, needed for outputs
    scoresSBM_search_m_j["ref_index"] = scoresSBM_search_m_j["index_ref"]

    #search_df_j = orig_search_df[["full_address_search", search_df_key_field]]
    #scoresSBM_out = scoresSBM_search_m_j.merge(search_df_j, left_on = "address_pred", right_on = "full_address_search", how = "left")

    final_cols = new_join_col.copy()
    final_cols.extend([search_df_key_field, 'full_match_score_based', 'address_pred', 'address_ref',
                       'match_score', 'max_possible_score', 'perc_weighted_columns_matched',
                       'perc_weighted_columns_matched_max_for_pred_address',
                       'Organisation', 'Organisation_ref', 'Organisation_pred',
                       'SaoText', 'SaoText_ref', 'SaoText_pred',
                       'SaoStartNumber', 'SaoStartNumber_ref', 'SaoStartNumber_pred',
                       'SaoStartSuffix', 'SaoStartSuffix_ref', 'SaoStartSuffix_pred',
                       'SaoEndNumber', 'SaoEndNumber_ref', 'SaoEndNumber_pred',
                       'SaoEndSuffix', 'SaoEndSuffix_ref', 'SaoEndSuffix_pred',
                       'PaoStartNumber', 'PaoStartNumber_ref', 'PaoStartNumber_pred',
                       'PaoStartSuffix', 'PaoStartSuffix_ref', 'PaoStartSuffix_pred',
                       'PaoEndNumber', 'PaoEndNumber_ref', 'PaoEndNumber_pred',
                       'PaoEndSuffix', 'PaoEndSuffix_ref', 'PaoEndSuffix_pred',
                       'PaoText', 'PaoText_ref', 'PaoText_pred',
                       'Street', 'Street_ref', 'Street_pred',
                       'PostTown', 'PostTown_ref', 'PostTown_pred',
                       'Postcode', 'Postcode_ref', 'Postcode_pred', 'Postcode_predict',
                       'index_pred', 'index_ref', 'Reference file'
                       ])

    scoresSBM_out = scoresSBM_search_m_j[final_cols]

    #scoresSBM_out.to_csv("scoresSBM_out" + "_" + blocker_column[0] + "_" + str(standardise) + ".csv")

    return scoresSBM_out, start_columns

def create_matched_results_nnet(scoresSBM_best, search_df_key_field, orig_search_df, new_join_col, standardise, ref_search, blocker_column, score_cut_off):

    #scoresSBM_best.to_csv("scores_sbm_best_" + str(standardise) + ".csv")

    ### Make the final 'matched output' file
    scoresSBM_best_pred_cols = scoresSBM_best.filter(regex='_pred$').iloc[:,1:-1]
    scoresSBM_best["search_orig_address"] = (scoresSBM_best_pred_cols.agg(' '.join, axis=1)).str.strip().str.replace(r"\s{2,}", " ", regex=True)

    scoresSBM_best_ref_cols = scoresSBM_best.filter(regex='_ref$').iloc[:,1:-1]
    scoresSBM_best['reference_mod_address'] = (scoresSBM_best_ref_cols.agg(' '.join, axis=1)).str.strip().str.replace(r"\s{2,}", " ", regex=True)

    ## Create matched output df
    matched_output_SBM = orig_search_df[[search_df_key_field, "full_address", "postcode", "property_number", "prop_number", "flat_number", "apart_number", "block_number", 'unit_number', "room_number", "house_court_name"]].replace(r"\bnan\b", "", regex=True).infer_objects(copy=False)
    matched_output_SBM[search_df_key_field] = matched_output_SBM[search_df_key_field].astype(str)

    ###
    matched_output_SBM = matched_output_SBM.merge(scoresSBM_best[[search_df_key_field, 'index_ref', 'address_ref',
                         'full_match_score_based', 'Reference file']], on = search_df_key_field, how = "left").\
                         rename(columns={"full_address":"search_orig_address"})

    #ref_search.to_csv("ref_search.csv")

    if 'index' not in ref_search.columns:
        ref_search['ref_index'] = ref_search.index

    matched_output_SBM = matched_output_SBM.merge(ref_search.drop_duplicates("fulladdress")[["ref_index", "fulladdress", "Postcode", "property_number", "prop_number", "flat_number", "apart_number", "block_number", 'unit_number', "room_number", "house_court_name", "ref_address_stand"]], left_on = "address_ref", right_on = "fulladdress", how = "left", suffixes=('_search', '_reference')).rename(columns={"fulladdress":"reference_orig_address", "ref_address_stand":"reference_list_address"})

    #matched_output_SBM.to_csv("matched_output_SBM_earlier_" + str(standardise) + ".csv")

    # To replace with number check
    matched_output_SBM = matched_output_SBM.rename(columns={"full_match_score_based":"full_match"})

    matched_output_SBM['property_number_match'] = matched_output_SBM['full_match']

    scores_SBM_best_cols = [search_df_key_field, 'full_match_score_based', 'perc_weighted_columns_matched', 'address_pred']#, "reference_mod_address"]
    scores_SBM_best_cols.extend(new_join_col)

    matched_output_SBM_b = scoresSBM_best[scores_SBM_best_cols]

    matched_output_SBM = matched_output_SBM.merge(matched_output_SBM_b.drop_duplicates(search_df_key_field), on = search_df_key_field, how = "left")

    #matched_output_SBM.to_csv("matched_output_SBM_later_" + str(standardise) + ".csv")

    from tools.fuzzy_match import create_diag_shortlist
    matched_output_SBM = create_diag_shortlist(matched_output_SBM, "search_orig_address", score_cut_off, blocker_column, fuzzy_col='perc_weighted_columns_matched', search_mod_address="address_pred", resolve_tie_breaks=False)

    #matched_output_SBM.to_csv("matched_output_after.csv")

    #matched_output_SBM["UPRN"] = scoresSBM_best['UPRN']

    matched_output_SBM['standardised_address'] = standardise

    matched_output_SBM = matched_output_SBM.rename(columns={"address_pred":"search_mod_address",
                                                            #"address_ref":"reference_orig_address",
                                                            #"full_match_score_based":"fuzzy_score_match",
                                                            'perc_weighted_columns_matched':"fuzzy_score"})

    matched_output_SBM_cols = [search_df_key_field, 'search_orig_address', 'reference_orig_address',
                               'full_match',
                               'full_number_match',
                               'flat_number_match',
                               'room_number_match',
                               'block_number_match',
                               'property_number_match',
                               'close_postcode_match',
                               'house_court_name_match',
                               'fuzzy_score_match',
                               "fuzzy_score",
                               'property_number_search', 'property_number_reference',
                               'flat_number_search', 'flat_number_reference',
                               'room_number_search', 'room_number_reference',
                               'block_number_search', 'block_number_reference',
                               "unit_number_search", "unit_number_reference",
                               'house_court_name_search', 'house_court_name_reference',
                               "search_mod_address", 'reference_mod_address', 'Postcode', 'postcode', 'ref_index', 'Reference file']

    #matched_output_SBM_cols = [search_df_key_field, 'search_orig_address', 'reference_orig_address',
    #'full_match', 'fuzzy_score_match', 'property_number_match', 'full_number_match',
    #'fuzzy_score', 'search_mod_address', 'reference_mod_address', 'Reference file']

    matched_output_SBM_cols.extend(new_join_col)
    matched_output_SBM_cols.extend(['standardised_address'])
    matched_output_SBM = matched_output_SBM[matched_output_SBM_cols]

    matched_output_SBM = matched_output_SBM.sort_values(search_df_key_field, ascending=True)

    #matched_output_SBM.to_csv("matched_output_SBM_out.csv")

    return matched_output_SBM

def score_based_match(predict_df_search, ref_search, orig_search_df, matching_variables, text_columns, blocker_column, weights, fuzzy_method, score_cut_off, search_df_key_field, standardise, new_join_col, score_cut_off_nnet_street=score_cut_off_nnet_street):

    scoresSBM = compute_match(predict_df_search, ref_search, orig_search_df, matching_variables, text_columns, blocker_column, weights, fuzzy_method)

    if scoresSBM.empty:
        # If no pairs are found, return empty DataFrames
        return pd.DataFrame(), pd.DataFrame()

    scoresSBM_search = calc_final_nnet_scores(scoresSBM, weights, matching_variables)

    # Filter potential matched address scores to those with highest scores only
    scoresSBM_search_m = scoresSBM_search[scoresSBM_search["score_perc"] == scoresSBM_search["score_perc_max"]]

    scoresSBM_search_m_j = join_on_pred_ref_details(scoresSBM_search_m, ref_search, predict_df_search)

    #scoresSBM_search_m_j.to_csv("scoresSBM_search_m_j.csv")

    # When blocking by street, an increased threshold may be needed, as this blocker is more prone to making mistakes
    if blocker_column[0] == "Street": scoresSBM_search_m_j['full_match_score_based'] = (scoresSBM_search_m_j['score_perc'] >= score_cut_off_nnet_street)

    else: scoresSBM_search_m_j['full_match_score_based'] = (scoresSBM_search_m_j['score_perc'] >= score_cut_off)

    ### Reorder some columns
    scoresSBM_out, start_columns = rearrange_columns(scoresSBM_search_m_j, new_join_col, search_df_key_field, blocker_column, standardise)

    #scoresSBM_out.to_csv("scoresSBM_out.csv")

    matched_output_SBM = create_matched_results_nnet(scoresSBM_out, search_df_key_field, orig_search_df, new_join_col, standardise, ref_search, blocker_column, score_cut_off)

    matched_output_SBM_best = matched_output_SBM.sort_values([search_df_key_field, "full_match"], ascending = [True, False]).drop_duplicates(search_df_key_field)

    #matched_output_SBM.to_csv("matched_output_SBM.csv")
    #matched_output_SBM_best.to_csv("matched_output_SBM_best.csv")

    scoresSBM_best = scoresSBM_out[scoresSBM_out[search_df_key_field].isin(matched_output_SBM_best[search_df_key_field])]

    return scoresSBM_best, matched_output_SBM_best

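The thresholding step inside score_based_match reduces to the following rule (the street cut-off is imported from tools.constants; the numbers here are placeholders for illustration only):

score_perc = 92.5                 # placeholder candidate score
blocker_column = ["Street"]
score_cut_off = 90                # placeholder general threshold
score_cut_off_nnet_street = 97    # placeholder; the real value comes from tools.constants

cut_off = score_cut_off_nnet_street if blocker_column[0] == "Street" else score_cut_off
full_match_score_based = score_perc >= cut_off   # False here: street blocking uses the stricter threshold
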
def check_matches_against_fuzzy(match_results, scoresSBM, search_df_key_field):

    if not match_results.empty:

        if 'fuzz_full_match' not in match_results.columns:
            match_results['fuzz_full_match'] = False

        match_results = match_results.add_prefix("fuzz_").rename(columns={"fuzz_"+search_df_key_field:search_df_key_field})

        # Merge fuzzy match full matches onto model data
        scoresSBM_m = scoresSBM.merge(match_results.drop_duplicates(search_df_key_field), on = search_df_key_field, how = "left")

    else:
        scoresSBM_m = scoresSBM
        scoresSBM_m["fuzz_full_match"] = False
        scoresSBM_m['fuzz_fuzzy_score_match'] = False
        scoresSBM_m['fuzz_property_number_match'] = False
        scoresSBM_m['fuzz_fuzzy_score'] = 0
        scoresSBM_m['fuzz_reference_orig_address'] = ""

    scoresSBM_t = scoresSBM[scoresSBM["full_match_score_based"]==True]

    ### Create a df of matches the model finds that the fuzzy matching work did not

    scoresSBM_m_model_add_matches = scoresSBM_m[(scoresSBM_m["full_match_score_based"] == True) &
                                                (scoresSBM_m["fuzz_full_match"] == False)]

    # Drop some irrelevant columns

    first_cols = ['UPRN', search_df_key_field, 'full_match_score_based', 'fuzz_full_match', 'fuzz_fuzzy_score_match', 'fuzz_property_number_match',
                  'fuzz_fuzzy_score', 'match_score', 'max_possible_score', 'perc_weighted_columns_matched',
                  'perc_weighted_columns_matched_max_for_pred_address', 'address_pred',
                  'address_ref', 'fuzz_reference_orig_address']

    last_cols = [col for col in scoresSBM_m_model_add_matches.columns if col not in first_cols]

    scoresSBM_m_model_add_matches = scoresSBM_m_model_add_matches[first_cols+last_cols].drop(['fuzz_search_mod_address',
                                    'fuzz_reference_mod_address', 'fuzz_fulladdress', 'fuzz_UPRN'], axis=1, errors="ignore")

    ### Create a df for matches the fuzzy matching found that the neural net model does not

    if not match_results.empty:
        scoresSBM_t_model_failed = match_results[(~match_results[search_df_key_field].isin(scoresSBM_t[search_df_key_field])) &
                                                 (match_results["fuzz_full_match"] == True)]

        scoresSBM_t_model_failed = scoresSBM_t_model_failed.\
            merge(scoresSBM.drop_duplicates(search_df_key_field), on = search_df_key_field, how = "left")

        scoresSBM_t_model_failed = scoresSBM_t_model_failed[first_cols+last_cols].drop(['fuzz_search_mod_address',
                                   'fuzz_reference_mod_address', 'fuzz_fulladdress', 'fuzz_UPRN'], axis=1, errors="ignore")
    else:
        scoresSBM_t_model_failed = pd.DataFrame()

    ## Join back onto original results file and export

    scoresSBM_new_matches_from_model = scoresSBM_m_model_add_matches.drop_duplicates(search_df_key_field)

    if not match_results.empty:
        match_results_out = match_results.merge(scoresSBM_new_matches_from_model[[search_df_key_field, 'full_match_score_based', 'address_pred',
                                                'address_ref']], on = search_df_key_field, how = "left")

        match_results_out.loc[match_results_out['full_match_score_based'].isna(),'full_match_score_based'] = False

        #match_results_out['full_match_score_based'].value_counts()

        match_results_out["full_match_fuzzy_or_score_based"] = (match_results_out["fuzz_full_match"] == True) |\
                                                               (match_results_out["full_match_score_based"] == True)
    else: match_results_out = match_results

    return scoresSBM_m_model_add_matches, scoresSBM_t_model_failed, match_results_out

tools/standardise.py
ADDED
@@ -0,0 +1,722 @@
import pandas as pd
import numpy as np
import re
from typing import Type, Dict, List, Tuple
from datetime import datetime
import warnings
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression')

PandasDataFrame = Type[pd.DataFrame]
PandasSeries = Type[pd.Series]
MatchedResults = Dict[str,Tuple[str,int]]
array = List[str]

today = datetime.now().strftime("%d%m%Y")
today_rev = datetime.now().strftime("%Y%m%d")

# # Standardisation functions
def standardise_wrapper_func(search_df_cleaned:PandasDataFrame, ref_df_cleaned:PandasDataFrame,
                             standardise = False, filter_to_lambeth_pcodes = True, match_task = "fuzzy"):
    '''
    Initial standardisation of search and reference dataframes before passing addresses and postcodes to the main standardisation function
    '''

    ## Search df - lower case addresses, replace spaces in postcode
    search_df_cleaned["full_address_search"] = search_df_cleaned["full_address"].str.lower().str.strip()
    search_df_cleaned['postcode_search'] = search_df_cleaned['postcode'].str.lower().str.strip().str.replace(r"\s+", "", regex=True)

    # Filter out records where 'Excluded from search' is not a postal address by making the postcode blank
    search_df_cleaned.loc[search_df_cleaned['Excluded from search'] == "Excluded - non-postal address", 'postcode_search'] = ""

    # Remove nulls from ref_df postcode
    ref_df_cleaned = ref_df_cleaned[ref_df_cleaned['Postcode'].notna()]

    ref_df_cleaned["full_address_search"] = ref_df_cleaned["fulladdress"].str.lower().str.strip()
    ref_df_cleaned['postcode_search'] = ref_df_cleaned['Postcode'].str.lower().str.strip().str.replace(r"\s+", "", regex=True)

    # Block only on first 5 characters of postcode string - doesn't give more matches and makes everything a bit slower
    #search_df_cleaned['postcode_search'] = search_df_cleaned['postcode_search'].str[:-1]
    #ref_df_cleaned['postcode_search'] = ref_df_cleaned['postcode_search'].str[:-1]

    ### Use standardise function

    ### Remove 'non-housing' places from the list - not used, as all addresses should be checked
    #search_df_after_stand = remove_non_housing(search_df_cleaned, 'full_address_search')
    search_df_after_stand = standardise_address(search_df_cleaned, "full_address_search", "search_address_stand", standardise = standardise, out_london = True)

    ## Standardise ref_df addresses

    if match_task == "fuzzy":
        ref_df_after_stand = standardise_address(ref_df_cleaned, "full_address_search", "ref_address_stand", standardise = standardise, out_london = True)
    else:
        # For the neural net matching, standardising the reference addresses was not found to help (it made things worse), so reference addresses are not standardised at this step.
        ref_df_after_stand = standardise_address(ref_df_cleaned, "full_address_search", "ref_address_stand", standardise = False, out_london = True)

    return search_df_after_stand, ref_df_after_stand #, search_df_after_stand_series, ref_df_after_stand_series

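A sketch of the intended call shape for the wrapper above (data invented; it assumes the module's other helpers such as remove_postcode are available):

import pandas as pd
from tools.standardise import standardise_wrapper_func

search_df = pd.DataFrame({"full_address": ["Flat 2, 10 Ash Park Road, London"],
                          "postcode": ["SE54 3HB"],
                          "Excluded from search": [""]})
ref_df = pd.DataFrame({"fulladdress": ["10 Ash Park Road SE54 3HB"],
                       "Postcode": ["SE54 3HB"]})

search_stand, ref_stand = standardise_wrapper_func(search_df, ref_df, standardise=True, match_task="fuzzy")
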
def standardise_address(df:PandasDataFrame, col:str, out_col:str, standardise:bool = True, out_london = True) -> PandasDataFrame:

    '''
    This function takes a 'full address' column and standardises it so that extraneous
    information is removed (i.e. postcodes & London, as this algorithm is used for London
    addresses only), and so that room/flat/property numbers can be accurately extracted. The
    standardised addresses can then be used by the fuzzy matching functions later in the package.

    The function does the following:

    - Removes the postcode and 'london' (if not dealing with addresses outside of London)
      from the address to reduce the text the algorithm has to search.
      Postcode removal uses regex to extract a UK postcode.

    - Removes the word 'flat' or 'apartment' from an address that has only one number in it

    - Adds 'flat' to the start of any address that contains 'house' or 'court' (which are generally housing association buildings).
      This is because in the housing list, these addresses never have the word flat in front of them

    - Fixes addresses that don't have a space between the comma and the next word, and removes double spaces

    - Replaces 'number / number' and 'number-number' with 'number' (the first number in the pair)

    - Adds 'flat' to the start of addresses that include ground floor/first floor etc. flat
      in the text, replacing the floor reference with flat a, b, c etc.

    - Pulls out property, flat, and room numbers from the address text

    - Returns the data frame with the new columns included
    '''

    df_copy = df.copy(deep=True)

    # Trim the address to remove leading and trailing spaces
    df_copy[col] = df_copy[col].str.strip()

    # Remove the postcode and 'london' from the address to reduce the text the algorithm has to search.
    # The regex for extracting a UK postcode is adapted from
    # https://stackoverflow.com/questions/51828712/r-regular-expression-for-extracting-uk-postcode-from-an-address-is-not-ordered
    # It will pick up whole postcodes, postcodes with just the first part, and postcodes with the first
    # part and the first number of the second half.

    df_copy['add_no_pcode'] = remove_postcode(df_copy, col)

    if out_london == False:
        df_copy['add_no_pcode'] = df_copy['add_no_pcode'].str.replace("london","").str.replace(r",,|, ,","", regex=True)

    # If the user wants to standardise the address
    if standardise:

        df_copy['add_no_pcode'] = df_copy['add_no_pcode'].str.lower()

        # If there are dates at the start of the address, change this
        df_copy['add_no_pcode'] = replace_mistaken_dates(df_copy, 'add_no_pcode')

        # Replace flat name variations with flat, abbreviations with full name of item (e.g. rd to road)
        df_copy['add_no_pcode'] = df_copy['add_no_pcode'].str.replace(r"\brd\b","road", regex=True).\
            str.replace(r"\bst\b","street", regex=True).\
            str.replace(r"\bave\b","avenue", regex=True).\
            str.replace("'", "", regex=False).\
            str.replace(r"\bat\b ", " ",regex=True).\
            str.replace("apartment", "flat",regex=False).\
            str.replace("studio flat", "flat",regex=False).\
            str.replace("cluster flat", "flats",regex=False).\
            str.replace(r"\bflr\b", "floor", regex=True).\
            str.replace(r"\bflrs\b", "floors", regex=True).\
            str.replace(r"\blwr\b", "lower", regex=True).\
            str.replace(r"\bgnd\b", "ground", regex=True).\
            str.replace(r"\blgnd\b", "lower ground", regex=True).\
            str.replace(r"\bgrd\b", "ground", regex=True).\
            str.replace(r"\bmais\b", "flat", regex=True).\
            str.replace(r"\bmaisonette\b", "flat", regex=True).\
            str.replace(r"\bpt\b", "penthouse", regex=True).\
            str.replace(r"\bbst\b","basement", regex=True).\
            str.replace(r"\bbsmt\b","basement", regex=True)

        df_copy["add_no_pcode_house"] = move_flat_house_court(df_copy)

        # Fix addresses that don't have a space between the comma and the next word, and collapse double spaces
        df_copy['add_no_pcode_house_comma'] = df_copy['add_no_pcode_house'].str.replace(r',(\w)', r', \1', regex=True).str.replace('  ', ' ', regex=False)

        # Replace number / number and number-number with number
        df_copy['add_no_pcode_house_comma_no'] = df_copy['add_no_pcode_house_comma'].str.replace(r'(\d+)\/(\d+)', r'\1', regex=True
            ).str.replace(r'(\d+)-(\d+)', r'\1', regex=True
            ).str.replace(r'(\d+) - (\d+)', r'\1', regex=True)

        # Add 'flat' to the start of addresses that include ground/first/second etc. floor flat in the text
        df_copy['floor_replacement'] = replace_floor_flat(df_copy, 'add_no_pcode_house_comma_no')
        df_copy['flat_added_to_start_addresses_begin_letter'] = add_flat_addresses_start_with_letter(df_copy, 'floor_replacement')

        df_copy[out_col] = merge_series(df_copy['add_no_pcode_house_comma_no'], df_copy['flat_added_to_start_addresses_begin_letter'])

        # Write the standardised addresses back to the original df
        df[out_col] = df_copy[out_col]

    else:
        df_copy[out_col] = df_copy['add_no_pcode']
        df[out_col] = df_copy['add_no_pcode']

    ## POST STANDARDISATION CLEANING AND INFORMATION EXTRACTION
    # Remove trailing spaces
    df[out_col] = df[out_col].str.strip()

    # Pull out property, flat, and room numbers from the address text
    df['property_number'] = extract_prop_no(df_copy, out_col)

    # Extract flat, apartment numbers
    df = extract_flat_and_other_no(df, out_col)

    df['flat_number'] = merge_series(df['flat_number'], df['apart_number'])
    df['flat_number'] = merge_series(df['flat_number'], df['prop_number'])
    df['flat_number'] = merge_series(df['flat_number'], df['first_sec_number'])
    df['flat_number'] = merge_series(df['flat_number'], df['first_letter_flat_number'])
    df['flat_number'] = merge_series(df['flat_number'], df['first_letter_no_more_numbers'])

    # Extract room numbers
    df['room_number'] = extract_room_no(df, out_col)

    # Extract block and unit names
    df = extract_block_and_unit_name(df, out_col)

    # Extract house or court name
    df['house_court_name'] = extract_house_or_court_name(df, out_col)

    return df

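A small sketch of the number-range collapsing step used above, on its own (strings invented):

import pandas as pd

s = pd.Series(["10-12 ash park road", "10/12 ash park road"])
collapsed = (s.str.replace(r'(\d+)\/(\d+)', r'\1', regex=True)
              .str.replace(r'(\d+)-(\d+)', r'\1', regex=True)
              .str.replace(r'(\d+) - (\d+)', r'\1', regex=True))
print(collapsed.tolist())   # ['10 ash park road', '10 ash park road']
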
def move_flat_house_court(df:PandasDataFrame):
    ''' Add 'flat' to the front of any address that contains 'house', 'court', or 'terrace'.
    The word 'flat' is first removed from within such addresses and then prepended,
    because in the housing list these addresses never have the word flat in front of them.
    '''

    # Remove the word flat or apartment from addresses that have only one number in it. 'Flat' will be re-added later to relevant addresses
    # that need it (replace_floor_flat)
    df['flat_removed'] = remove_flat_one_number_address(df, 'add_no_pcode')

    remove_flat_house = df['flat_removed'].str.lower().str.contains(r"\bhouse\b")     #(?=\bhouse\b)(?!.*house road)")
    remove_flat_court = df['flat_removed'].str.lower().str.contains(r"\bcourt\b")     #(?=\bcourt\b)(?!.*court road)")
    remove_flat_terrace = df['flat_removed'].str.lower().str.contains(r"\bterrace\b") #(?=\bterrace\b)(?!.*terrace road)")
    remove_flat_house_or_court = (remove_flat_house | remove_flat_court | remove_flat_terrace == 1)

    df['remove_flat_house_or_court'] = remove_flat_house_or_court

    # Drop duplicated index values so the assignment below aligns cleanly
    df = df[~df.index.duplicated(keep='first')]

    df['house_court_replacement'] = "flat " + df.loc[df['remove_flat_house_or_court'] == True, 'flat_removed'].str.replace(r"\bflat\b","", regex=True).str.strip().map(str)

    #df["add_no_pcode_house"] = merge_columns(df, "add_no_pcode_house", 'flat_removed', "house_court_replacement")
    #merge_columns(df, "new_col", col1, 'letter_after_number')
    df["add_no_pcode_house"] = merge_series(df['flat_removed'], df["house_court_replacement"])

    return df["add_no_pcode_house"]

def extract_street_name(address:str) -> str:
    """
    Extracts the street name from the given address.

    Args:
        address (str): The input address string.

    Returns:
        str: The extracted street name, or an empty string if no match is found.

    Examples:
        >>> address1 = "1 Ash Park Road SE54 3HB"
        >>> extract_street_name(address1)
        'Ash Park Road'

        >>> address2 = "Flat 14 1 Ash Park Road SE54 3HB"
        >>> extract_street_name(address2)
        'Ash Park Road'

        >>> address3 = "123 Main Blvd"
        >>> extract_street_name(address3)
        'Main Blvd'

        >>> address4 = "456 Maple AvEnUe"
        >>> extract_street_name(address4)
        'Maple AvEnUe'

        >>> address5 = "789 Oak Street"
        >>> extract_street_name(address5)
        'Oak Street'
    """

    street_types = [
        'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
        'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
        'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
        'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
        'Alley', 'Arcade', 'Avenue', 'Ave', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner', 'Cove',
        'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Drive', 'Dr', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
        'Mews', 'Parade', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
        'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
    ]

    # Dynamically construct the regex pattern with all possible street types
    street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)

    # The overall regex pattern to capture the street name
    pattern = rf'(?:\d+\s+|\w+\s+\d+\s+|.*\d+[a-z]+\s+|.*\d+\s+)*(?P<street_name>[\w\s]+(?:{street_types_pattern}))'

    def replace_postcode(address):
        pattern = r'\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$'
        return re.sub(pattern, "", address)

    modified_address = replace_postcode(address.upper())
    #print(modified_address)
    #print(address)

    # Perform a case-insensitive search
    match = re.search(pattern, modified_address, re.IGNORECASE)

    if match:
        street_name = match.group('street_name')
        return street_name.strip()
    else:
        return ""

def remove_flat_one_number_address(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:

    '''
    If there is only one number in the address, and there is no letter after the number,
    remove the word flat from the address
    '''

    df['contains_letter_after_number'] = df[col1].str.lower().str.contains(r"\d+(?:[a-z]|[A-Z])(?!.*\d+)", regex = True)
    df['contains_single_letter_before_number'] = df[col1].str.lower().str.contains(r'\b[A-Za-z]\b[^\d]* \d', regex = True)
    df['two_numbers_in_address'] = df[col1].str.lower().str.contains(r"(?:\d+.*?)[^a-zA-Z0-9_].*?\d+", regex = True)
    df['contains_apartment'] = df[col1].str.lower().str.contains(r"\bapartment\b \w+|\bapartments\b \w+", regex = True)
    df['contains_flat'] = df[col1].str.lower().str.contains(r"\bflat\b \w+|\bflats\b \w+", regex = True)
    df['contains_room'] = df[col1].str.lower().str.contains(r"\broom\b \w+|\brooms\b \w+", regex = True)

    df['selected_rows'] = (df['contains_letter_after_number'] == False) &\
                          (df['two_numbers_in_address'] == False) &\
                          (df['contains_single_letter_before_number'] == False) &\
                          ((df['contains_flat'] == True) |\
                           (df['contains_apartment'] == True) |\
                           (df['contains_room'] == True))

    df['one_number_no_flat'] = df[df['selected_rows'] == True][col1]
    df['one_number_no_flat'] = df['one_number_no_flat'].str.replace(r"(\bapartment\b)|(\bapartments\b)", "", regex=True).str.replace(r"(\bflat\b)|(\bflats\b)", "", regex=True).str.replace(r"(\broom\b)|(\brooms\b)", "", regex=True)

    df["new_col"] = merge_series(df[col1], df["one_number_no_flat"])

    return df['new_col']

def add_flat_addresses_start_with_letter(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Add the word flat to addresses that start with a letter.
    '''

    df['contains_single_letter_at_start_before_number'] = df[col1].str.lower().str.contains(r'^\b[A-Za-z]\b[^\d]* \d', regex = True)

    df['selected_rows'] = (df['contains_single_letter_at_start_before_number'] == True)
    df['flat_added_to_string_start'] = "flat " + df[df['selected_rows'] == True][col1]

    #merge_columns(df, "new_col", col1, 'flat_added_to_string_start')
    df["new_col"] = merge_series(df[col1], df['flat_added_to_string_start'])

    return df['new_col']

def extract_letter_one_number_address(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    This function looks for addresses that have a letter after a number, but ONLY one number
    in the string, and don't already have a flat, apartment, or room number.

    It then extracts this letter and returns it.

    This is for addresses such as '2b sycamore road', which is changed to
    'flat b 2 sycamore road' so that 'b' is selected as the flat number.
    '''

    df['contains_no_numbers_without_letter'] = df[col1].str.lower().str.contains(r"^(?:(?!\d+ ).)*$")
    df['contains_letter_after_number'] = df[col1].str.lower().str.contains(r"\d+(?:[a-z]|[A-Z])(?!.*\d+)")
    df['contains_apartment'] = df[col1].str.lower().str.contains(r"\bapartment\b \w+|\bapartments\b \w+")
    df['contains_flat'] = df[col1].str.lower().str.contains(r"\bflat\b \w+|\bflats\b \w+")
    df['contains_room'] = df[col1].str.lower().str.contains(r"\broom\b \w+|\brooms\b \w+")

    df['selected_rows'] = (df['contains_no_numbers_without_letter'] == True) &\
                          (df['contains_letter_after_number'] == True) &\
                          (df['contains_flat'] == False) &\
                          (df['contains_apartment'] == False) &\
                          (df['contains_room'] == False)

    df['extract_letter'] = df[(df['selected_rows'] == True)][col1].str.extract(r"\d+([a-z]|[A-Z])")

    # Extract the number that precedes the letter
    df['extract_number'] = df[(df['selected_rows'] == True)][col1].str.extract(r"(\d+)(?:[a-z]|[A-Z])")

    df['letter_after_number'] = "flat " +\
        df[(df['selected_rows'] == True)]['extract_letter'] +\
        " " +\
        df[(df['selected_rows'] == True)]['extract_number'] +\
        " " +\
        df[(df['selected_rows'])][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\d+([a-z]|[A-Z])","", regex=True).map(str)

    #merge_columns(df, "new_col", col1, 'letter_after_number')
    df["new_col"] = merge_series(df[col1], df['letter_after_number'])

    return df['new_col']

# def extract_letter_one_number_address(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
#     '''
#     This function extracts a letter after a single number in an address, excluding cases with existing flat, apartment, or room numbers.
#     It transforms addresses like '2b sycamore road' to 'flat b 2 sycamore road' to designate 'b' as the flat number.
#     '''

#     df['selected_rows'] = (df[col1].str.lower().str.contains(r"^(?:(?!\d+ ).)*$") & \
#                            df[col1].str.lower().str.contains(r"\d+(?:[a-z]|[A-Z])(?!.*\d+)") & \
#                            ~df[col1].str.lower().str.contains(r"\bflat\b \w+|\bflats\b \w+|\bapartment\b \w+|\bapartments\b \w+|\broom\b \w+|\brooms\b \w+"))

#     df['extract_letter'] = df.loc[df['selected_rows'], col1].str.extract(r"\d+([a-z]|[A-Z])")
#     df['extract_number'] = df.loc[df['selected_rows'], col1].str.extract(r"(\d+)[a-z]|[A-Z]")

#     df['letter_after_number'] = "flat " + df['extract_letter'] + " " + df['extract_number'] + " " + \
#         df.loc[df['selected_rows'], col1].str.replace(r"\bflat\b", "", regex=True).str.replace(r"\d+([a-z]|[A-Z])", "", regex=True).map(str)

#     df["new_col"] = df[col1].copy()
#     df.loc[df['selected_rows'], "new_col"] = df['letter_after_number']

#     return df['new_col']

def replace_floor_flat(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    ''' In references to basement, ground floor, first floor, second floor, and top floor
    flats, this function moves the word 'flat' to the front of the address. This is so that the
    following word (e.g. basement, ground floor) is recognised as the flat number in the 'extract_flat_and_other_no' function
    '''

    df['letter_after_number'] = extract_letter_one_number_address(df, col1)

    df['basement'] = "flat basement" + df[df[col1].str.lower().str.contains(r"basement")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bbasement\b","", regex=True).map(str)

    df['ground_floor'] = "flat a " + df[df[col1].str.lower().str.contains(r"\bground floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bground floor\b","", regex=True).map(str)

    df['first_floor'] = "flat b " + df[df[col1].str.lower().str.contains(r"\bfirst floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bfirst floor\b","", regex=True).map(str)

    df['ground_and_first_floor'] = "flat ab " + df[df[col1].str.lower().str.contains(r"\bground and first floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bground and first floor\b","", regex=True).map(str)

    df['basement_ground_and_first_floor'] = "flat basementab " + df[df[col1].str.lower().str.contains(r"\bbasement ground and first floors\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bbasement and ground and first floors\b","", regex=True).map(str)

    df['basement_ground_and_first_floor2'] = "flat basementab " + df[df[col1].str.lower().str.contains(r"\bbasement ground and first floors\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bbasement ground and first floors\b","", regex=True).map(str)

    df['second_floor'] = "flat c " + df[df[col1].str.lower().str.contains(r"\bsecond floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bsecond floor\b","", regex=True).map(str)

    df['first_and_second_floor'] = "flat bc " + df[df[col1].str.lower().str.contains(r"\bfirst and second floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bfirst and second floor\b","", regex=True).map(str)

    df['first1_floor'] = "flat b " + df[df[col1].str.lower().str.contains(r"\b1st floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\b1st floor\b","", regex=True).map(str)

    df['second2_floor'] = "flat c " + df[df[col1].str.lower().str.contains(r"\b2nd floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\b2nd floor\b","", regex=True).map(str)

    df['ground_first_second_floor'] = "flat abc " + df[df[col1].str.lower().str.contains(r"\bground and first and second floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bground and first and second floor\b","", regex=True).map(str)

    df['third_floor'] = "flat d " + df[df[col1].str.lower().str.contains(r"\bthird floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bthird floor\b","", regex=True).map(str)

    df['third3_floor'] = "flat d " + df[df[col1].str.lower().str.contains(r"\b3rd floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\b3rd floor\b","", regex=True).map(str)

    df['top_floor'] = "flat top " + df[df[col1].str.lower().str.contains(r"\btop floor\b")][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\btop floor\b","", regex=True).map(str)

    #merge_columns(df, "new_col", col1, 'letter_after_number')
    df["new_col"] = merge_series(df[col1], df['letter_after_number'])
    df["new_col"] = merge_series(df["new_col"], df['basement'])
    df["new_col"] = merge_series(df["new_col"], df['ground_floor'])
    df["new_col"] = merge_series(df["new_col"], df['first_floor'])
    df["new_col"] = merge_series(df["new_col"], df['first1_floor'])
    df["new_col"] = merge_series(df["new_col"], df['ground_and_first_floor'])
    df["new_col"] = merge_series(df["new_col"], df['basement_ground_and_first_floor'])
    df["new_col"] = merge_series(df["new_col"], df['basement_ground_and_first_floor2'])
    df["new_col"] = merge_series(df["new_col"], df['second_floor'])
    df["new_col"] = merge_series(df["new_col"], df['second2_floor'])
    df["new_col"] = merge_series(df["new_col"], df['first_and_second_floor'])
    df["new_col"] = merge_series(df["new_col"], df['ground_first_second_floor'])
    df["new_col"] = merge_series(df["new_col"], df['third_floor'])
    df["new_col"] = merge_series(df["new_col"], df['third3_floor'])
    df["new_col"] = merge_series(df["new_col"], df['top_floor'])

    return df['new_col']

# def replace_floor_flat(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
#     '''Moves the word 'flat' to the front of addresses with floor references like basement, ground floor, etc.'''

#     floor_mapping = {
#         'basement': 'basement',
#         'ground floor': 'a',
#         'first floor': 'b',
#         'ground and first floor': 'ab',
#         'basement ground and first floors': 'basementab',
#         'second floor': 'c',
#         'first and second floor': 'bc',
#         '1st floor': 'b',
#         '2nd floor': 'c',
#         'ground and first and second floor': 'abc',
#         'third floor': 'd',
#         '3rd floor': 'd',
#         'top floor': 'top'
#     }

#     for key, value in floor_mapping.items():
#         df[key] = f"flat {value} " + df[df[col1].str.lower().str.contains(fr"\b{key}\b")][col1].str.replace(r"\bflat\b", "", regex=True).str.replace(fr"\b{key}\b", "", regex=True).map(str)

#     df["new_col"] = df[col1].copy()

#     for key in floor_mapping.keys():
#         df["new_col"] = merge_series(df["new_col"], df[key])

#     return df["new_col"]

def remove_non_housing(df:PandasDataFrame, col1:PandasSeries) -> PandasDataFrame:
    '''
    Remove items from the housing list that are not housing. Includes addresses containing
    the text 'parking', 'garage', 'store', 'visitor bay', 'visitors room', 'bike rack',
    'yard', or 'workshop'.
    '''
    df_copy = df.copy()[~df[col1].str.lower().str.contains(
        r"parking|garage|\bstore\b|\bstores\b|\bvisitor bay\b|visitors room|\bbike rack\b|\byard\b|\bworkshop\b")]

    return df_copy

def extract_prop_no(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Extract property number from an address. Remove flat/apartment/room numbers,
    then extract the last number/number + letter in the string.
    '''
    try:
        prop_no = (df[col1].str.replace(r"(^\bapartment\b \w+)|(^\bapartments\b \w+)", "", regex=True)
                           .str.replace(r"(^\bflat\b \w+)|(^\bflats\b \w+)", "", regex=True)
                           .str.replace(r"(^\broom\b \w+)|(^\brooms\b \w+)", "", regex=True)
                           .str.replace(",", "", regex=True)
                           .str.extract(r"(\d+\w+|\d+)(?!.*\d+)")) #"(\d+\w+|\d+)(?!.*\d+)"
    except Exception:
        prop_no = np.nan

    return prop_no

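For illustration (addresses invented), the property-number regex keeps the last number, or number-plus-letter, once any leading flat/apartment/room prefix has been stripped:

import pandas as pd

addresses = pd.Series(["flat b 10 ash park road", "flat 3 victoria house 21a oak lane"])
prop_no = (addresses.str.replace(r"(^\bflat\b \w+)|(^\bflats\b \w+)", "", regex=True)
                    .str.extract(r"(\d+\w+|\d+)(?!.*\d+)"))
print(prop_no[0].tolist())   # ['10', '21a']
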
def extract_room_no(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Extract room number from an address. Find rows where the address contains 'room', then extract
    the next word after 'room' in the string.
    '''
    try:
        room_no = (df[df[col1].str.lower().str.contains(r"\broom\b|\brooms\b", regex=True)][col1]
                   .str.replace("no.", "", regex=False)
                   .str.extract(r'room. (\w+)')
                   .rename(columns={0: "room_number"}))
    except Exception:
        room_no = np.nan

    return room_no

def extract_flat_and_other_no(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Extract flat number from an address.
    It looks for letters after a property number IF THERE ARE NO MORE NUMBERS IN THE STRING,
    the words following the words 'flat' or 'apartment', or
    (last regex) all characters in a word containing a digit if there are two numbers in the address.
    '''

    # The regex essentially matches strings that satisfy any of the following conditions:
    # - Start with a number followed by a single letter (either lowercase or uppercase) and not followed by any other number.
    # - Contain the word "flat" or "apartment".
    # - Start with a number, followed by any characters that are not alphanumeric (denoted by [^a-zA-Z0-9_]), and then another number.

    replaced_series = df[df[col1].str.lower().str.replace(r"^\bflats\b","flat", regex=True).\
        str.contains(\
        r"^\d+([a-z]|[A-Z])(?!.*\d+)|^([a-z] |[A-Z] )(?!.*\d+)|\bflat\b|\bapartment\b|(\d+.*?)[^a-zA-Z0-9_].*?\d+")][col1].str.replace("no.","", regex=True)

    extracted_series = replaced_series.str.extract(r'^\d+([a-z]|[A-Z])(?!.*\d+)')[0]

    extracted_series = extracted_series[~extracted_series.index.duplicated()]
    df = df[~df.index.duplicated(keep='first')]

    df["prop_number"] = extracted_series

    extracted_series = replaced_series.str.extract(r'(?i)(?:flat|flats) (\w+)')
    if 1 in extracted_series.columns:
        df["flat_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
    else:
        df["flat_number"] = extracted_series[0]

    extracted_series = replaced_series.str.extract(r'(?i)(?:apartment|apartments) (\w+)')
    if 1 in extracted_series.columns:
        df["apart_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
    else:
        df["apart_number"] = extracted_series[0]

    df["first_sec_number"] = replaced_series.str.extract(r'(\d+.*?)[^a-zA-Z0-9_].*?\d+')
    df["first_letter_flat_number"] = replaced_series.str.extract(r'\b([A-Za-z])\b[^\d]* \d')
    df["first_letter_no_more_numbers"] = replaced_series.str.extract(r'^([a-z] |[A-Z] )(?!.*\d+)')

    return df

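# A brief sketch of what the regex branches pick up (hypothetical data; pandas assumed
# imported as pd; exact column contents depend on the address wording):
#
#   sample = pd.DataFrame({"full_address": ["flat 4 21 queens road", "36a glebe street"]})
#   out = extract_flat_and_other_no(sample, "full_address")
#   # out["flat_number"] should capture "4" for the first row (the word after 'flat'),
#   # and out["prop_number"] should capture "a" for "36a glebe street" (the letter after
#   # the property number when no further numbers follow)
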
def extract_house_or_court_name(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Extract house or court name. Extended to include estate, buildings, and mansions.
    '''
    extracted_series = df[col1].str.extract(r"(\w+)\s+(house|court|estate|buildings|mansions)")
    if 1 in extracted_series.columns:
        df["house_court_name"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
    else:
        df["house_court_name"] = extracted_series[0]

    return df["house_court_name"]

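# For instance (hypothetical data; pandas assumed imported as pd):
#
#   sample = pd.DataFrame({"full_address": ["flat 1 chaucer house pilgrims way"]})
#   extract_house_or_court_name(sample, "full_address")
#   # the word preceding 'house' is captured, so house_court_name becomes "chaucer"
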
def extract_block_and_unit_name(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
    '''
    Extract block and unit names/numbers from an address (the word following 'block' or 'unit').
    '''

    extracted_series = df[col1].str.extract(r'(?i)(?:block|blocks) (\w+)')
    if 1 in extracted_series.columns:
        df["block_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
    else:
        df["block_number"] = extracted_series[0]

    extracted_series = df[col1].str.extract(r'(?i)(?:unit|units) (\w+)')
    if 1 in extracted_series.columns:
        df["unit_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
    else:
        df["unit_number"] = extracted_series[0]

    return df

def extract_postcode(df:PandasDataFrame, col:str) -> PandasSeries:
    '''
    Extract a postcode from a string column in a dataframe.
    '''
    postcode_series = df[col].str.upper().str.extract(pat = \
        "(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$)")

    return postcode_series

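# A short illustration (hypothetical data; pandas assumed imported as pd):
#
#   sample = pd.DataFrame({"full_address": ["10 downing street london sw1a 2aa"]})
#   extract_postcode(sample, "full_address")
#   # the first capture-group column should contain "SW1A 2AA" (the address is upper-cased
#   # before matching); remove_postcode below strips the same pattern instead
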
def remove_postcode(df:PandasDataFrame, col:str) -> PandasSeries:
    '''
    Remove a postcode from a string column in a dataframe.
    '''

    address_series_no_pcode = df[col].str.upper().str.replace(\
        "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","",\
        regex=True
    ).str.lower()

    return address_series_no_pcode

# Remove addresses with no numbers in them at all - too high a risk of badly assigning an address
def check_no_number_addresses(df:PandasDataFrame, in_address_series:PandasSeries) -> PandasSeries:
    '''
    Highlight addresses from a pandas df where there are no numbers in the address.
    '''
    df["in_address_series_temp"] = df[in_address_series].str.lower()

    no_numbers_series = df["in_address_series_temp"].str.contains(r"^(?!.*\d+).*$", regex=True)

    df.loc[no_numbers_series == True, 'Excluded from search'] = "Excluded - no numbers in address"

    df = df.drop("in_address_series_temp", axis = 1)

    print(df[["full_address", "Excluded from search"]])

    return df

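# Roughly how this behaves (hypothetical data; pandas assumed imported as pd; note the
# function expects a 'full_address' column to exist because of the print statement):
#
#   sample = pd.DataFrame({"full_address": ["rose cottage mill lane", "12 mill lane"]})
#   check_no_number_addresses(sample, "full_address")
#   # only the first row is marked "Excluded - no numbers in address"
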
# Exclude non-postal addresses
def remove_non_postal(df, in_address_series):
    '''
    Highlight non-postal addresses from a pandas df, where a string series contains specific substrings
    indicating non-postal addresses like 'garage', 'parking', 'shed', etc.
    '''
    df["in_address_series_temp"] = df[in_address_series].str.lower()

    garage_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bgarage\\b|\\bgarages\\b)", regex=True)
    parking_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bparking\\b)", regex=True)
    shed_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bshed\\b|\\bsheds\\b)", regex=True)
    bike_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbike\\b|\\bbikes\\b)", regex=True)
    bicycle_store_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbicycle store\\b|\\bbicycle store\\b)", regex=True)

    non_postal_series = (garage_address_series | parking_address_series | shed_address_series | bike_address_series | bicycle_store_address_series)

    df.loc[non_postal_series == True, 'Excluded from search'] = "Excluded - non-postal address"

    df = df.drop("in_address_series_temp", axis = 1)

    return df

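# For example (hypothetical data; pandas assumed imported as pd):
#
#   sample = pd.DataFrame({"full_address": ["garage 14 rear of beech grove", "14 beech grove"]})
#   remove_non_postal(sample, "full_address")
#   # only the first row gets 'Excluded from search' = "Excluded - non-postal address"
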
def replace_mistaken_dates(df:PandasDataFrame, col:str) -> PandasSeries:
    '''
    Identify addresses that mistakenly have dates in them and replace these dates with number values.
    '''
    # Regex pattern to identify the date-month format
    pattern = r'(\d{2})-(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'

    # Dictionary to map month abbreviations to numbers
    month_map = {
        'jan': '1', 'feb': '2', 'mar': '3', 'apr': '4', 'may': '5', 'jun': '6',
        'jul': '7', 'aug': '8', 'sep': '9', 'oct': '10', 'nov': '11', 'dec': '12'
    }

    # Custom replacement function
    def replace_month(match):
        day = match.group(1).lstrip('0') # Get the day and remove leading zeros
        month = month_map[match.group(2)] # Convert month abbreviation to number
        return f"{day}-{month}"

    # Apply the regex replacement
    corrected_addresses = df[col].str.replace(pattern, replace_month, regex = True)

    return corrected_addresses

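# This reverses the classic spreadsheet corruption of number ranges into dates
# (hypothetical data; pandas assumed imported as pd):
#
#   sample = pd.DataFrame({"full_address": ["04-jun high street", "12-jan victoria road"]})
#   replace_mistaken_dates(sample, "full_address")
#   # gives "4-6 high street" and "12-1 victoria road"
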
def merge_series(full_series: pd.Series, partially_filled_series: pd.Series) -> pd.Series:
    '''
    Merge two series. The 'full_series' is the series you want to replace values in;
    'partially_filled_series' is the replacer series.
    '''
    replacer_series_is_null = partially_filled_series.isnull()

    # Start with full_series values
    merged_series = full_series.copy()

    # Replace values in merged_series where partially_filled_series is not null
    merged_series[~replacer_series_is_null] = partially_filled_series.dropna()

    return merged_series

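# A minimal sketch of the merge behaviour (hypothetical data; pandas assumed imported as pd):
#
#   base = pd.Series(["10 high st", "flat 2 oak house", "5 mill lane"])
#   override = pd.Series([None, "flat b oak house", None])
#   merge_series(base, override)
#   # keeps the base values except at index 1, which takes "flat b oak house" from the replacer
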
def clean_cols(col:str) -> str:
    return col.lower().strip().replace(r" ", "_").strip()