seanpedrickcase committed on
Commit
dd1cbb4
0 Parent(s):

Initial commit v1.0

.dockerignore ADDED
@@ -0,0 +1,15 @@
1
+ *.ipynb
2
+ *checkpoint.py
3
+ *.pyc
4
+ *.csv
5
+ *.parquet
6
+ *.pem
7
+ *.pkl
8
+ *.env
9
+ *.zip
10
+ test/*
11
+ nnet_model/*
12
+ deprecated_models/*
13
+ .ipynb_checkpoints/*
14
+ orchestration/*
15
+ .vscode/*
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zst filter=lfs diff=lfs merge=lfs -text
33
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
34
+ .zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
.github/workflows/check_file_size.yml ADDED
@@ -0,0 +1,16 @@
1
+ name: Check file size
2
+ on: # or directly `on: [push]` to run the action on every push on any branch
3
+ pull_request:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ check-file-size:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Check large files
14
+ uses: ActionsDesk/[email protected]
15
+ with:
16
+ filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,20 @@
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ with:
15
+ fetch-depth: 0
16
+ lfs: true
17
+ - name: Push to hub
18
+ env:
19
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
20
+ run: git push https://seanpedrickcase:[email protected]/spaces/seanpedrickcase/address_matcher main
.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ *.ipynb
2
+ *checkpoint.py
3
+ *.pyc
4
+ *.csv
5
+ *.parquet
6
+ *.pem
7
+ *.pkl
8
+ *.env
9
+ *.zip
10
+ test/*
11
+ deprecated_models/*
12
+ .ipynb_checkpoints/*
13
+ orchestration/*
14
+ .vscode/*
Dockerfile ADDED
@@ -0,0 +1,43 @@
1
+ FROM public.ecr.aws/docker/library/python:3.11.8-slim-bookworm
2
+ # FROM public.ecr.aws/docker/library/python:3.10.13-slim
3
+
4
+ WORKDIR /src
5
+
6
+ COPY requirements.txt .
7
+
8
+ RUN pip install -r requirements.txt
9
+
10
+ # Set up a new user named "user" with user ID 1000
11
+ #RUN useradd -m -u 1000 user
12
+
13
+ # Change ownership of /home/user directory
14
+ #RUN chown -R user:user /home/user
15
+
16
+ # Create the temp files directory and set its permissions
17
+ #RUN mkdir -p /home/user/tmp && chown -R user:user /home/user/tmp
18
+
19
+ # Switch to the "user" user
20
+ #USER user
21
+
22
+ # Set home to the user's home directory
23
+ ENV HOME=/home/user \
24
+ PATH=/home/user/.local/bin:$PATH \
25
+ PYTHONPATH=$HOME/app \
26
+ PYTHONUNBUFFERED=1 \
27
+ GRADIO_ALLOW_FLAGGING=never \
28
+ GRADIO_NUM_PORTS=1 \
29
+ GRADIO_SERVER_NAME=0.0.0.0 \
30
+ GRADIO_SERVER_PORT=7861 \
31
+ GRADIO_THEME=huggingface \
32
+ #GRADIO_TEMP_DIR=$HOME/tmp \
33
+ #GRADIO_ROOT_PATH=/address-match \
34
+ SYSTEM=spaces
35
+
36
+ # Set the working directory to the user's home directory
37
+ WORKDIR $HOME/app
38
+
39
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
40
+ #COPY --chown=user . $HOME/app
41
+ COPY . $HOME/app
42
+
43
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,24 @@
1
+ ---
2
+ title: Address Matching
3
+ emoji: 🌍
4
+ colorFrom: purple
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.20.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ # Introduction
16
+ Fuzzy matching of a dataset against an LLPG reference dataset in the LPI format (with columns SaoText, SaoStartNumber etc.). Address columns are concatenated into a single address string, and important details (e.g. flat and house numbers, postcodes) are extracted with regular expressions. Addresses may be 'standardised' in a number of ways; for example, variations of 'ground floor' such as 'grd' or 'grnd' are replaced with 'ground floor' to give more consistent wording, which has been found to increase match rates.
17
+
18
+ The two datasets are then compared with fuzzy matching. The closest fuzzy matches are selected, and a post hoc check compares flat/property numbers to confirm a 'full match' (a short illustrative sketch of this step follows this README).
19
+
20
+ If the LLPG reference file is in the standard LPI format, the neural net model should then initialise. This breaks each address to be matched down into a list of sub-address fields in the LLPG LPI format, and then performs exact or fuzzy comparisons of each address against the LLPG dataset to find the closest matches. The neural net can block on postcode and on street name, which is where most of the new matches are found according to testing.
21
+
22
+ The final files will appear in the relevant output boxes, from which they can be downloaded.
23
+
24
+
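A minimal, illustrative sketch of the fuzzy-then-verify step described above (editorial, not part of this commit). It uses rapidfuzz, which is pinned in requirements.txt below; the helper names, the sample addresses and the threshold constant are made up for demonstration, while the real pipeline lives in app.py and the tools/ modules.

```python
import re
from rapidfuzz import fuzz, process

FUZZY_MATCH_LIMIT = 85  # mirrors fuzzy_match_limit in tools/constants.py

def standardise_address(address: str) -> str:
    # Lower-case, collapse whitespace and expand common abbreviations
    address = address.lower().strip()
    address = re.sub(r"\b(grd|grnd)\b", "ground floor", address)
    return re.sub(r"\s+", " ", address)

def extract_numbers(address: str) -> set:
    # Flat/house numbers used for the post hoc 'full match' check
    return set(re.findall(r"\d+[a-z]?\b", address.lower()))

search = "Grd flr Flat 2 10 High Street SW2 1AB"
reference = [
    "ground floor flat 2 10 high street sw2 1ab",
    "12 high street sw2 1ab",
]

# Closest fuzzy match on the standardised search address
best, score, _ = process.extractOne(
    standardise_address(search), reference, scorer=fuzz.token_set_ratio
)

# Post hoc check: the flat/property numbers must also agree
full_match = score >= FUZZY_MATCH_LIMIT and extract_numbers(search) == extract_numbers(best)
print(best, round(score), full_match)
```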
app.py ADDED
@@ -0,0 +1,447 @@
1
+ # Load in packages, variables for fuzzy matching
2
+ import os
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ import time
6
+ import copy
7
+ import gradio as gr
8
+ import re
9
+ #import polars as pl
10
+
11
+ from tools.constants import *
12
+ from tools.matcher_funcs import load_matcher_data, run_match_batch, combine_two_matches, create_match_summary
13
+ from tools.gradio import initial_data_load
14
+ from tools.aws_functions import load_data_from_aws
15
+ from tools.preparation import prepare_search_address_string, prepare_search_address, prepare_ref_address, remove_non_postal, check_no_number_addresses
16
+ from tools.standardise import standardise_wrapper_func
17
+
18
+ import warnings
19
+ warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression')
20
+ warnings.filterwarnings("ignore", 'Downcasting behavior')
21
+ warnings.filterwarnings("ignore", 'A value is trying to be set on a copy of a slice from a DataFrame')
22
+ warnings.filterwarnings("ignore")
23
+
24
+
25
+ today = datetime.now().strftime("%d%m%Y")
26
+ today_rev = datetime.now().strftime("%Y%m%d")
27
+
28
+ # Base folder is where the code file is stored
29
+ base_folder = Path(os.getcwd())
30
+ input_folder = base_folder/"Input/"
31
+ output_folder = base_folder/"Output/"
32
+ diagnostics_folder = base_folder/"Diagnostics/"
33
+ prep_folder = base_folder/"Helper functions/"
34
+
35
+ def create_simple_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int):
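+ '''
+ Pair up simple row-count batches of the search and reference dataframes: every search batch range is combined with every reference batch range. For example, 10 search rows with batch_size=5 and 25 reference rows with ref_batch_size=10 give 2 x 3 = 6 (search_range, ref_range) pairs.
+ '''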
36
+ #print("Search df batch size: ", batch_size)
37
+ #print("ref_df df batch size: ", ref_batch_size)
38
+
39
+ total_rows = df.shape[0]
40
+ ref_total_rows = ref_df.shape[0]
41
+
42
+ # Creating bottom and top limits for search data
43
+ search_ranges = []
44
+ for start in range(0, total_rows, batch_size):
45
+ end = min(start + batch_size - 1, total_rows - 1) # Adjusted to get the top limit
46
+ search_ranges.append((start, end))
47
+
48
+ # Creating bottom and top limits for reference data
49
+ ref_ranges = []
50
+ for start in range(0, ref_total_rows, ref_batch_size):
51
+ end = min(start + ref_batch_size - 1, ref_total_rows - 1) # Adjusted to get the top limit
52
+ ref_ranges.append((start, end))
53
+
54
+ # Create DataFrame with combinations of search_range and ref_range
55
+ result_data = []
56
+ for search_range in search_ranges:
57
+ for ref_range in ref_ranges:
58
+ result_data.append((search_range, ref_range))
59
+
60
+ range_df = pd.DataFrame(result_data, columns=['search_range', 'ref_range'])
61
+
62
+ return range_df
63
+
64
+
65
+ def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int, search_postcode_col:str, ref_postcode_col:str):
66
+ '''
67
+ Create batches of address indexes for search and reference dataframes based on shortened postcodes.
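+ Postcodes are lower-cased, stripped of whitespace and truncated by one character before grouping, so e.g. 'SW2 1AB' becomes 'sw21a' and all search and reference rows sharing that stub fall into the same batch.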
68
+ '''
69
+
70
+ # If df sizes are smaller than the batch size limits, no need to run through everything
71
+ if len(df) < batch_size and len(ref_df) < ref_batch_size:
72
+ print("Dataframe sizes are smaller than maximum batch sizes, no need to split data.")
73
+ lengths_df = pd.DataFrame(data={'search_range':[df.index.tolist()], 'ref_range':[ref_df.index.tolist()], 'batch_length':len(df), 'ref_length':len(ref_df)})
74
+ return lengths_df
75
+
76
+ #df.index = df[search_postcode_col]
77
+
78
+ df['index'] = df.index
79
+ ref_df['index'] = ref_df.index
80
+
81
+ # Remove the last character of postcode
82
+ df['postcode_minus_last_character'] = df[search_postcode_col].str.lower().str.strip().str.replace(r"\s+", "", regex=True).str[:-1]
83
+ ref_df['postcode_minus_last_character'] = ref_df[ref_postcode_col].str.lower().str.strip().str.replace(r"\s+", "", regex=True).str[:-1]
84
+
85
+ unique_postcodes = df['postcode_minus_last_character'][df['postcode_minus_last_character'].str.len()>=4].unique().tolist()
86
+
87
+ df = df.set_index('postcode_minus_last_character')
88
+ ref_df = ref_df.set_index('postcode_minus_last_character')
89
+
90
+ df = df.sort_index()
91
+ ref_df = ref_df.sort_index()
92
+
93
+ #df.to_csv("batch_search_df.csv")
94
+
95
+ # Overall batch variables
96
+ batch_indexes = []
97
+ ref_indexes = []
98
+ batch_lengths = []
99
+ ref_lengths = []
100
+
101
+ # Current batch variables for loop
102
+ current_batch = []
103
+ current_ref_batch = []
104
+ current_batch_length = []
105
+ current_ref_length = []
106
+
107
+ unique_postcodes_iterator = unique_postcodes.copy()
108
+
109
+ while unique_postcodes_iterator:
110
+
111
+ unique_postcodes_loop = unique_postcodes_iterator.copy()
112
+
113
+ #print("Current loop postcodes: ", unique_postcodes_loop)
114
+
115
+ for current_postcode in unique_postcodes_loop:
116
+
117
+
118
+
119
+ if len(current_batch) >= batch_size or len(current_ref_batch) >= ref_batch_size:
120
+ print("Batch length reached - breaking")
121
+ break
122
+
123
+ try:
124
+ current_postcode_search_data_add = df.loc[[current_postcode]]#[df['postcode_minus_last_character'].isin(current_postcode)]
125
+ current_postcode_ref_data_add = ref_df.loc[[current_postcode]]#[ref_df['postcode_minus_last_character'].isin(current_postcode)]
126
+
127
+ #print(current_postcode_search_data_add)
128
+
129
+ if not current_postcode_search_data_add.empty:
130
+ current_batch.extend(current_postcode_search_data_add['index'])
131
+
132
+ if not current_postcode_ref_data_add.empty:
133
+ current_ref_batch.extend(current_postcode_ref_data_add['index'])
134
+
135
+ except KeyError:
136
+ #print("postcode not found: ", current_postcode)
137
+ pass
138
+
139
+ unique_postcodes_iterator.remove(current_postcode)
140
+
141
+ # Append the batch data to the master lists and reset lists
142
+ batch_indexes.append(current_batch)
143
+ ref_indexes.append(current_ref_batch)
144
+
145
+ current_batch_length = len(current_batch)
146
+ current_ref_length = len(current_ref_batch)
147
+
148
+ batch_lengths.append(current_batch_length)
149
+ ref_lengths.append(current_ref_length)
150
+
151
+ current_batch = []
152
+ current_ref_batch = []
153
+ current_batch_length = []
154
+ current_ref_length = []
155
+
156
+ # Create df to store lengths
157
+ lengths_df = pd.DataFrame(data={'search_range':batch_indexes, 'ref_range':ref_indexes, 'batch_length':batch_lengths, 'ref_length':ref_lengths})
158
+
159
+ return lengths_df
160
+
161
+
162
+ def run_matcher(in_text, in_file, in_ref, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
163
+ '''
164
+ Split search and reference data into batches. Loop and run through the match script.
165
+ '''
166
+
167
+ overall_tic = time.perf_counter()
168
+
169
+ # Load in initial data. This will filter to relevant addresses in the search and reference datasets that can potentially be matched, and will pull in API data if asked for.
170
+ InitMatch = load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, InitMatch, in_api, in_api_key)
171
+
172
+ if InitMatch.search_df.empty or InitMatch.ref_df.empty:
173
+ out_message = "Nothing to match!"
174
+ print(out_message)
175
+ return out_message, [InitMatch.results_orig_df_name, InitMatch.match_outputs_name]
176
+
177
+ # Run initial address preparation and standardisation processes
178
+ # Prepare address format
179
+
180
+ # Polars implementation not yet finalised
181
+ #InitMatch.search_df = pl.from_pandas(InitMatch.search_df)
182
+ #InitMatch.ref_df = pl.from_pandas(InitMatch.ref_df)
183
+
184
+
185
+ # Prepare all search addresses
186
+ if type(InitMatch.search_df) == str:
187
+ InitMatch.search_df_cleaned, InitMatch.search_df_key_field, InitMatch.search_address_cols = prepare_search_address_string(InitMatch.search_df)
188
+ else:
189
+ InitMatch.search_df_cleaned = prepare_search_address(InitMatch.search_df, InitMatch.search_address_cols, InitMatch.search_postcode_col, InitMatch.search_df_key_field)
190
+
191
+ # Remove addresses that are not postal addresses
192
+ InitMatch.search_df_cleaned = remove_non_postal(InitMatch.search_df_cleaned, "full_address")
193
+
194
+ # Remove addresses that have no numbers in from consideration
195
+ InitMatch.search_df_cleaned = check_no_number_addresses(InitMatch.search_df_cleaned, "full_address")
196
+
197
+ # Initial preparation of reference addresses
198
+ InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)
199
+
200
+
201
+ # Sort dataframes by postcode - will allow for more efficient matching process if using multiple batches
202
+ #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.sort_values(by="postcode")
203
+ #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.sort_values(by="Postcode")
204
+
205
+ # Polars implementation - not finalised
206
+ #InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.to_pandas()
207
+ #InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.to_pandas()
208
+
209
+ # Standardise addresses
210
+ # Standardise - minimal
211
+
212
+
213
+ tic = time.perf_counter()
214
+ InitMatch.search_df_after_stand, InitMatch.ref_df_after_stand = standardise_wrapper_func(
215
+ InitMatch.search_df_cleaned.copy(),
216
+ InitMatch.ref_df_cleaned.copy(),
217
+ standardise = False,
218
+ filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
219
+ match_task="fuzzy") # InitMatch.search_df_after_stand_series, InitMatch.ref_df_after_stand_series
220
+
221
+ toc = time.perf_counter()
222
+ print(f"Performed the minimal standardisation step in {toc - tic:0.1f} seconds")
223
+
224
+ # Standardise - full
225
+ tic = time.perf_counter()
226
+ InitMatch.search_df_after_full_stand, InitMatch.ref_df_after_full_stand = standardise_wrapper_func(
227
+ InitMatch.search_df_cleaned.copy(),
228
+ InitMatch.ref_df_cleaned.copy(),
229
+ standardise = True,
230
+ filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
231
+ match_task="fuzzy") # , InitMatch.search_df_after_stand_series_full_stand, InitMatch.ref_df_after_stand_series_full_stand
232
+
233
+ toc = time.perf_counter()
234
+ print(f"Performed the full standardisation step in {toc - tic:0.1f} seconds")
235
+
236
+ # Determine length of search df to create batches to send through the functions.
237
+ #try:
238
+ range_df = create_batch_ranges(InitMatch.search_df_cleaned.copy(), InitMatch.ref_df_cleaned.copy(), batch_size, ref_batch_size, "postcode", "Postcode")
239
+ #except:
240
+ # range_df = create_simple_batch_ranges(InitMatch.search_df_cleaned, InitMatch.ref_df_cleaned, batch_size, #ref_batch_size)
241
+
242
+ print("Batches to run in this session: ", range_df)
243
+
244
+ OutputMatch = copy.copy(InitMatch)
245
+
246
+ n = 0
247
+ number_of_batches = range_df.shape[0]
248
+
249
+ for row in progress.tqdm(range(0,len(range_df)), desc= "Running through batches", unit="batches", total=number_of_batches):
250
+ print("Running batch ", str(n+1))
251
+
252
+ search_range = range_df.iloc[row]['search_range']
253
+ ref_range = range_df.iloc[row]['ref_range']
254
+
255
+ #print("search_range: ", search_range)
256
+ #pd.DataFrame(search_range).to_csv("search_range.csv")
257
+ #print("ref_range: ", ref_range)
258
+
259
+ BatchMatch = copy.copy(InitMatch)
260
+
261
+ # Subset the search and reference dfs based on current batch ranges
262
+ # BatchMatch.search_df = BatchMatch.search_df.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
263
+ # BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
264
+ # BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
265
+ # BatchMatch.ref_df = BatchMatch.ref_df.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
266
+ # BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
267
+
268
+
269
+ # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.iloc[search_range[0]:search_range[1] + 1]
270
+ # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.iloc[ref_range[0]:ref_range[1] + 1]
271
+ # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.iloc[search_range[0]:search_range[1] + 1]
272
+ # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.iloc[ref_range[0]:ref_range[1] + 1]
273
+
274
+ # BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
275
+ # BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
276
+ # BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
277
+ # BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
278
+
279
+ BatchMatch.search_df = BatchMatch.search_df[BatchMatch.search_df.index.isin(search_range)].reset_index(drop=True)
280
+ BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
281
+ BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned[BatchMatch.search_df_cleaned.index.isin(search_range)].reset_index(drop=True)
282
+
283
+ BatchMatch.ref_df = BatchMatch.ref_df[BatchMatch.ref_df.index.isin(ref_range)].reset_index(drop=True)
284
+ BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned[BatchMatch.ref_df_cleaned.index.isin(ref_range)].reset_index(drop=True)
285
+
286
+ # Dataframes after standardisation process
287
+ BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand[BatchMatch.search_df_after_stand.index.isin(search_range)].reset_index(drop=True)
288
+ BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand[BatchMatch.search_df_after_full_stand.index.isin(search_range)].reset_index(drop=True)
289
+
290
+ ### Create lookup lists for fuzzy matches
291
+ # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand.copy().set_index('postcode_search')['search_address_stand']
292
+ # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_full_stand.copy().set_index('postcode_search')['search_address_stand']
293
+ # BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.sort_index()
294
+ # BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.sort_index()
295
+
296
+ #BatchMatch.search_df_after_stand.reset_index(inplace=True, drop = True)
297
+ #BatchMatch.search_df_after_full_stand.reset_index(inplace=True, drop = True)
298
+
299
+ BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand[BatchMatch.ref_df_after_stand.index.isin(ref_range)].reset_index(drop=True)
300
+ BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand[BatchMatch.ref_df_after_full_stand.index.isin(ref_range)].reset_index(drop=True)
301
+
302
+ # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand.copy().set_index('postcode_search')['ref_address_stand']
303
+ # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_full_stand.copy().set_index('postcode_search')['ref_address_stand']
304
+ # BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.sort_index()
305
+ # BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.sort_index()
306
+
307
+ # BatchMatch.ref_df_after_stand.reset_index(inplace=True, drop=True)
308
+ # BatchMatch.ref_df_after_full_stand.reset_index(inplace=True, drop=True)
309
+
310
+ # Match the data, unless the search or reference dataframes are empty
311
+ if BatchMatch.search_df.empty or BatchMatch.ref_df.empty:
312
+ out_message = "Nothing to match for batch: " + str(n)
313
+ print(out_message)
314
+ BatchMatch_out = BatchMatch
315
+ BatchMatch_out.results_on_orig_df = pd.DataFrame(data={"index":BatchMatch.search_df.index,
316
+ "Excluded from search":False,
317
+ "Matched with reference address":False})
318
+ else:
319
+ summary_of_summaries, BatchMatch_out = run_match_batch(BatchMatch, n, number_of_batches)
320
+
321
+ OutputMatch = combine_two_matches(OutputMatch, BatchMatch_out, "All up to and including batch " + str(n+1))
322
+
323
+ n += 1
324
+
325
+ if in_api:
326
+ OutputMatch.results_on_orig_df['Matched with reference address'] = OutputMatch.results_on_orig_df['Matched with reference address'].replace({1:True, 0:False})
327
+ OutputMatch.results_on_orig_df['Excluded from search'] = OutputMatch.results_on_orig_df['Excluded from search'].replace('nan', False).fillna(False)
328
+
329
+ # Remove any duplicates from reference df, prioritise successful matches
330
+ OutputMatch.results_on_orig_df = OutputMatch.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
331
+
332
+
333
+ overall_toc = time.perf_counter()
334
+ time_out = f"The overall match (all batches) took {overall_toc - overall_tic:0.1f} seconds"
335
+
336
+ print(OutputMatch.output_summary)
337
+
338
+ if OutputMatch.output_summary == "":
339
+ OutputMatch.output_summary = "No matches were found."
340
+
341
+ fuzzy_not_std_output = OutputMatch.match_results_output.copy()
342
+ fuzzy_not_std_output_mask = ~(fuzzy_not_std_output["match_method"].str.contains("Fuzzy match")) | (fuzzy_not_std_output["standardised_address"] == True)
343
+ fuzzy_not_std_output.loc[fuzzy_not_std_output_mask, "full_match"] = False
344
+ fuzzy_not_std_summary = create_match_summary(fuzzy_not_std_output, "Fuzzy not standardised")
345
+
346
+ fuzzy_std_output = OutputMatch.match_results_output.copy()
347
+ fuzzy_std_output_mask = fuzzy_std_output["match_method"].str.contains("Fuzzy match")
348
+ fuzzy_std_output.loc[fuzzy_std_output_mask == False, "full_match"] = False
349
+ fuzzy_std_summary = create_match_summary(fuzzy_std_output, "Fuzzy standardised")
350
+
351
+ nnet_std_output = OutputMatch.match_results_output.copy()
352
+ nnet_std_summary = create_match_summary(nnet_std_output, "Neural net standardised")
353
+
354
+ final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out
355
+
356
+ return final_summary, [OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name]
357
+
358
+ # Create the gradio interface
359
+
360
+ block = gr.Blocks(theme = gr.themes.Base())
361
+
362
+ with block:
363
+
364
+ data_state = gr.State(pd.DataFrame())
365
+ ref_data_state = gr.State(pd.DataFrame())
366
+ results_data_state = gr.State(pd.DataFrame())
367
+ ref_results_data_state =gr.State(pd.DataFrame())
368
+
369
+ gr.Markdown(
370
+ """
371
+ # Address matcher
372
+ Match single or multiple addresses to the reference address file of your choice. Fuzzy matching should work on any address columns as long as you specify the postcode column at the end. The neural network component only activates with the in-house neural network model, and works with LLPG files in the LPI format - contact me for details if you already have access to AddressBase.
373
+
374
+ The tool accepts csv, xlsx (with one sheet) and parquet files. You
375
+ need to specify the address columns of the file to match in the address column area, with the postcode column last.
376
+
377
+ Use the 'New Column' button to create a new cell for each column name. After you have chosen a reference file, an address match file, and specified its address columns (plus postcode), you can press 'Match addresses' to run the tool.
378
+ """)
379
+
380
+ with gr.Tab("Match addresses"):
381
+
382
+ with gr.Accordion("I have multiple addresses", open = True):
383
+ in_file = gr.File(label="Input addresses from file", file_count= "multiple")
384
+ in_colnames = gr.Dropdown(choices=[], multiselect=True, label="Select columns that make up the address. Make sure postcode is at the end")
385
+ in_existing = gr.Dropdown(choices=[], multiselect=False, label="Select columns that indicate existing matches.")
386
+
387
+ with gr.Accordion("I only have a single address", open = False):
388
+ in_text = gr.Textbox(label="Input a single address as text")
389
+
390
+ gr.Markdown(
391
+ """
392
+ ## Choose reference file
393
+ Fuzzy matching will work on any address format, but the neural network will only work with the LLPG LPI format, e.g. with columns SaoText, SaoStartNumber etc.. This joins on the UPRN column. If any of these are different for you,
394
+ open 'Custom reference file format or join columns' below.
395
+ """)
396
+
397
+ in_ref = gr.File(label="Input reference addresses from file", file_count= "multiple")
398
+
399
+ with gr.Accordion("Use Addressbase API instead of reference file", open = False):
400
+ in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
401
+ in_api_key = gr.Textbox(label="Addressbase API key")
402
+
403
+ with gr.Accordion("Custom reference file format or join columns (i.e. not LLPG LPI format)", open = False):
404
+ in_refcol = gr.Dropdown(choices=[], multiselect=True, label="Select columns that make up the reference address. Make sure postcode is at the end")
405
+ in_joincol = gr.Dropdown(choices=[], multiselect=True, label="Select columns you want to join on to the search dataset")
406
+
407
+ match_btn = gr.Button("Match addresses")
408
+
409
+ with gr.Row():
410
+ output_summary = gr.Textbox(label="Output summary")
411
+ output_file = gr.File(label="Output file")
412
+
413
+ with gr.Tab(label="Advanced options"):
414
+ with gr.Accordion(label = "AWS data access", open = False):
415
+ aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
416
+ with gr.Row():
417
+ in_aws_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth address data example file"])
418
+ load_aws_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
419
+
420
+ aws_log_box = gr.Textbox(label="AWS data load status")
421
+
422
+
423
+ ### Loading AWS data ###
424
+ load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_ref, aws_log_box])
425
+
426
+
427
+ # Updates to components
428
+ in_file.change(fn = initial_data_load, inputs=[in_file], outputs=[output_summary, in_colnames, in_existing, data_state, results_data_state])
429
+ in_ref.change(fn = initial_data_load, inputs=[in_ref], outputs=[output_summary, in_refcol, in_joincol, ref_data_state, ref_results_data_state])
430
+
431
+ match_btn.click(fn = run_matcher, inputs=[in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, in_api, in_api_key],
432
+ outputs=[output_summary, output_file], api_name="address")
433
+
434
+ # Simple run for HF spaces or local on your computer
435
+ #block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
436
+
437
+ # Simple run for AWS server
438
+ block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
439
+
440
+ # Download OpenSSL from here:
441
+ # Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
442
+ #block.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
443
+ # ssl_certfile="cert.pem", ssl_keyfile="key.pem") # port 443 for https. Certificates currently not valid
444
+
445
+ # Running on local server without https
446
+ #block.queue().launch(server_name="0.0.0.0", server_port=7861, ssl_verify=False)
447
+
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ #fuzzywuzzy==0.18.0
2
+ numpy==1.26.2
3
+ pandas==2.2.1
4
+ rapidfuzz==3.8.1
5
+ torch==2.2.1
6
+ recordlinkage==0.16
7
+ pyap==0.3.1
8
+ pytest==7.4.3
9
+ pyarrow==14.0.1
10
+ openpyxl==3.1.2
11
+ gradio==4.20.1
12
+ boto3==1.34.63
13
+ polars==0.20.19
tools/__init__.py ADDED
File without changes
tools/addressbase_api_funcs.py ADDED
@@ -0,0 +1,197 @@
1
+ # %%
2
+ import urllib
3
+ from datetime import datetime
4
+ import pandas as pd
5
+ import time
6
+ import requests
7
+
8
+ today_rev = datetime.now().strftime("%Y%m%d")
9
+
10
+
11
+ # url = 'https://api.os.uk/search/places/v1/uprn?%s'
12
+ # params = urllib.parse.urlencode({'uprn':<UPRN>,'dataset':'LPI', 'key':os.environ["ADDRESSBASE_API_KEY"]})
13
+
14
+ # Places API
15
+ # Technical guide: https://osdatahub.os.uk/docs/places/technicalSpecification
16
+
17
+
18
+ def places_api_query(query, api_key, query_type):
19
+
20
+ def make_api_call(url):
21
+ max_retries = 3
22
+ retries = 0
23
+
24
+ while retries < max_retries:
25
+ try:
26
+ response = requests.get(url)
27
+ if response.status_code == 200:
28
+ # If successful response, return the response
29
+ return response
30
+ elif response.status_code == 429:
31
+ # If rate limited, wait for 5 seconds before retrying
32
+ print("Rate limited. Retrying in 5 seconds...")
33
+ time.sleep(3)
34
+ retries += 1
35
+ else:
36
+ # For other errors, return the response
37
+ return response
38
+ except Exception as e:
39
+ print("Error:", str(e))
40
+ retries += 1
41
+
42
+ # If maximum retries reached, return None
43
+ return None
44
+
45
+ if api_key:
46
+
47
+ overall_tic = time.perf_counter()
48
+
49
+ #filter_code_lsc = "LOGICAL_STATUS_CODE:1"
50
+ filter_code_lpi_lsc ="LPI_LOGICAL_STATUS_CODE:1"
51
+ concat_results = []
52
+
53
+ if query_type == "Address":
54
+ url = 'https://api.os.uk/search/places/v1/find?%s'
55
+ params = urllib.parse.urlencode({'query':query,
56
+ 'dataset':'LPI',
57
+ 'key':api_key,
58
+ "maxresults" : 20,
59
+ 'minmatch':0.70, # This includes partial matches
60
+ 'matchprecision':2,
61
+ 'fq':filter_code_lpi_lsc,
62
+ 'lr':'EN'})
63
+
64
+ try:
65
+ request_text = url % params
66
+ #print(request_text)
67
+ response = make_api_call(request_text)
68
+ except Exception as e:
69
+ print(str(e))
70
+
71
+
72
+ if response is not None:
73
+ if response.status_code == 200:
74
+ # Process the response
75
+ print("Successful response")
76
+ #print("Successful response:", response.json())
77
+ else:
78
+ print("Error:", response.status_code)
79
+
80
+ else:
81
+ print("Maximum retries reached. Error occurred.")
82
+ return pd.DataFrame() # Return blank dataframe
83
+
84
+ # Load JSON response
85
+ response_data = response.json()
86
+
87
+ # Extract 'results' part
88
+ try:
89
+ results = response_data['results']
90
+ concat_results.extend(results)
91
+
92
+ except Exception as e:
93
+ print(str(e))
94
+ return pd.DataFrame() # Return blank dataframe
95
+
96
+ # If querying postcode, need to use pagination and postcode API
97
+ elif query_type == "Postcode":
98
+
99
+ max_results_requested = 100
100
+ remaining_calls = 1
101
+ totalresults = max_results_requested
102
+ call_number = 1
103
+
104
+ while remaining_calls > 0 and call_number <= 10:
105
+
106
+ offset = (call_number-1) * max_results_requested
107
+
108
+ #print("Remaining to query:", remaining_calls)
109
+
110
+
111
+ url = 'https://api.os.uk/search/places/v1/postcode?%s'
112
+ params = urllib.parse.urlencode({'postcode':query,
113
+ 'dataset':'LPI',
114
+ 'key':api_key,
115
+ "maxresults" : max_results_requested,
116
+ 'offset':offset,
117
+ #'fq':filter_code_lsc,
118
+ 'fq':filter_code_lpi_lsc,
119
+ 'lr':'EN'})
120
+
121
+ try:
122
+ request_text = url % params
123
+ #print(request_text)
124
+ response = make_api_call(request_text)
125
+ except Exception as e:
126
+ print(str(e))
127
+
128
+ if response is not None:
129
+ if response.status_code == 200:
130
+ totalresults = response.json()['header']['totalresults']
131
+
132
+ print("Successful response")
133
+ print("Total results:", totalresults)
134
+
135
+ remaining_calls = totalresults - (max_results_requested * call_number)
136
+
137
+ call_number += 1
138
+
139
+ # Concat results together
140
+ try:
141
+ results = response.json()['results']
142
+ concat_results.extend(results)
143
+ except Exception as e:
144
+ print("Result concat failed with error: ", str(e))
145
+ concat_results.append({"invalid_request":True, "POSTCODE_LOCATOR": query})
146
+
147
+ else:
148
+ print("Error:", response.status_code, "For postcode: ", query, " With query: ", request_text)
149
+ concat_results.append({"invalid_request":True, "POSTCODE_LOCATOR": query})
150
+ return pd.DataFrame(data={"invalid_request":[True], "POSTCODE_LOCATOR": [query]},index=[0]) # Return blank dataframe
151
+ else:
152
+ print("Maximum retries reached. Error occurred.")
153
+ return pd.DataFrame() # Return blank dataframe
154
+
155
+ else:
156
+ print("No API key provided.")
157
+ return pd.DataFrame() # Return blank dataframe
158
+
159
+ #print('RESPONSE:', concat_results)
160
+
161
+
162
+ # Convert 'results' to DataFrame
163
+
164
+ # Check if 'LPI' sub-branch exists in the JSON response
165
+ #print(concat_results)
166
+
167
+ if 'LPI' in concat_results[-1]:
168
+ #print("LPI in result columns")
169
+ df = pd.json_normalize(concat_results)
170
+ df.rename(columns=lambda x: x.replace('LPI.', ''), inplace=True)
171
+ else:
172
+ # Normalize the entire JSON data if 'LPI' sub-branch doesn't exist
173
+ df = pd.json_normalize(concat_results)
174
+
175
+
176
+ # Ensure df is a DataFrame, even if it has a single row
177
+ if isinstance(df, pd.Series):
178
+ print("This is a series!")
179
+ df = df.to_frame().T # Convert the Series to a DataFrame with a single row
180
+ # if isinstance(df, pd.DataFrame):
181
+ # print("This is a dataframe!")
182
+ # else:
183
+ # print("This is not a dataframe!")
184
+ # return pd.DataFrame() # Return blank dataframe
185
+
186
+
187
+ print(df)
188
+ #print(df.columns)
189
+ #df.to_csv(query + ".csv")
190
+
191
+
192
+
193
+ overall_toc = time.perf_counter()
194
+ time_out = f"The API call took {overall_toc - overall_tic:0.1f} seconds"
195
+ print(time_out)
196
+
197
+ return df
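A hedged usage sketch for places_api_query above (editorial, not part of the commit). It assumes the ADDRESSBASE_API_KEY environment variable referenced in the commented-out example at the top of this module, and the postcode shown is an arbitrary example.

```python
# Illustrative only: query the OS Places API for one postcode via the helper above.
import os
from tools.addressbase_api_funcs import places_api_query

api_key = os.environ.get("ADDRESSBASE_API_KEY", "")
lpi_df = places_api_query("SW2 1RW", api_key, query_type="Postcode")

if not lpi_df.empty and "invalid_request" not in lpi_df.columns:
    # One flattened LPI record per row (UPRN, address fields, etc.)
    print(lpi_df.head())
else:
    print("No results returned or the request was rejected.")
```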
tools/aws_functions.py ADDED
@@ -0,0 +1,166 @@
1
+ from typing import Type
2
+ import pandas as pd
3
+ import boto3
4
+ import tempfile
5
+ import os
6
+
7
+ PandasDataFrame = Type[pd.DataFrame]
8
+
9
+ bucket_name = 'address-matcher-data'
10
+
11
+ try:
12
+ session = boto3.Session(profile_name="default")
13
+ except Exception as e:
14
+ print(e)
15
+
16
+ # sts = session.client("sts")
17
+ # Create a Session with the IAM role ARN
18
+ # aws_role = os.environ['AWS_ROLE_DATA_TEXT_SEARCH']
19
+ # response = sts.assume_role(
20
+ # RoleArn=aws_role,
21
+ # RoleSessionName="ecs-test-session"
22
+ # )
23
+ # print(response)
24
+
25
+
26
+ def get_assumed_role_info():
27
+ sts = boto3.client('sts')
28
+ response = sts.get_caller_identity()
29
+
30
+ # Extract ARN of the assumed role
31
+ assumed_role_arn = response['Arn']
32
+
33
+ # Extract the name of the assumed role from the ARN
34
+ assumed_role_name = assumed_role_arn.split('/')[-1]
35
+
36
+ return assumed_role_arn, assumed_role_name
37
+
38
+ try:
39
+ assumed_role_arn, assumed_role_name = get_assumed_role_info()
40
+
41
+ print("Assumed Role ARN:", assumed_role_arn)
42
+ print("Assumed Role Name:", assumed_role_name)
43
+ except Exception as e:
44
+ print(e)
45
+
46
+ # Download direct from S3 - requires login credentials
47
+ def download_file_from_s3(bucket_name, key, local_file_path):
48
+
49
+ s3 = boto3.client('s3')
50
+ s3.download_file(bucket_name, key, local_file_path)
51
+ print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
52
+
53
+ #download_file_from_s3(bucket_name, object_key, local_file_loc)
54
+
55
+ def download_folder_from_s3(bucket_name, s3_folder, local_folder):
56
+ """
57
+ Download all files from an S3 folder to a local folder.
58
+ """
59
+ s3 = boto3.client('s3')
60
+
61
+ # List objects in the specified S3 folder
62
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
63
+
64
+ # Download each object
65
+ for obj in response.get('Contents', []):
66
+ # Extract object key and construct local file path
67
+ object_key = obj['Key']
68
+ local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
69
+
70
+ # Create directories if necessary
71
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
72
+
73
+ # Download the object
74
+ try:
75
+ s3.download_file(bucket_name, object_key, local_file_path)
76
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
77
+ except Exception as e:
78
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
79
+
80
+
81
+ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
82
+ """
83
+ Download specific files from an S3 folder to a local folder.
84
+ """
85
+ s3 = boto3.client('s3')
86
+
87
+ print("Trying to download file: ", filenames)
88
+
89
+ if filenames == '*':
90
+ # List all objects in the S3 folder
91
+ print("Trying to download all files in AWS folder: ", s3_folder)
92
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
93
+
94
+ print("Found files in AWS folder: ", response.get('Contents', []))
95
+
96
+ filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
97
+
98
+ print("Found filenames in AWS folder: ", filenames)
99
+
100
+ for filename in filenames:
101
+ object_key = os.path.join(s3_folder, filename)
102
+ local_file_path = os.path.join(local_folder, filename)
103
+
104
+ # Create directories if necessary
105
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
106
+
107
+ # Download the object
108
+ try:
109
+ s3.download_file(bucket_name, object_key, local_file_path)
110
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
111
+ except Exception as e:
112
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
113
+
114
+
115
+
116
+ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
117
+
118
+ temp_dir = tempfile.mkdtemp()
119
+ local_address_stub = temp_dir + '/address-match/'
120
+ files = []
121
+
122
+ if 'LAMBETH_ADDRESS_PASSWORD' not in os.environ:
123
+ out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
124
+ return files, out_message
125
+
126
+ if aws_password:
127
+ if "Lambeth address data example file" in in_aws_keyword_file and aws_password == os.environ['LAMBETH_ADDRESS_PASSWORD']:
128
+
129
+ s3_folder_stub = 'example-data/lambeth-address-data/latest/'
130
+
131
+ local_folder_path = local_address_stub
132
+
133
+ # Check if folder exists
134
+ if not os.path.exists(local_folder_path):
135
+ print(f"Folder {local_folder_path} does not exist! Making folder.")
136
+
137
+ os.mkdir(local_folder_path)
138
+
139
+ # Check if folder is empty
140
+ if len(os.listdir(local_folder_path)) == 0:
141
+ print(f"Folder {local_folder_path} is empty")
142
+ # Download data
143
+ download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
144
+
145
+ print("AWS data downloaded")
146
+
147
+ else:
148
+ print(f"Folder {local_folder_path} is not empty")
149
+
150
+ #files = os.listdir(local_folder_stub)
151
+ #print(files)
152
+
153
+ files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
154
+
155
+ out_message = "Data successfully loaded from AWS"
156
+ print(out_message)
157
+
158
+ else:
159
+ out_message = "Data not loaded from AWS"
160
+ print(out_message)
161
+ else:
162
+ out_message = "No password provided. Please ask the data team for access if you need this."
163
+ print(out_message)
164
+
165
+ return files, out_message
166
+
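A brief, hedged usage sketch for load_data_from_aws above (editorial, not part of the commit). It assumes a working AWS session for the address-matcher-data bucket and that the LAMBETH_ADDRESS_PASSWORD environment variable checked by the function is set.

```python
# Illustrative only: pull the example Lambeth reference files from S3.
import os
from tools.aws_functions import load_data_from_aws

files, message = load_data_from_aws(
    "Lambeth address data example file",  # keyword the function looks for
    aws_password=os.environ.get("LAMBETH_ADDRESS_PASSWORD", ""),
)
print(message)  # e.g. "Data successfully loaded from AWS" or an explanation of why not
print(files)    # local paths to the downloaded reference files, ready for the in_ref input
```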
tools/constants.py ADDED
@@ -0,0 +1,435 @@
1
+ import os
2
+ import pandas as pd
3
+ import pickle
4
+ import torch
5
+ import zipfile
6
+ from typing import List, Union, Type, Dict
7
+ from pydantic import BaseModel
8
+
9
+ from .pytorch_models import *
10
+
11
+ PandasDataFrame = Type[pd.DataFrame]
12
+ PandasSeries = Type[pd.Series]
13
+
14
+ # +
15
+ ''' Fuzzywuzzy/Rapidfuzz scorer to use. Options are: ratio, partial_ratio, token_sort_ratio, partial_token_sort_ratio,
16
+ token_set_ratio, partial_token_set_ratio, QRatio, UQRatio, WRatio (default), UWRatio
17
+ details here: https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings'''
18
+
19
+ fuzzy_scorer_used = "token_set_ratio"
20
+
21
+ # +
22
+ fuzzy_match_limit = 85
23
+
24
+ fuzzy_search_addr_limit = 20
25
+
26
+ filter_to_lambeth_pcodes= True
27
+ # -
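An illustrative sketch (not part of the commit) of how the chosen scorer behaves, using rapidfuzz directly; fuzzy_match_limit = 85 is then applied to scores on this 0-100 scale:

from rapidfuzz import fuzz

a = "FLAT 2, 10 DOWNING STREET, LONDON"
b = "10 DOWNING ST, FLAT 2, LONDON"

# token_set_ratio ignores token order and duplication, so re-ordered addresses
# still score highly; the plain ratio scorer is order-sensitive and usually scores lower here.
print(fuzz.token_set_ratio(a, b))
print(fuzz.ratio(a, b))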
28
+
29
+ standardise = False
30
+
31
+ # +
32
+ if standardise == True:
33
+ std = "_std"
34
+ if standardise == False:
35
+ std = "_not_std"
36
+
37
+ dataset_name = "data" + std
38
+
39
+ suffix_used = dataset_name + "_" + fuzzy_scorer_used
40
+
41
+ # https://stackoverflow.com/questions/59221557/tensorflow-v2-replacement-for-tf-contrib-predictor-from-saved-model
42
+
43
+ ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))
44
+ print(ROOT_DIR)
45
+
46
+ # Uncomment these lines for the tensorflow model
47
+ #model_type = "tf"
48
+ #model_stub = "addr_model_out_lon"
49
+ #model_version = "00000001"
50
+ #file_step_suffix = "550" # I add a suffix to output files to be able to separate comparisons of test data from the same model with different steps e.g. '350' indicates a model that has been through 350,000 steps of training
51
+
52
+ # Uncomment these lines for the pytorch model
53
+ model_type = "lstm"
54
+ model_stub = "pytorch/lstm"
55
+ model_version = ""
56
+ file_step_suffix = ""
57
+ data_sample_size = 476887
58
+ N_EPOCHS = 10
59
+ max_predict_len = 12000
60
+
61
+ word_to_index = {}
62
+ cat_to_idx = {}
63
+ vocab = []
64
+ device = "cpu"
65
+
66
+ global labels_list
67
+ labels_list = []
68
+
69
+ model_dir_name = os.path.join(ROOT_DIR, "nnet_model" , model_stub , model_version)
70
+ print(model_dir_name)
71
+
72
+ model_path = os.path.join(model_dir_name, "saved_model.zip")
73
+ print("model path: ")
74
+ print(model_path)
75
+
76
+ if os.path.exists(model_path):
77
+
78
+ os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # Better to go without GPU to avoid 'out of memory' issues
79
+ device = "cpu"
80
+
81
+
82
+
83
+ ## The labels_list object defines the structure of the prediction outputs. It must be the same as what the model was originally trained on
84
+
85
+
86
+
87
+ ''' Load pre-trained model '''
88
+
89
+
90
+
91
+ with zipfile.ZipFile(model_path,"r") as zip_ref:
92
+ zip_ref.extractall(model_dir_name)
93
+
94
+ # if model_stub == "addr_model_out_lon":
95
+
96
+ #import tensorflow as tf
97
+
98
+ #tf.config.list_physical_devices('GPU')
99
+
100
+ # # Number of labels in total (+1 for the blank category)
101
+ # n_labels = len(labels_list) + 1
102
+
103
+ # # Allowable characters for the encoded representation
104
+ # vocab = list(string.digits + string.ascii_lowercase + string.punctuation + string.whitespace)
105
+
106
+ # #print("Loading TF model")
107
+
108
+ # exported_model = tf.saved_model.load(model_dir_name)
109
+
110
+ # labels_list = [
111
+ # 'SaoText', # 1
112
+ # 'SaoStartNumber', # 2
113
+ # 'SaoStartSuffix', # 3
114
+ # 'SaoEndNumber', # 4
115
+ # 'SaoEndSuffix', # 5
116
+ # 'PaoText', # 6
117
+ # 'PaoStartNumber', # 7
118
+ # 'PaoStartSuffix', # 8
119
+ # 'PaoEndNumber', # 9
120
+ # 'PaoEndSuffix', # 10
121
+ # 'Street', # 11
122
+ # 'PostTown', # 12
123
+ # 'AdministrativeArea', #13
124
+ # 'Postcode' # 14
125
+ # ]
126
+
127
+ if "pytorch" in model_stub:
128
+
129
+ labels_list = [
130
+ 'SaoText', # 1
131
+ 'SaoStartNumber', # 2
132
+ 'SaoStartSuffix', # 3
133
+ 'SaoEndNumber', # 4
134
+ 'SaoEndSuffix', # 5
135
+ 'PaoText', # 6
136
+ 'PaoStartNumber', # 7
137
+ 'PaoStartSuffix', # 8
138
+ 'PaoEndNumber', # 9
139
+ 'PaoEndSuffix', # 10
140
+ 'Street', # 11
141
+ 'PostTown', # 12
142
+ 'AdministrativeArea', #13
143
+ 'Postcode', # 14
144
+ 'IGNORE'
145
+ ]
146
+
147
+ #labels_list.to_csv("labels_list.csv", index = None)
148
+
149
+ if (model_type == "transformer") | (model_type == "gru") | (model_type == "lstm") :
150
+ # Load vocab and word_to_index
151
+ with open(model_dir_name + "vocab.txt", "r") as f:
152
+ vocab = eval(f.read())
153
+ with open(model_dir_name + "/word_to_index.txt", "r") as f:
154
+ word_to_index = eval(f.read())
155
+ with open(model_dir_name + "/cat_to_idx.txt", "r") as f:
156
+ cat_to_idx = eval(f.read())
157
+
158
+ VOCAB_SIZE = len(word_to_index)
159
+ OUTPUT_DIM = len(cat_to_idx) + 1 # Number of classes/categories
160
+ EMBEDDING_DIM = 48
161
+ DROPOUT = 0.1
162
+ PAD_TOKEN = 0
163
+
164
+
165
+ if model_type == "transformer":
166
+ NHEAD = 4
167
+ NUM_ENCODER_LAYERS = 1
168
+
169
+ exported_model = TransformerClassifier(VOCAB_SIZE, EMBEDDING_DIM, NHEAD, NUM_ENCODER_LAYERS, OUTPUT_DIM, DROPOUT, PAD_TOKEN)
170
+
171
+ elif model_type == "gru":
172
+ N_LAYERS = 3
173
+ HIDDEN_DIM = 128
174
+ exported_model = TextClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, PAD_TOKEN)
175
+
176
+ elif model_type == "lstm":
177
+ N_LAYERS = 3
178
+ HIDDEN_DIM = 128
179
+
180
+ exported_model = LSTMTextClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, PAD_TOKEN)
181
+
182
+
183
+ exported_model.load_state_dict(torch.load(model_dir_name + "output_model_" + str(data_sample_size) +\
184
+ "_" + str(N_EPOCHS) + "_" + model_type + ".pth", map_location=torch.device('cpu')))
185
+ exported_model.eval()
186
+
187
+ device='cpu'
188
+ #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
189
+ exported_model.to(device)
190
+
191
+
192
+ else:
193
+ exported_model = [] #tf.keras.models.load_model(model_dir_name, compile=False)
194
+ # Compile the model with a loss function and an optimizer
195
+ #exported_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['categorical_crossentropy'])
196
+
197
+ else: exported_model = []
198
+
199
+ #if exported_model:
200
+ # exported_model = exported_model
201
+ #else: exported_model = []
202
+
203
+
204
+
205
+ # +
206
+ # Address matcher will try to match <batch_size> records in one go to avoid exceeding memory limits.
207
+ batch_size = 10000
208
+ ref_batch_size = 150000
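A minimal sketch of the batching idea described in the comment above (illustrative; the real batching is handled in the matcher functions):

import pandas as pd

def iter_batches(df: pd.DataFrame, batch_size: int = 10000):
    """Yield successive slices of a dataframe, each at most batch_size rows long."""
    for start in range(0, len(df), batch_size):
        yield df.iloc[start:start + batch_size]

# e.g. process search records in bounded chunks to avoid exceeding memory limits:
# for batch in iter_batches(search_df, batch_size):
#     match_batch(batch)  # hypothetical per-batch matching call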
209
+
210
+ ### Fuzzy match method
211
+
212
+ ''' https://recordlinkage.readthedocs.io/en/latest/ref_df-compare.html#recordlinkage.compare.String
213
+ The Python Record Linkage Toolkit uses the jellyfish package for the Jaro, Jaro-Winkler, Levenshtein and Damerau- Levenshtein algorithms.
214
+ Options are [‘jaro’, ‘jarowinkler’, ‘levenshtein’, ‘damerau_levenshtein’, ‘qgram’, ‘cosine’, ‘smith_waterman’, ‘lcs’]
215
+
216
+ Comparison of some of the Jellyfish string comparison methods: https://manpages.debian.org/testing/python-jellyfish-doc/jellyfish.3.en.html '''
217
+
218
+
219
+ fuzzy_method = "jarowinkler"
220
+
221
+ # Required overall match score for all columns to count as a match
222
+ score_cut_off = 98.7 # 97.5
223
+ # A higher score cut-off is used for nnet street blocking, based on empirical data: below this value errors started to appear. The empirical value was 99.238, but it is set to 99.5 here to be maximally stringent. It is applied in the score_based_match function in 'recordlinkage_funcs.py'
224
+ score_cut_off_nnet_street = 99.5 # 99.238
225
+ # If there are no numbers in the address, then the matcher needs to get a perfect score (otherwise too many issues).
226
+ no_number_fuzzy_match_limit = 100
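A rough illustration of the jarowinkler method selected above (a sketch assuming a recent jellyfish release, which exposes jaro_winkler_similarity; older releases name it jaro_winkler):

import jellyfish

s1 = "queens road"
s2 = "queens rd"

# Jaro-Winkler gives extra weight to a shared prefix, which suits addresses
# that share the same street-name stem; the result is between 0 and 1.
print(jellyfish.jaro_winkler_similarity(s1, s2))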
227
+
228
+ # Reference data 'official' column names
229
+ ref_address_cols = ["Organisation", "SaoStartNumber", "SaoStartSuffix", "SaoEndNumber", "SaoEndSuffix",
230
+ "SaoText", "PaoStartNumber", "PaoStartSuffix", "PaoEndNumber",
231
+ "PaoEndSuffix", "PaoText", "Street", "PostTown", "Postcode"]
232
+
233
+ # Create a list of matching variables. Text columns will be fuzzy matched.
234
+ matching_variables = ref_address_cols
235
+ text_columns = ["Organisation", "PaoText", "Street", "PostTown", "Postcode"]
236
+
237
+ # Modify relative importance of columns (weights) for the recordlinkage part of the match. Modify weighting for scores - Town and AdministrativeArea are not very important as we have postcode. Street number and name are important
238
+ Organisation_weight = 0.1 # Organisation weight is very low just to resolve tie breakers for very similar addresses
239
+ PaoStartNumber_weight = 2
240
+ SaoStartNumber_weight = 2
241
+ Street_weight = 2
242
+ PostTown_weight = 0
243
+ Postcode_weight = 0.5
244
+ AdministrativeArea_weight = 0
245
+ # -
246
+
247
+ weight_vals = [1] * len(ref_address_cols)
248
+ weight_keys = ref_address_cols
249
+ weights = {weight_keys[i]: weight_vals[i] for i in range(len(weight_keys))}
250
+
251
+ # +
252
+ # Modify weighting for scores - Town and AdministrativeArea are not very important as we have postcode. Street number and name are important
253
+
254
+ weights["Organisation"] = Organisation_weight
255
+ weights["SaoStartNumber"] = SaoStartNumber_weight
256
+ weights["PaoStartNumber"] = PaoStartNumber_weight
257
+ weights["Street"] = Street_weight
258
+ weights["PostTown"] = PostTown_weight
259
+ weights["Postcode"] = Postcode_weight
260
+
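A self-contained sketch of how per-column similarity scores could be combined with weights like those above into a single 0-100 score (illustrative only; the actual combination is performed by the recordlinkage-based code in recordlinkage_funcs.py):

# Hypothetical per-column similarity scores (0-1) for one candidate pair,
# combined with weights mirroring the values set above.
example_scores = {"PaoStartNumber": 1.0, "SaoStartNumber": 1.0, "Street": 0.95, "Postcode": 1.0, "Organisation": 0.2}
example_weights = {"PaoStartNumber": 2, "SaoStartNumber": 2, "Street": 2, "Postcode": 0.5, "Organisation": 0.1}

total_weight = sum(example_weights.values())
weighted_score = 100 * sum(example_weights[c] * s for c, s in example_scores.items()) / total_weight
print(weighted_score)  # compared against a cut-off such as score_cut_off = 98.7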
261
+ # Creating Pydantic basemodel class
262
+
263
+
264
+ class MatcherClass(BaseModel):
265
+ # Fuzzy/general attributes
266
+ fuzzy_scorer_used: str
267
+ fuzzy_match_limit: int
268
+ fuzzy_search_addr_limit: int
269
+ filter_to_lambeth_pcodes: bool
270
+ standardise: bool
271
+ suffix_used: str
272
+
273
+ # Neural net attributes
274
+ matching_variables: List[str]
275
+ model_dir_name: str
276
+ file_step_suffix: str
277
+ exported_model: List
278
+
279
+ fuzzy_method: str
280
+ score_cut_off: float
281
+ text_columns: List[str]
282
+ weights: dict
283
+ model_type: str
284
+ labels_list: List[str]
285
+
286
+ # These are variables that are added on later
287
+ # Pytorch optional variables
288
+ word_to_index: dict
289
+ cat_to_idx: dict
290
+ device: str
291
+ vocab: List[str]
292
+
293
+ # Join data
294
+ file_name: str
295
+ ref_name: str
296
+ search_df: pd.DataFrame
297
+ excluded_df: pd.DataFrame
298
+ pre_filter_search_df: pd.DataFrame
299
+ search_address_cols: List[str]
300
+ search_postcode_col: List[str]
301
+ search_df_key_field: str
302
+ ref_df: pd.DataFrame
303
+ ref_pre_filter: pd.DataFrame
304
+ ref_address_cols: List[str]
305
+ new_join_col: List[str]
306
+ #in_joincol_list: List[str]
307
+ existing_match_cols: List[str]
308
+ standard_llpg_format: List[str]
309
+
310
+ # Results attributes
311
+ match_results_output: pd.DataFrame
312
+ predict_df_nnet: pd.DataFrame
313
+
314
+ # Other attributes generated during training
315
+ compare_all_candidates: List[str]
316
+ diag_shortlist: List[str]
317
+ diag_best_match: List[str]
318
+
319
+ results_on_orig_df: pd.DataFrame
320
+
321
+ summary: str
322
+ output_summary: str
323
+ match_outputs_name: str
324
+ results_orig_df_name: str
325
+
326
+ search_df_after_stand: pd.DataFrame
327
+ ref_df_after_stand: pd.DataFrame
328
+ search_df_after_full_stand: pd.DataFrame
329
+ ref_df_after_full_stand: pd.DataFrame
330
+
331
+ search_df_after_stand_series: pd.Series
332
+ ref_df_after_stand_series: pd.Series
333
+ search_df_after_stand_series_full_stand: pd.Series
334
+ ref_df_after_stand_series_full_stand: pd.Series
335
+
336
+
337
+ # Abort flag if the matcher couldn't even get the results of the first match
338
+ abort_flag: bool
339
+
340
+ # This is to allow for Pandas DataFrame types as an argument
341
+ class Config:
342
+ # Allow for custom types such as Pandas DataFrames in the class
343
+ arbitrary_types_allowed = True
344
+ extra = 'allow'
345
+ # Disable protected namespaces to avoid conflicts
346
+ protected_namespaces = ()
347
+
348
+
349
+
350
+ # Creating an instance of MatcherClass
351
+ InitMatch = MatcherClass(
352
+
353
+ # Fuzzy/general attributes
354
+ fuzzy_scorer_used = fuzzy_scorer_used,
355
+ fuzzy_match_limit = fuzzy_match_limit,
356
+ fuzzy_search_addr_limit = fuzzy_search_addr_limit,
357
+ filter_to_lambeth_pcodes = filter_to_lambeth_pcodes,
358
+ standardise = standardise,
359
+ suffix_used = suffix_used,
360
+
361
+ # Neural net attributes
362
+ matching_variables = matching_variables,
363
+ model_dir_name = model_dir_name,
364
+ file_step_suffix = file_step_suffix,
365
+
366
+ exported_model = [exported_model],
367
+
368
+ fuzzy_method = fuzzy_method,
369
+ score_cut_off = score_cut_off,
370
+ text_columns = text_columns,
371
+ weights = weights,
372
+ model_type = model_type,
373
+ labels_list = labels_list,
374
+
375
+
376
+ # These are variables that are added on later
377
+ # Pytorch optional variables
378
+ word_to_index = word_to_index,
379
+ cat_to_idx = cat_to_idx,
380
+ device = device,
381
+ vocab = vocab,
382
+
383
+ # Join data
384
+ file_name = '',
385
+ ref_name = '',
386
+ df_name = '',
387
+ search_df = pd.DataFrame(),
388
+ excluded_df = pd.DataFrame(),
389
+ pre_filter_search_df = pd.DataFrame(),
390
+ search_df_not_matched = pd.DataFrame(),
391
+ search_df_cleaned = pd.DataFrame(),
392
+ search_address_cols = [],
393
+ search_postcode_col = [],
394
+ search_df_key_field = 'index',
395
+
396
+ ref_df = pd.DataFrame(),
397
+ ref_df_cleaned = pd.DataFrame(),
398
+ ref_pre_filter = pd.DataFrame(),
399
+ ref_address_cols = [],
400
+ new_join_col = [],
401
+ #in_joincol_list = [],
402
+ existing_match_cols = [],
403
+ standard_llpg_format = [],
404
+
405
+
406
+ # Results attributes
407
+ match_results_output = pd.DataFrame(),
408
+ predict_df_nnet = pd.DataFrame(),
409
+
410
+ # Other attributes generated during training
411
+ compare_all_candidates = [],
412
+ diag_shortlist = [],
413
+ diag_best_match = [],
414
+
415
+ results_on_orig_df = pd.DataFrame(),
416
+ summary = "",
417
+ output_summary = "",
418
+
419
+ match_outputs_name = "",
420
+ results_orig_df_name = "",
421
+
422
+ # Post dataset preparation variables
423
+ search_df_after_stand = pd.DataFrame(),
424
+ ref_df_after_stand = pd.DataFrame(),
425
+ search_df_after_stand_series = pd.Series(),
426
+ ref_df_after_stand_series = pd.Series(),
427
+
428
+ search_df_after_full_stand = pd.DataFrame(),
429
+ ref_df_after_full_stand = pd.DataFrame(),
430
+ search_df_after_stand_series_full_stand = pd.Series(),
431
+ ref_df_after_stand_series_full_stand = pd.Series(),
432
+
433
+ # Abort flag if the matcher couldn't even get the results of the first match
434
+ abort_flag = False
435
+ )
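The Config block above is what allows pandas objects to be used as pydantic fields; a minimal standalone sketch of the same pattern:

import pandas as pd
from pydantic import BaseModel

class FrameHolder(BaseModel):
    name: str
    data: pd.DataFrame

    class Config:
        arbitrary_types_allowed = True  # accept non-pydantic types such as DataFrames

holder = FrameHolder(name="example", data=pd.DataFrame({"a": [1, 2]}))
print(holder.name, len(holder.data))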
tools/fuzzy_match.py ADDED
@@ -0,0 +1,437 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from typing import Dict, List, Tuple, Type
4
+ from datetime import datetime
5
+ from rapidfuzz import fuzz, process
6
+ import gradio as gr
7
+
8
+ PandasDataFrame = Type[pd.DataFrame]
9
+ PandasSeries = Type[pd.Series]
10
+ MatchedResults = Dict[str,Tuple[str,int]]
11
+ array = List[str]
12
+
13
+ today = datetime.now().strftime("%d%m%Y")
14
+ today_rev = datetime.now().strftime("%Y%m%d")
15
+
16
+ from tools.constants import no_number_fuzzy_match_limit, fuzzy_match_limit
17
+
18
+ def string_match_array(to_match:array, choices:array,
19
+ index_name:str, matched_name:str) -> PandasDataFrame:
20
+
21
+ temp = {name: process.extractOne(name,choices)
22
+ for name in to_match}
23
+
24
+ return _create_frame(matched_results=temp, index_name=index_name,
25
+ matched_name=matched_name)
26
+
27
+ # Fuzzy match algorithm
28
+ def create_fuzzy_matched_col(df:PandasDataFrame, orig_match_address_series:PandasSeries, pred_match_address_series:PandasSeries, fuzzy_method:str="WRatio", match_score=95):
29
+
30
+ results = []
31
+
32
+ for orig_index, orig_string in df[orig_match_address_series].items():
33
+
34
+ predict_string = df[pred_match_address_series][orig_index]
35
+
36
+ if (orig_string == '') and (predict_string == ''):
37
+ results.append(np.nan)
38
+
39
+ else:
40
+ fuzz_score = process.extract(orig_string, [predict_string], scorer= getattr(fuzz, fuzzy_method))
41
+ results.append(fuzz_score[0][1])
42
+
43
+ new_result_col_score = (orig_match_address_series + "_fuzz_score")
44
+ new_result_col_match = (orig_match_address_series + "_fuzz_match")
45
+
46
+ df[new_result_col_score] = results
47
+ df[new_result_col_match] = df[new_result_col_score] >= match_score
48
+ #df[new_result_col_match][df[new_result_col_score].isna()] = np.nan
49
+ df.loc[df[new_result_col_score].isna(), new_result_col_match] = np.nan
50
+
51
+ return df
52
+
53
+ def string_match_by_post_code_multiple(match_address_series:PandasSeries, reference_address_series:PandasSeries,
54
+ search_limit=100, scorer_name="token_set_ratio", progress=gr.Progress())-> MatchedResults:
55
+ '''
56
+ Matches addresses by Series values, where the Series index is the postcode and the values are addresses.
57
+ The search space is reduced by only comparing against addresses in reference_address_series that share the same postcode.
58
+
59
+ The scorer is looked up from rapidfuzz.fuzz by name (the signature default is token_set_ratio); WRatio tries to
60
+ weight the different algorithms to give the best overall score.
61
+ Choice of ratio type seems to make a big difference. Looking at this link:
62
+ https://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
63
+ and this one:
64
+ https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings
65
+
66
+ '''
67
+
68
+ def do_one_match(reference_addresses: pd.Series, scorer: callable, search_limit: int, postcode_match: str, search_addresses: pd.Series) -> MatchedResults:
69
+
70
+ def _prepare_results(search_addresses, reference_addresses, matched, postcode_match):
71
+
72
+ # Create a list to store the results
73
+ results = []
74
+
75
+ # Iterate through the matched dataframe and store results in the list
76
+ for i, search_address in enumerate(search_addresses):
77
+ for j, reference_address in enumerate(reference_addresses):
78
+ score = matched[i][j]
79
+ results.append((postcode_match, search_address, reference_address, score))
80
+
81
+ # Create a dataframe from the results list
82
+ matched_out = pd.DataFrame(results, columns=['postcode_search', 'fuzzy_match_search_address', 'fuzzy_match_reference_address', 'fuzzy_score'])
83
+
84
+ return matched_out
85
+
86
+ try:
87
+ if isinstance(reference_addresses, str): # reference_addresses can be a str-> 1 address per postcode
88
+ matched = process.cdist(search_addresses.values, [reference_addresses], scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)
89
+
90
+ # Transform results into a dataframe
91
+ matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)
92
+
93
+ else: # 1+ addresses
94
+ matched = process.cdist(search_addresses.values, reference_addresses.values, scorer=scorer, score_cutoff=fuzzy_match_limit, workers=-1)
95
+
96
+ # Transform results into a dataframe
97
+ matched_out = _prepare_results(search_addresses, reference_addresses, matched, postcode_match)
98
+
99
+ # Sort the matched results by score in descending order
100
+ matched_out = matched_out.sort_values(by='fuzzy_score', ascending=False)
101
+
102
+ # Keep only the top search_limit number of results - doesn't work anymore when working with multiple results
103
+ #matched_out = matched_out.head(search_limit)
104
+
105
+ except KeyError:
106
+ matched_out = pd.DataFrame()
107
+
108
+ return matched_out
109
+
110
+ def apply_fuzzy_matching(postcode_match:str, search_addresses:PandasSeries, reference_addresses:PandasSeries, scorer:callable, search_limit:int)-> tuple:
111
+
112
+ try:
113
+ matched = do_one_match(reference_addresses, scorer, search_limit, postcode_match, search_addresses)
114
+ return matched
115
+ except KeyError:
116
+ matched = pd.DataFrame() #[("NA", 0)] # for _ in range(1, search_limit + 1)]
117
+ return matched
118
+
119
+ print("Fuzzy match column length: ", len(match_address_series))
120
+ print("Fuzzy Reference column length: ", len(reference_address_series))
121
+
122
+ match_address_series = match_address_series.rename_axis('postcode_search')
123
+ match_address_df = pd.DataFrame(match_address_series.reset_index())
124
+ match_address_df['index'] = list(range(0,len(match_address_df)))
125
+
126
+ reference_address_series = reference_address_series.rename_axis('postcode_search')
127
+ reference_address_df = pd.DataFrame(reference_address_series.reset_index())
128
+ reference_address_df['index'] = list(range(0,len(reference_address_df)))
129
+
130
+
131
+ # Apply the match functions to each address
132
+ scorer = getattr(fuzz, scorer_name)
133
+ results = {}
134
+ #counter = 0
135
+
136
+ index_list = []
137
+ match_list = []
138
+ search_addresses_list = []
139
+ reference_addresses_list = []
140
+
141
+ unique_postcodes = pd.unique(match_address_df['postcode_search'])
142
+
143
+ for postcode_match in progress.tqdm(unique_postcodes, desc="Fuzzy matching", unit="fuzzy matched postcodes"):
144
+
145
+ postcode_match_list = [postcode_match]
146
+ search_indexes = pd.Series()
147
+ search_addresses = pd.Series()
148
+ reference_addresses = pd.Series()
149
+
150
+ try:
151
+ search_indexes = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "index"]
152
+ search_addresses = match_address_df.loc[match_address_df["postcode_search"].isin(postcode_match_list), "search_address_stand"]
153
+ reference_addresses = reference_address_df.loc[reference_address_df["postcode_search"].isin(postcode_match_list), "ref_address_stand"]
154
+
155
+ if isinstance(reference_addresses, str): # reference_addresses can be a str-> 1 address per postcode
156
+ reference_addresses = pd.Series(reference_addresses)
157
+ except KeyError:
158
+ reference_addresses = pd.Series("NA")
159
+
160
+ matched = apply_fuzzy_matching(postcode_match, search_addresses, reference_addresses, scorer, search_limit)
161
+
162
+ # Write to output lists
163
+ match_list.extend([matched])
164
+ index_list.extend(search_indexes.tolist())
165
+ search_addresses_list.extend(search_addresses.tolist())
166
+ reference_addresses_list.extend(reference_addresses.tolist())
167
+
168
+ out_frame = pd.concat(match_list)
169
+
170
+ return out_frame
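The core of the postcode-blocked loop above is rapidfuzz's process.cdist, which scores every search address against every reference address sharing that postcode. A small standalone sketch with made-up values:

from rapidfuzz import fuzz, process

search_addresses = ["1 high street", "flat 2 3 high street"]
reference_addresses = ["1 high st", "3 high street flat 2", "5 high street"]

# cdist returns a len(search) x len(reference) matrix of scores; entries below
# score_cutoff are returned as 0, and workers=-1 uses all available CPU cores.
scores = process.cdist(search_addresses, reference_addresses,
                       scorer=fuzz.token_set_ratio, score_cutoff=85, workers=-1)
print(scores)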
171
+
172
+ def _create_fuzzy_match_results_output(results, search_df_after_stand, ref_df_cleaned, ref_df_after_stand, fuzzy_match_limit, search_df_cleaned, search_df_key_field, new_join_col, standardise, blocker_col):
173
+
174
+ ## Diagnostics
175
+
176
+ diag_shortlist, diag_best_match = refine_export_results(results_df=results,\
177
+ matched_df = search_df_after_stand, ref_list_df = ref_df_after_stand,
178
+ fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
179
+
180
+ ## Fuzzy search results
181
+
182
+ match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
183
+ 'full_match',
184
+ 'full_number_match',
185
+ 'flat_number_match',
186
+ 'room_number_match',
187
+ 'block_number_match',
188
+ 'unit_number_match',
189
+ 'property_number_match',
190
+ 'close_postcode_match',
191
+ 'house_court_name_match',
192
+ 'fuzzy_score_match',
193
+ "fuzzy_score",
194
+ "wratio_score",
195
+ 'property_number_search', 'property_number_reference',
196
+ 'flat_number_search', 'flat_number_reference',
197
+ 'room_number_search', 'room_number_reference',
198
+ 'unit_number_search', 'unit_number_reference',
199
+ 'block_number_search', 'block_number_reference',
200
+ 'house_court_name_search', 'house_court_name_reference',
201
+ "search_mod_address", 'reference_mod_address','Postcode']
202
+
203
+ # Join results data onto the original housing list to create the full output
204
+ search_df_cleaned_join_cols = [search_df_key_field, "full_address","postcode"]
205
+
206
+ match_results_output = search_df_cleaned[search_df_cleaned_join_cols].merge(
207
+ diag_best_match[match_results_cols], how = "left", left_on = "full_address", right_on = "search_orig_address")
208
+
209
+ match_results_output = match_results_output.drop(["postcode", "search_orig_address"], axis = 1).rename(columns={"full_address":"search_orig_address"})
210
+
211
+ # Join UPRN back onto the data from reference data
212
+ joined_ref_cols = ["fulladdress", "Reference file"]
213
+ joined_ref_cols.extend(new_join_col)
214
+
215
+ match_results_output = pd.merge(match_results_output,ref_df_cleaned[joined_ref_cols].drop_duplicates("fulladdress"), how = "left", left_on = "reference_orig_address",right_on = "fulladdress").drop("fulladdress", axis = 1)
216
+
217
+ # Convert long keys to string to avoid data loss
218
+ match_results_output[search_df_key_field] = match_results_output[search_df_key_field].astype("str")
219
+ match_results_output[new_join_col] = match_results_output[new_join_col].astype("string")
220
+ match_results_output["standardised_address"] = standardise
221
+
222
+ match_results_output = match_results_output.sort_values(search_df_key_field, ascending = True)
223
+
224
+ return match_results_output, diag_shortlist, diag_best_match
225
+
226
+ def create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col, fuzzy_col="fuzzy_score", search_mod_address = "search_mod_address", resolve_tie_breaks=True, no_number_fuzzy_match_limit=no_number_fuzzy_match_limit):
227
+ '''
228
+ Create a shortlist of the best matches from a list of suggested matches
229
+ '''
230
+
231
+ ## Calculate highest fuzzy score from all candidates, keep all candidates with matching highest fuzzy score
232
+ results_max_fuzzy_score = results_df.groupby(matched_col)[fuzzy_col].max().reset_index().rename(columns={fuzzy_col: "max_fuzzy_score"}).drop_duplicates(subset=matched_col)
233
+
234
+ results_df = pd.merge(results_df, results_max_fuzzy_score, how = "left", on = matched_col)
235
+
236
+ diag_shortlist = results_df[(results_df[fuzzy_col] == results_df["max_fuzzy_score"])]
237
+
238
+ # Records with no numbers in the address must meet the stricter no_number_fuzzy_match_limit (set above) rather than the standard fuzzy_match_limit
239
+ #diag_shortlist["fuzzy_score_match"] = diag_shortlist[fuzzy_col] >= fuzzy_match_limit
240
+ diag_shortlist.loc[diag_shortlist[fuzzy_col] >= fuzzy_match_limit, "fuzzy_score_match"] = True
241
+
242
+ ### Count number of numbers in search string
243
+ # Using .loc
244
+ diag_shortlist.loc[:, "number_count_search_string"] = diag_shortlist.loc[:, search_mod_address].str.count(r'\d')
245
+ diag_shortlist.loc[:, "no_numbers_in_search_string"] = (diag_shortlist.loc[:, "number_count_search_string"] == 0)
246
+
247
+
248
+ # Replace fuzzy_score_match values for addresses with no numbers in them
249
+ diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] >= no_number_fuzzy_match_limit), "fuzzy_score_match"] = True
250
+ diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True) & (diag_shortlist[fuzzy_col] < no_number_fuzzy_match_limit), "fuzzy_score_match"] = False
251
+
252
+ # If blocking on street, don't match addresses with 0 numbers in. There are too many options and the matches are rarely good
253
+ if blocker_col == "Street":
254
+ diag_shortlist.loc[(diag_shortlist["no_numbers_in_search_string"]==True), "fuzzy_score_match"] = False
255
+
256
+ diag_shortlist = diag_shortlist.fillna("").infer_objects(copy=False).drop(["number_count_search_string", "no_numbers_in_search_string"], axis = 1)
257
+
258
+ # Following considers full matches to be those that match on property number and flat number, and the postcode is relatively close.
259
+ #print(diag_shortlist.columns)
260
+ diag_shortlist["property_number_match"] = (diag_shortlist["property_number_search"] == diag_shortlist["property_number_reference"])
261
+ diag_shortlist["flat_number_match"] = (diag_shortlist['flat_number_search'] == diag_shortlist['flat_number_reference'])
262
+ diag_shortlist["room_number_match"] = (diag_shortlist['room_number_search'] == diag_shortlist['room_number_reference'])
263
+ diag_shortlist["block_number_match"] = (diag_shortlist['block_number_search'] == diag_shortlist['block_number_reference'])
264
+ diag_shortlist["unit_number_match"] = (diag_shortlist['unit_number_search'] == diag_shortlist['unit_number_reference'])
265
+ diag_shortlist["house_court_name_match"] = (diag_shortlist['house_court_name_search'] == diag_shortlist['house_court_name_reference'])
266
+
267
+ # A 'full number match' requires the property, flat, room, block and unit numbers and the house/court name to all match
268
+
269
+ diag_shortlist['full_number_match'] = (diag_shortlist["property_number_match"] == True) &\
270
+ (diag_shortlist["flat_number_match"] == True) &\
271
+ (diag_shortlist["room_number_match"] == True) &\
272
+ (diag_shortlist["block_number_match"] == True) &\
273
+ (diag_shortlist["unit_number_match"] == True) &\
274
+ (diag_shortlist["house_court_name_match"] == True)
275
+
276
+
277
+ ### Postcodes need to be close together, so all the characters should match apart from the last two
278
+ diag_shortlist['close_postcode_match'] = diag_shortlist['postcode'].str.lower().str.replace(" ","").str[:-2] == diag_shortlist['Postcode'].str.lower().str.replace(" ","").str[:-2]
279
+
280
+
281
+ diag_shortlist["full_match"] = (diag_shortlist["fuzzy_score_match"] == True) &\
282
+ (diag_shortlist['full_number_match'] == True) &\
283
+ (diag_shortlist['close_postcode_match'] == True)
284
+
285
+ diag_shortlist = diag_shortlist.rename(columns = {"reference_list_address":"reference_mod_address"})
286
+
287
+ ### Dealing with tie breaks ##
288
+ # Do a backup simple ratio search (fuzz.ratio) on the open text to act as a tie breaker when the fuzzy scores are identical
289
+ # (the resulting column is named 'wratio_score')
290
+ if resolve_tie_breaks == True:
291
+ def compare_strings_wratio(row, scorer = fuzz.ratio, fuzzy_col = fuzzy_col):
292
+ search_score = process.cdist([row[search_mod_address]], [row["reference_mod_address"]], scorer=scorer)
293
+ return search_score[0][0]
294
+
295
+ diag_shortlist_dups = diag_shortlist[diag_shortlist['full_number_match'] == True]
296
+ diag_shortlist_dups = diag_shortlist_dups.loc[diag_shortlist_dups.duplicated(subset= [search_mod_address, 'full_number_match', "room_number_search", fuzzy_col], keep=False)]
297
+
298
+ if not diag_shortlist_dups.empty:
299
+ diag_shortlist_dups["wratio_score"] = diag_shortlist_dups.apply(compare_strings_wratio, axis=1)
300
+
301
+ diag_shortlist = diag_shortlist.merge(diag_shortlist_dups[["wratio_score"]], left_index=True, right_index=True, how = "left")
302
+
303
+ if 'wratio_score' not in diag_shortlist.columns:
304
+ diag_shortlist['wratio_score'] = ''
305
+
306
+ # Order by best score
307
+ diag_shortlist = diag_shortlist.sort_values([
308
+ search_mod_address, 'full_match', 'full_number_match', fuzzy_col, "wratio_score"],
309
+ ascending = [True, False, False, False, False])
310
+
311
+ return diag_shortlist
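The close-postcode rule used above treats two postcodes as close when everything except the final two characters matches. A minimal sketch of the same comparison on plain strings, mirroring the normalisation in the pandas code:

def close_postcode_match(search_pcode: str, ref_pcode: str) -> bool:
    """True when the postcodes agree once lowercased, de-spaced and the last two characters are dropped."""
    clean = lambda p: p.lower().replace(" ", "")[:-2]
    return clean(search_pcode) == clean(ref_pcode)

print(close_postcode_match("SE11 5QY", "SE11 5QZ"))  # True - only the final characters differ
print(close_postcode_match("SE11 5QY", "SE1 5QY"))   # False - different outward code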
312
+
313
+ def refine_export_results(results_df:PandasDataFrame,
314
+ matched_df:PandasDataFrame,
315
+ ref_list_df:PandasDataFrame,
316
+ matched_col="fuzzy_match_search_address",
317
+ ref_list_col="fuzzy_match_reference_address",
318
+ final_matched_address_col="search_address_stand",
319
+ final_ref_address_col="ref_address_stand",
320
+ orig_matched_address_col = "full_address",
321
+ orig_ref_address_col = "fulladdress",
322
+ fuzzy_match_limit=fuzzy_match_limit,
323
+ blocker_col="Postcode") -> PandasDataFrame:
324
+ '''
325
+ This function takes a result file from the fuzzy search, then refines the 'matched results' according to
326
+ the score limit specified by the user and exports results list, matched and unmatched files.
327
+ '''
328
+
329
+ # Rename score column
330
+ results_df = results_df.rename(columns = {"score":"fuzzy_score"})
331
+
332
+ # Remove empty addresses
333
+ results_df = results_df[results_df[matched_col] !=0 ]
334
+
335
+ ### Join property number and flat/room number etc. onto results_df
336
+ ref_list_df["ref_index"] = ref_list_df.index
337
+ ref_join_cols = ["ref_index", final_ref_address_col, "property_number","flat_number","room_number","block_number", "unit_number", 'house_court_name', orig_ref_address_col,"Postcode"]
338
+ ref_list_df = ref_list_df[ref_join_cols].rename(columns={orig_ref_address_col: "reference_orig_address", final_ref_address_col:'reference_list_address'})
339
+
340
+ results_df = results_df.merge(ref_list_df, how = "left", left_on = ref_list_col, right_on = "reference_list_address")
341
+
342
+
343
+ ### Join on relevant details from the standardised match dataframe
344
+ matched_df_cols = [final_matched_address_col,"property_number","flat_number","room_number", "block_number", "unit_number", 'house_court_name', orig_matched_address_col, "postcode"]
345
+ matched_df = matched_df[matched_df_cols].rename(columns={orig_matched_address_col:"search_orig_address",final_matched_address_col:'search_mod_address'})
346
+
347
+ results_df = results_df.merge(matched_df, how = "left", left_on = matched_col, right_on = "search_mod_address", suffixes=("_reference", "_search"))
348
+
349
+ # Choose your best matches from the list of options
350
+ diag_shortlist = create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col)
351
+
352
+ ### Create matched results output ###
353
+ # Columns for the output match_results file in order
354
+ match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
355
+ 'full_match',
356
+ 'full_number_match',
357
+ 'flat_number_match',
358
+ 'room_number_match',
359
+ 'block_number_match',
360
+ 'unit_number_match',
361
+ 'house_court_name_match',
362
+ 'property_number_match',
363
+ 'close_postcode_match',
364
+ 'fuzzy_score_match',
365
+ "fuzzy_score",
366
+ "wratio_score",
367
+ 'property_number_search', 'property_number_reference',
368
+ 'flat_number_search', 'flat_number_reference',
369
+ 'room_number_search', 'room_number_reference',
370
+ 'block_number_search', 'block_number_reference',
371
+ 'unit_number_search', 'unit_number_reference',
372
+ 'house_court_name_search', 'house_court_name_reference',
373
+ "search_mod_address", 'reference_mod_address', 'postcode','Postcode']
374
+
375
+ diag_shortlist = diag_shortlist[match_results_cols]
376
+
377
+ # Choose best match from the shortlist that has been ordered according to score descending
378
+ diag_best_match = diag_shortlist[match_results_cols].drop_duplicates("search_mod_address")
379
+
380
+ return diag_shortlist, diag_best_match
381
+
382
+ def join_to_orig_df(match_results_output:PandasDataFrame, search_df:PandasDataFrame, search_df_key_field:str, new_join_col:List[str]) -> PandasDataFrame:
383
+ '''
384
+ Following the fuzzy match, join the match results back to the original search dataframe to create a results dataframe.
385
+ '''
386
+ match_results_output_success = match_results_output[match_results_output["full_match"]==True]
387
+
388
+ # If you're joining to the original df on index you will need to recreate the index again
389
+
390
+ match_results_output_success = match_results_output_success.rename(columns={
391
+ "reference_orig_address":"Reference matched address",
392
+ "full_match":"Matched with reference address",
393
+ 'uprn':'UPRN'
394
+ })
395
+
396
+ ref_df_after_stand_cols = ["ref_index", "Reference matched address","Matched with reference address", "Reference file", search_df_key_field]
397
+ ref_df_after_stand_cols.extend(new_join_col)
398
+
399
+
400
+ if (search_df_key_field == "index"):
401
+ # Check index is int
402
+ print("Search df key field is index")
403
+ #match_results_output_success[search_df_key_field] = match_results_output_success[search_df_key_field].astype(float).astype(int)
404
+ results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols], on = search_df_key_field, how = "left", suffixes = ('', '_y'))
405
+ else:
406
+ results_for_orig_df_join = search_df.merge(match_results_output_success[ref_df_after_stand_cols],how = "left", on = search_df_key_field, suffixes = ('', '_y'))
407
+
408
+ # If the join columns already exist in the search_df, then use the new column to fill in the NAs in the original column, then delete the new column
409
+
410
+ if "Reference matched address_y" in results_for_orig_df_join.columns:
411
+ results_for_orig_df_join['Reference matched address'] = results_for_orig_df_join['Reference matched address'].fillna(results_for_orig_df_join['Reference matched address_y']).infer_objects(copy=False)
412
+
413
+ if "Matched with reference address_y" in results_for_orig_df_join.columns:
414
+ results_for_orig_df_join['Matched with reference address'] = pd.Series(np.where(results_for_orig_df_join['Matched with reference address_y'].notna(), results_for_orig_df_join['Matched with reference address_y'], results_for_orig_df_join['Matched with reference address']))
415
+
416
+ #results_for_orig_df_join['Matched with reference address'] = results_for_orig_df_join['Matched with reference address'].fillna(results_for_orig_df_join['Matched with reference address_y']).infer_objects(copy=False)
417
+
418
+ if "Reference file_y" in results_for_orig_df_join.columns:
419
+ results_for_orig_df_join['Reference file'] = results_for_orig_df_join['Reference file'].fillna(results_for_orig_df_join['Reference file_y']).infer_objects(copy=False)
420
+
421
+ if "UPRN_y" in results_for_orig_df_join.columns:
422
+ results_for_orig_df_join['UPRN'] = results_for_orig_df_join['UPRN'].fillna(results_for_orig_df_join['UPRN_y']).infer_objects(copy=False)
423
+
424
+ # Drop columns that aren't useful
425
+ results_for_orig_df_join = results_for_orig_df_join.drop(['Reference matched address_y', 'Matched with reference address_y', 'Reference file_y', 'search_df_key_field_y', 'UPRN_y', 'index_y', "full_address_search","postcode_search", "full_address_1", "full_address_2", "full_address",
426
+ "address_stand", "property_number", "prop_number", "flat_number", "apart_number", "first_sec_number", "room_number"], axis = 1, errors = "ignore")
427
+
428
+ # Replace blanks with NA, fix UPRNs
429
+ results_for_orig_df_join = results_for_orig_df_join.replace(r'^\s*$', np.nan, regex=True)
430
+
431
+ results_for_orig_df_join[new_join_col] = results_for_orig_df_join[new_join_col].astype(str).replace(".0","", regex=False).replace("nan","", regex=False)
432
+
433
+ # Replace cells with only 'nan' with blank
434
+ results_for_orig_df_join = results_for_orig_df_join.replace(r'^nan$', "", regex=True)
435
+
436
+
437
+ return results_for_orig_df_join
tools/gradio.py ADDED
@@ -0,0 +1,63 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ def detect_file_type(filename):
5
+ """Detect the file type based on its extension."""
6
+ if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
7
+ return 'csv'
8
+ elif filename.endswith('.xlsx'):
9
+ return 'xlsx'
10
+ elif filename.endswith('.parquet'):
11
+ return 'parquet'
12
+ else:
13
+ raise ValueError("Unsupported file type.")
14
+
15
+ def read_file(filename):
16
+ """Read the file based on its detected type."""
17
+ file_type = detect_file_type(filename)
18
+
19
+ if file_type == 'csv':
20
+ return pd.read_csv(filename, low_memory=False)
21
+ elif file_type == 'xlsx':
22
+ return pd.read_excel(filename)
23
+ elif file_type == 'parquet':
24
+ return pd.read_parquet(filename)
25
+
26
+
27
+ def initial_data_load(in_file):
28
+ new_choices = []
29
+ concat_choices = []
30
+ output_message = ""
31
+ results_df = pd.DataFrame()
32
+ df = pd.DataFrame()
33
+
34
+ file_list = [string.name for string in in_file]
35
+
36
+ data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
37
+ if data_file_names:
38
+ df = read_file(data_file_names[0])
39
+ else:
40
+ error_message = "No data file found."
41
+ return error_message, gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, results_df
42
+
43
+ results_file_names = [string for string in file_list if "results_on_orig" in string.lower()]
44
+ if results_file_names:
45
+ results_df = read_file(results_file_names[0])
46
+
47
+ new_choices = list(df.columns)
48
+ concat_choices.extend(new_choices)
49
+
50
+ output_message = "Data successfully loaded"
51
+
52
+ return output_message, gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, results_df
53
+
54
+
55
+ def dummy_function(in_colnames):
56
+ """
57
+ A dummy function that exists just so that dropdown updates work correctly.
58
+ """
59
+ return None
60
+
61
+
62
+ def clear_inputs(in_file, in_ref, in_text):
63
+ return gr.File.update(value=[]), gr.File.update(value=[]), gr.Textbox.update(value='')
tools/matcher_funcs.py ADDED
@@ -0,0 +1,1300 @@
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+ from typing import Dict, List, Tuple, Type
6
+ import time
7
+ import re
8
+ import math
9
+ from datetime import datetime
10
+ import copy
11
+ import gradio as gr
12
+
13
+ PandasDataFrame = Type[pd.DataFrame]
14
+ PandasSeries = Type[pd.Series]
15
+ MatchedResults = Dict[str,Tuple[str,int]]
16
+ array = List[str]
17
+
18
+ today = datetime.now().strftime("%d%m%Y")
19
+ today_rev = datetime.now().strftime("%Y%m%d")
20
+ today_month_rev = datetime.now().strftime("%Y%m")
21
+
22
+ # Constants
23
+ run_fuzzy_match = True
24
+ run_nnet_match = True
25
+ run_standardise = True
26
+
27
+ from tools.preparation import prepare_search_address_string, prepare_search_address, prepare_ref_address, check_no_number_addresses, extract_street_name, remove_non_postal
28
+ from tools.standardise import standardise_wrapper_func
29
+ from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, join_to_orig_df
30
+
31
+ # Neural network functions
32
+ ### Predict function for imported model
33
+ from tools.model_predict import full_predict_func, full_predict_torch, post_predict_clean
34
+ from tools.recordlinkage_funcs import score_based_match, check_matches_against_fuzzy
35
+ from tools.gradio import initial_data_load
36
+
37
+ # API functions
38
+ from tools.addressbase_api_funcs import places_api_query
39
+
40
+ # Maximum number of neural net predictions in a single batch
41
+ from tools.constants import max_predict_len, MatcherClass
42
+
43
+
44
+ # Load in data functions
45
+
46
+ def detect_file_type(filename):
47
+ """Detect the file type based on its extension."""
48
+ if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
49
+ return 'csv'
50
+ elif filename.endswith('.xlsx'):
51
+ return 'xlsx'
52
+ elif filename.endswith('.parquet'):
53
+ return 'parquet'
54
+ else:
55
+ raise ValueError("Unsupported file type.")
56
+
57
+ def read_file(filename):
58
+ """Read the file based on its detected type."""
59
+ file_type = detect_file_type(filename)
60
+
61
+ if file_type == 'csv':
62
+ return pd.read_csv(filename, low_memory=False)
63
+ elif file_type == 'xlsx':
64
+ return pd.read_excel(filename)
65
+ elif file_type == 'parquet':
66
+ return pd.read_parquet(filename)
67
+
68
+ def get_file_name(in_name):
69
+ # Corrected regex pattern
70
+ match = re.search(r'\\(?!.*\\)(.*)', in_name)
71
+ if match:
72
+ matched_result = match.group(1)
73
+ else:
74
+ matched_result = None
75
+
76
+ return matched_result
77
+
78
+ def filter_not_matched(
79
+ matched_results: pd.DataFrame,
80
+ search_df: pd.DataFrame,
81
+ key_col: str
82
+ ) -> pd.DataFrame:
83
+ """Filters search_df to only rows with key_col not in matched_results"""
84
+
85
+ # Validate inputs
86
+ if not isinstance(matched_results, pd.DataFrame):
87
+ raise TypeError("matched_results must be a Pandas DataFrame")
88
+
89
+ if not isinstance(search_df, pd.DataFrame):
90
+ raise TypeError("search_df must be a Pandas DataFrame")
91
+
92
+ if not isinstance(key_col, str):
93
+ raise TypeError("key_col must be a string")
94
+
95
+ if key_col not in matched_results.columns:
96
+ raise ValueError(f"{key_col} not a column in matched_results")
97
+
98
+ matched_results_success = matched_results[matched_results["full_match"]==True]
99
+
100
+ # Filter search_df
101
+ #print(search_df.columns)
102
+ #print(key_col)
103
+
104
+ matched = search_df[key_col].astype(str).isin(matched_results_success[key_col].astype(str))#.drop(['level_0','index'], axis = 1, errors="ignore").reset_index() #
105
+
106
+ return search_df.iloc[np.where(~matched)[0]] # search_df[~matched]
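A small usage sketch of filter_not_matched with illustrative data, showing that only records without a full match are passed on for the next matching round:

import pandas as pd

search_df = pd.DataFrame({"index": ["a", "b", "c"], "address": ["1 x road", "2 y road", "3 z road"]})
matched_results = pd.DataFrame({"index": ["a", "b"], "full_match": [True, False]})

# Only 'a' matched fully, so rows 'b' and 'c' are returned for another pass.
remaining = filter_not_matched(matched_results, search_df, key_col="index")
print(remaining)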
107
+
108
+ def run_all_api_calls(in_api_key:str, Matcher:MatcherClass, query_type:str, progress=gr.Progress()):
109
+ if in_api_key == "":
110
+ print ("No API key provided, please provide one to continue")
111
+ return Matcher
112
+ else:
113
+ # Call the API
114
+ #Matcher.ref_df = pd.DataFrame()
115
+
116
+ # Check if the ref_df file already exists
117
+ def check_and_create_api_folder():
118
+ # Check if the environmental variable is available
119
+ file_path = os.environ.get('ADDRESSBASE_API_OUT') # Folder for saving API output, taken from this environment variable if set
120
+
121
+ if file_path is None:
122
+ # Environmental variable is not set
123
+ print("API output environmental variable not set.")
124
+ # Create the 'api/' folder if it doesn't already exist
125
+ api_folder_path = 'api/'
126
+ if not os.path.exists(api_folder_path):
127
+ os.makedirs(api_folder_path)
128
+ print(f"'{api_folder_path}' folder created.")
129
+ else:
130
+ # Environmental variable is set
131
+ api_folder_path = file_path
132
+ print(f"Environmental variable found: {api_folder_path}")
133
+
134
+ return api_folder_path
135
+
136
+ api_output_folder = check_and_create_api_folder()
137
+
138
+ # Check if the file exists
139
+ print("Matcher file name: ", Matcher.file_name)
140
+ search_file_name_without_extension = re.sub(r'\.[^.]+$', '', Matcher.file_name)
141
+ #print("Search file name without extension: ", search_file_name_without_extension)
142
+ api_ref_save_loc = api_output_folder + search_file_name_without_extension + "_api_" + today_month_rev + "_" + query_type + "_ckpt"
143
+ print("API reference save location: ", api_ref_save_loc)
144
+
145
+ # Allow for csv, parquet and gzipped csv files
146
+ if os.path.isfile(api_ref_save_loc + ".csv"):
147
+ print("API reference CSV file found")
148
+ Matcher.ref_df = pd.read_csv(api_ref_save_loc + ".csv")
149
+ elif os.path.isfile(api_ref_save_loc + ".parquet"):
150
+ print("API reference Parquet file found")
151
+ Matcher.ref_df = pd.read_parquet(api_ref_save_loc + ".parquet")
152
+ elif os.path.isfile(api_ref_save_loc + ".csv.gz"):
153
+ print("API reference gzipped CSV file found")
154
+ Matcher.ref_df = pd.read_csv(api_ref_save_loc + ".csv.gz", compression='gzip')
155
+ else:
156
+ print("API reference file not found, querying API for reference data.")
157
+
158
+
159
+ def conduct_api_loop(in_query, in_api_key, query_type, i, api_ref_save_loc, loop_list, api_search_index):
160
+ ref_addresses = places_api_query(in_query, in_api_key, query_type)
161
+
162
+ ref_addresses['Address_row_number'] = api_search_index[i]
163
+
164
+ loop_list.append(ref_addresses)
165
+
166
+ if (i + 1) % 500 == 0:
167
+ print("Saving api call checkpoint for query:", str(i + 1))
168
+
169
+ pd.concat(loop_list).to_parquet(api_ref_save_loc + ".parquet", index=False)
170
+
171
+ return loop_list
172
+
173
+ def check_postcode(postcode):
174
+ # Remove spaces on the ends or in the middle of the postcode, and any symbols
175
+ cleaned_postcode = re.sub(r'[^\w\s]|[\s]', '', postcode)
176
+ # Ensure that the postcode meets the specified format
177
+ postcode_pattern = r'\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?[0-9][A-Z]{2}|GIR0AA|GIR0A{2}|[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?[0-9]{1}?)\b'
178
+ match = re.match(postcode_pattern, cleaned_postcode)
179
+ if match and len(cleaned_postcode) in (6, 7):
180
+ return cleaned_postcode # Return the matched postcode string
181
+ else:
182
+ return None # Return None if no match is found
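An illustration of check_postcode's behaviour (a sketch, assuming the nested helper above were exposed for direct use):

# The helper strips spaces and symbols, then validates length and pattern.
print(check_postcode("SW1A 1AA"))        # returns "SW1A1AA" - cleaned, valid postcode
print(check_postcode("not a postcode"))  # returns None - fails the length/pattern check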
183
+
184
+ if query_type == "Address":
185
+ save_file = True
186
+ # Do an API call for each unique address
187
+
188
+ if not Matcher.ref_df.empty:
189
+ api_search_df = Matcher.search_df.copy().drop(list(set(Matcher.ref_df["Address_row_number"])))
190
+
191
+ else:
192
+ print("Matcher ref_df data empty")
193
+ api_search_df = Matcher.search_df.copy()
194
+
195
+ i = 0
196
+ loop_df = Matcher.ref_df
197
+ loop_list = [Matcher.ref_df]
198
+
199
+ for address in progress.tqdm(api_search_df['full_address_postcode'], desc= "Making API calls", unit="addresses", total=len(api_search_df['full_address_postcode'])):
200
+ print("Query number: " + str(i+1), "with address: ", address)
201
+
202
+ api_search_index = api_search_df.index
203
+
204
+ loop_list = conduct_api_loop(address, in_api_key, query_type, i, api_ref_save_loc, loop_list, api_search_index)
205
+
206
+ i += 1
207
+
208
+ loop_df = pd.concat(loop_list)
209
+ Matcher.ref_df = loop_df.drop_duplicates(keep='first', ignore_index=True)
210
+
211
+
212
+ elif query_type == "Postcode":
213
+ save_file = True
214
+ # Do an API call for each unique postcode. Each API call can only return 100 results maximum :/
215
+
216
+ if not Matcher.ref_df.empty:
217
+ print("Excluding postcodes that already exist in API call data.")
218
+
219
+ # Retain original index values after filtering
220
+ Matcher.search_df["index_keep"] = Matcher.search_df.index
221
+
222
+ if 'invalid_request' in Matcher.ref_df.columns and 'Address_row_number' in Matcher.ref_df.columns:
223
+ print("Joining on invalid_request column")
224
+ Matcher.search_df = Matcher.search_df.merge(Matcher.ref_df[['Address_row_number', 'invalid_request']].drop_duplicates(subset="Address_row_number"), left_on = Matcher.search_df_key_field, right_on='Address_row_number', how='left')
225
+
226
+ elif not 'invalid_request' in Matcher.search_df.columns:
227
+ Matcher.search_df['invalid_request'] = False
228
+
229
+ postcode_col = Matcher.search_postcode_col[0]
230
+
231
+ # Check ref_df df against cleaned and non-cleaned postcodes
232
+ Matcher.search_df[postcode_col] = Matcher.search_df[postcode_col].astype(str)
233
+ search_df_cleaned_pcodes = Matcher.search_df[postcode_col].apply(check_postcode)
234
+ ref_df_cleaned_pcodes = Matcher.ref_df['POSTCODE_LOCATOR'].dropna().apply(check_postcode)
235
+
236
+ api_search_df = Matcher.search_df.copy().loc[
237
+ ~Matcher.search_df[postcode_col].isin(Matcher.ref_df['POSTCODE_LOCATOR']) &
238
+ ~(Matcher.search_df['invalid_request']==True) &
239
+ ~(search_df_cleaned_pcodes.isin(ref_df_cleaned_pcodes)), :]
240
+
241
+ #api_search_index = api_search_df["index_keep"]
242
+ #api_search_df.index = api_search_index
243
+
244
+ print("Remaining invalid request count: ", Matcher.search_df['invalid_request'].value_counts())
245
+
246
+ else:
247
+ print("Matcher ref_df data empty")
248
+ api_search_df = Matcher.search_df.copy()
249
+ api_search_index = api_search_df.index
250
+ api_search_df['index_keep'] = api_search_index
251
+
252
+ postcode_col = Matcher.search_postcode_col[0]
253
+
254
+ unique_pcodes = api_search_df.loc[:, ["index_keep", postcode_col]].drop_duplicates(subset=[postcode_col], keep='first')
255
+ print("Unique postcodes: ", unique_pcodes[postcode_col])
256
+
257
+ # Apply the function to each postcode in the Series
258
+ unique_pcodes["cleaned_unique_postcodes"] = unique_pcodes[postcode_col].apply(check_postcode)
259
+
260
+ # Filter out the postcodes that comply with the specified format
261
+ valid_unique_postcodes = unique_pcodes.dropna(subset=["cleaned_unique_postcodes"])
262
+
263
+ valid_postcode_search_index = valid_unique_postcodes['index_keep']
264
+ valid_postcode_search_index_list = valid_postcode_search_index.tolist()
265
+
266
+ if not valid_unique_postcodes.empty:
267
+
268
+ print("Unique valid postcodes: ", valid_unique_postcodes)
269
+ print("Number of unique valid postcodes: ", len(valid_unique_postcodes))
270
+
271
+ tic = time.perf_counter()
272
+
273
+ i = 0
274
+ loop_df = Matcher.ref_df
275
+ loop_list = [Matcher.ref_df]
276
+
277
+ for pcode in progress.tqdm(valid_unique_postcodes["cleaned_unique_postcodes"], desc= "Making API calls", unit="unique postcodes", total=len(valid_unique_postcodes["cleaned_unique_postcodes"])):
278
+ #api_search_index = api_search_df.index
279
+
280
+ print("Query number: " + str(i+1), " with postcode: ", pcode, " and index: ", valid_postcode_search_index_list[i])
281
+
282
+ loop_list = conduct_api_loop(pcode, in_api_key, query_type, i, api_ref_save_loc, loop_list, valid_postcode_search_index_list)
283
+
284
+ i += 1
285
+
286
+ loop_df = pd.concat(loop_list)
287
+ Matcher.ref_df = loop_df.drop_duplicates(keep='first', ignore_index=True)
288
+
289
+ toc = time.perf_counter()
290
+ print("API call time in seconds: ", toc-tic)
291
+ else:
292
+ print("No valid postcodes found.")
293
+
294
+ elif query_type == "UPRN":
295
+ save_file = True
296
+ # Do an API call for each unique address
297
+
298
+ if not Matcher.ref_df.empty:
299
+ api_search_df = Matcher.search_df.copy().drop(list(set(Matcher.ref_df["Address_row_number"])))
300
+
301
+ else:
302
+ print("Matcher ref_df data empty")
303
+ api_search_df = Matcher.search_df.copy()
304
+
305
+ i = 0
306
+ loop_df = Matcher.ref_df
307
+ loop_list = [Matcher.ref_df]
308
+ uprn_check_col = 'ADR_UPRN'
309
+
310
+ for uprn in progress.tqdm(api_search_df[uprn_check_col], desc= "Making API calls", unit="UPRNs", total=len(api_search_df[uprn_check_col])):
311
+ print("Query number: " + str(i+1), "with address: ", uprn)
312
+
313
+ api_search_index = api_search_df.index
314
+
315
+ loop_list = conduct_api_loop(uprn, in_api_key, query_type, i, api_ref_save_loc, loop_list, api_search_index)
316
+
317
+ i += 1
318
+
319
+ loop_df = pd.concat(loop_list)
320
+ Matcher.ref_df = loop_df.drop_duplicates(keep='first', ignore_index=True)
321
+
322
+ else:
323
+ print("Reference file loaded from file, no API calls made.")
324
+ save_file = False
325
+
326
+ # Post API call processing
327
+
328
+ Matcher.ref_name = "API"
329
+ #Matcher.ref_df = Matcher.ref_df.reset_index(drop=True)
330
+ Matcher.ref_df['Reference file'] = Matcher.ref_name
331
+
332
+ if query_type == "Postcode":
333
+ #print(Matcher.ref_df.columns)
334
+
335
+ cols_of_interest = ["ADDRESS", "ORGANISATION", "SAO_TEXT", "SAO_START_NUMBER", "SAO_START_SUFFIX", "SAO_END_NUMBER", "SAO_END_SUFFIX", "PAO_TEXT", "PAO_START_NUMBER", "PAO_START_SUFFIX", "PAO_END_NUMBER", "PAO_END_SUFFIX", "STREET_DESCRIPTION", "TOWN_NAME" ,"ADMINISTRATIVE_AREA", "LOCALITY_NAME", "POSTCODE_LOCATOR", "UPRN", "PARENT_UPRN", "USRN", "LPI_KEY", "RPC", "LOGICAL_STATUS_CODE", "CLASSIFICATION_CODE", "LOCAL_CUSTODIAN_CODE", "COUNTRY_CODE", "POSTAL_ADDRESS_CODE", "BLPU_STATE_CODE", "LAST_UPDATE_DATE", "ENTRY_DATE", "STREET_STATE_CODE", "STREET_CLASSIFICATION_CODE", "LPI_LOGICAL_STATUS_CODE", "invalid_request", "Address_row_number", "Reference file"]
336
+
337
+ try:
338
+ # Attempt to select only the columns of interest
339
+ Matcher.ref_df = Matcher.ref_df[cols_of_interest]
340
+ except KeyError as e:
341
+ missing_columns = [col for col in cols_of_interest if col not in Matcher.ref_df.columns]
342
+ # Handle the missing columns gracefully
343
+ print(f"Some columns are missing: {missing_columns}")
344
+
345
+ #if "LOCAL_CUSTODIAN_CODE" in Matcher.ref_df.columns:
346
+ # These are items that are 'owned' by Ordnance Survey like telephone boxes, bus shelters
347
+ # Matcher.ref_df = Matcher.ref_df.loc[Matcher.ref_df["LOCAL_CUSTODIAN_CODE"] != 7655,:]
348
+
349
+ if save_file:
350
+ print("Saving reference file to: " + api_ref_save_loc[:-5] + ".parquet")
351
+ Matcher.ref_df.to_parquet(api_ref_save_loc + ".parquet", index=False) # Save checkpoint as well
352
+ Matcher.ref_df.to_parquet(api_ref_save_loc[:-5] + ".parquet", index=False)
353
+
354
+ if Matcher.ref_df.empty:
355
+ print ("No reference data found with API")
356
+ return Matcher
357
+
358
+ return Matcher
359
+
360
+ def check_ref_data_exists(Matcher:MatcherClass, ref_data_state:PandasDataFrame, in_ref:List[str], in_refcol:List[str], in_api:List[str], in_api_key:str, query_type:str, progress=gr.Progress()):
361
+ '''
362
+ Check for reference address data, do some preprocessing, and load in from the Addressbase API if required.
363
+ '''
364
+
365
+ # Check if reference data loaded, bring in if already there
366
+ if not ref_data_state.empty:
367
+ Matcher.ref_df = ref_data_state
368
+ Matcher.ref_name = get_file_name(in_ref[0].name)
369
+ Matcher.ref_df["Reference file"] = Matcher.ref_name
370
+
371
+ # Otherwise check for file name and load in. If nothing found, fail
372
+ else:
373
+ Matcher.ref_df = pd.DataFrame()
374
+
375
+ if not in_ref:
376
+ if in_api==False:
377
+ print ("No reference file provided, please provide one to continue")
378
+ return Matcher
379
+ # Check if api call required and api key is provided
380
+ else:
381
+ Matcher = run_all_api_calls(in_api_key, Matcher, query_type)
382
+
383
+ else:
384
+ Matcher.ref_name = get_file_name(in_ref[0].name)
385
+
386
+ # Concatenate all in reference files together
387
+ for ref_file in in_ref:
388
+ #print(ref_file.name)
389
+ temp_ref_file = read_file(ref_file.name)
390
+
391
+ file_name_out = get_file_name(ref_file.name)
392
+ temp_ref_file["Reference file"] = file_name_out
393
+
394
+ Matcher.ref_df = pd.concat([Matcher.ref_df, temp_ref_file])
395
+
396
+ # For the neural net model to work, the LLPG columns have to be in the LPI format (e.g. with columns SaoText, SaoStartNumber etc.). Here we check if we have that format.
397
+
398
+ if 'Address_LPI' in Matcher.ref_df.columns:
399
+ Matcher.ref_df = Matcher.ref_df.rename(columns={
400
+ "Name_LPI": "PaoText",
401
+ "Num_LPI": "PaoStartNumber",
402
+ "Num_Suffix_LPI":"PaoStartSuffix",
403
+ "Number End_LPI":"PaoEndNumber",
404
+ "Number_End_Suffix_LPI":"PaoEndSuffix",
405
+
406
+ "Secondary_Name_LPI":"SaoText",
407
+ "Secondary_Num_LPI":"SaoStartNumber",
408
+ "Secondary_Num_Suffix_LPI":"SaoStartSuffix",
409
+ "Secondary_Num_End_LPI":"SaoEndNumber",
410
+ "Secondary_Num_End_Suffix_LPI":"SaoEndSuffix",
411
+ "Postcode_LPI":"Postcode",
412
+ "Postal_Town_LPI":"PostTown",
413
+ "UPRN_BLPU": "UPRN"
414
+ })
415
+
416
+ #print("Matcher reference file: ", Matcher.ref_df['Reference file'])
417
+
418
+ # Check if the source is the Addressbase places API
419
+ if Matcher.ref_df.iloc[0]['Reference file'] == 'API' or '_api_' in Matcher.ref_df.iloc[0]['Reference file']:
420
+ Matcher.ref_df = Matcher.ref_df.rename(columns={
421
+ "ORGANISATION_NAME": "Organisation",
422
+ "ORGANISATION": "Organisation",
423
+ "PAO_TEXT": "PaoText",
424
+ "PAO_START_NUMBER": "PaoStartNumber",
425
+ "PAO_START_SUFFIX":"PaoStartSuffix",
426
+ "PAO_END_NUMBER":"PaoEndNumber",
427
+ "PAO_END_SUFFIX":"PaoEndSuffix",
428
+ "STREET_DESCRIPTION":"Street",
429
+
430
+ "SAO_TEXT":"SaoText",
431
+ "SAO_START_NUMBER":"SaoStartNumber",
432
+ "SAO_START_SUFFIX":"SaoStartSuffix",
433
+ "SAO_END_NUMBER":"SaoEndNumber",
434
+ "SAO_END_SUFFIX":"SaoEndSuffix",
435
+
436
+ "POSTCODE_LOCATOR":"Postcode",
437
+ "TOWN_NAME":"PostTown",
438
+ "LOCALITY_NAME":"LocalityName",
439
+ "ADMINISTRATIVE_AREA":"AdministrativeArea"
440
+ }, errors="ignore")
441
+
442
+ # Check ref_df file format
443
+ # If standard format, or it's an API call
444
+ if 'SaoText' in Matcher.ref_df.columns or in_api:
445
+ Matcher.standard_llpg_format = True
446
+ Matcher.ref_address_cols = ["Organisation", "SaoStartNumber", "SaoStartSuffix", "SaoEndNumber", "SaoEndSuffix", "SaoText", "PaoStartNumber", "PaoStartSuffix", "PaoEndNumber",
447
+ "PaoEndSuffix", "PaoText", "Street", "PostTown", "Postcode"]
448
+ # Add columns from the list if they don't exist
449
+ for col in Matcher.ref_address_cols:
450
+ if col not in Matcher.ref_df:
451
+ Matcher.ref_df[col] = np.nan
452
+ else:
453
+ Matcher.standard_llpg_format = False
454
+ Matcher.ref_address_cols = in_refcol
455
+ Matcher.ref_df = Matcher.ref_df.rename(columns={Matcher.ref_address_cols[-1]:"Postcode"})
456
+ Matcher.ref_address_cols[-1] = "Postcode"
457
+
458
+
459
+ # Reset index for ref_df as multiple files may have been combined with identical indices
460
+ Matcher.ref_df = Matcher.ref_df.reset_index() #.drop(["index","level_0"], axis = 1, errors="ignore").reset_index().drop(["index","level_0"], axis = 1, errors="ignore")
461
+ Matcher.ref_df.index.name = 'index'
462
+
463
+ return Matcher
464
+
465
+ def check_match_data_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api):
466
+ # Assign join field if not known
467
+ if not Matcher.search_df_key_field:
468
+ Matcher.search_df_key_field = "index"
469
+
470
+ # Set search address cols as entered column names
471
+ #print("In colnames in check match data: ", in_colnames)
472
+ Matcher.search_address_cols = in_colnames
473
+
474
+ # Check if data loaded already and bring it in
475
+ if not data_state.empty:
476
+
477
+ Matcher.search_df = data_state
478
+
479
+
480
+
481
+ Matcher.search_df['index'] = Matcher.search_df.index
482
+
483
+ else:
484
+ Matcher.search_df = pd.DataFrame()
485
+
486
+ # If someone has just entered open text, just load this instead
487
+ if in_text:
488
+ Matcher.search_df, Matcher.search_df_key_field, Matcher.search_address_cols, Matcher.search_postcode_col = prepare_search_address_string(in_text)
489
+
490
+ # If two matcher files are loaded in, the algorithm will combine them together
491
+ if Matcher.search_df.empty and in_file:
492
+ output_message, drop1, drop2, Matcher.search_df, results_data_state = initial_data_load(in_file)
493
+
494
+ file_list = [string.name for string in in_file]
495
+ data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
496
+
497
+ #print("Data file names: ", data_file_names)
498
+ Matcher.file_name = get_file_name(data_file_names[0])
499
+
500
+ # Create a column in search_df to use as the index
501
+ Matcher.search_df['index'] = Matcher.search_df.index
502
+
503
+
504
+ # Join previously created results file onto search_df if previous results file exists
505
+ if not results_data_state.empty:
506
+
507
+ print("Joining on previous results file")
508
+ Matcher.results_on_orig_df = results_data_state.copy()
509
+ Matcher.search_df = Matcher.search_df.merge(results_data_state, on = "index", how = "left")
510
+
511
+ # If no join on column suggested, assume the user wants the UPRN
512
+ # print("in_joincol: ", in_joincol)
513
+
514
+ if not in_joincol:
515
+ Matcher.new_join_col = ['UPRN']
516
+ #Matcher.new_join_col = Matcher.new_join_col#[0]
517
+
518
+ else:
519
+ Matcher.new_join_col = in_joincol
520
+ #Matcher.new_join_col = Matcher.new_join_col
521
+
522
+ # Extract the column names from the input data
523
+ print("In colnames: ", in_colnames)
524
+
525
+ if len(in_colnames) > 1:
526
+ Matcher.search_postcode_col = [in_colnames[-1]]
527
+
528
+ print("Postcode col: ", Matcher.search_postcode_col)
529
+
530
+ elif len(in_colnames) == 1:
531
+ Matcher.search_df['full_address_postcode'] = Matcher.search_df[in_colnames[0]]
532
+ Matcher.search_postcode_col = ['full_address_postcode']
533
+ Matcher.search_address_cols.append('full_address_postcode')
534
+
535
+ # Check for column that indicates there are existing matches. The code will then search this column for entries, and will remove them from the data to be searched
536
+ Matcher.existing_match_cols = in_existing
537
+
538
+ if in_existing:
539
+ if "Matched with reference address" in Matcher.search_df.columns:
540
+ Matcher.search_df.loc[~Matcher.search_df[in_existing].isna(), "Matched with reference address"] = True
541
+ else: Matcher.search_df["Matched with reference address"] = ~Matcher.search_df[in_existing].isna()
542
+
543
+ print("Shape of search_df before filtering is: ", Matcher.search_df.shape)
544
+
545
+ ### Filter addresses to those with length > 0
546
+ zero_length_search_df = Matcher.search_df.copy()[Matcher.search_address_cols]
547
+ zero_length_search_df = zero_length_search_df.fillna('').infer_objects(copy=False)
548
+ Matcher.search_df["address_cols_joined"] = zero_length_search_df.astype(str).sum(axis=1).str.strip()
549
+
550
+ length_more_than_0 = Matcher.search_df["address_cols_joined"].str.len() > 0
551
+
552
+
553
+ ### Filter addresses to postcode areas present in both search_df and ref_df only (the postcode with the last two characters removed). Only run when no API call is made - when the API is called, the relevant addresses and postcodes are brought in by the API itself.
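+ # Worked example of the blocking key as computed below: "sw1a 1aa" -> strip/upper/remove spaces ->
+ # "SW1A1AA" -> drop the last two characters -> "SW1A1", so records only need to share this prefix.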
554
+ if not in_api:
555
+ if Matcher.filter_to_lambeth_pcodes == True:
556
+ Matcher.search_df["postcode_search_area"] = Matcher.search_df[Matcher.search_postcode_col[0]].str.strip().str.upper().str.replace(" ", "").str[:-2]
557
+ Matcher.ref_df["postcode_search_area"] = Matcher.ref_df["Postcode"].str.strip().str.upper().str.replace(" ", "").str[:-2]
558
+
559
+ unique_ref_pcode_area = (Matcher.ref_df["postcode_search_area"][Matcher.ref_df["postcode_search_area"].str.len() > 3]).unique()
560
+ postcode_found_in_search = Matcher.search_df["postcode_search_area"].isin(unique_ref_pcode_area)
561
+
562
+ Matcher.search_df["Excluded from search"] = "Included in search"
563
+ Matcher.search_df.loc[~(postcode_found_in_search), "Excluded from search"] = "Postcode area not found"
564
+ Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
565
+ Matcher.pre_filter_search_df = Matcher.search_df.copy()#.drop(["index", "level_0"], axis = 1, errors = "ignore").reset_index()
566
+ Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("address_cols_joined", axis = 1)
567
+
568
+ Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
569
+ Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
570
+
571
+
572
+ # Exclude records that have already been matched separately, i.e. if the 'Matched with reference address' column exists and has True values in it
573
+ if "Matched with reference address" in Matcher.search_df.columns:
574
+ previously_matched = Matcher.pre_filter_search_df["Matched with reference address"] == True
575
+ Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
576
+
577
+ Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0) | (previously_matched)]
578
+ Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0) & ~(previously_matched)]
579
+
580
+ else:
581
+ Matcher.excluded_df = Matcher.search_df.copy()[~(postcode_found_in_search) | ~(length_more_than_0)]
582
+ Matcher.search_df = Matcher.search_df[(postcode_found_in_search) & (length_more_than_0)]
583
+
584
+ print("Shape of ref_df before filtering is: ", Matcher.ref_df.shape)
585
+
586
+ unique_search_pcode_area = (Matcher.search_df["postcode_search_area"]).unique()
587
+ postcode_found_in_ref = Matcher.ref_df["postcode_search_area"].isin(unique_search_pcode_area)
588
+ Matcher.ref_df = Matcher.ref_df[postcode_found_in_ref]
589
+
590
+ Matcher.pre_filter_search_df = Matcher.pre_filter_search_df.drop("postcode_search_area", axis = 1)
591
+ Matcher.search_df = Matcher.search_df.drop("postcode_search_area", axis = 1)
592
+ Matcher.ref_df = Matcher.ref_df.drop("postcode_search_area", axis = 1)
593
+ Matcher.excluded_df = Matcher.excluded_df.drop("postcode_search_area", axis = 1)
594
+ else:
595
+ Matcher.pre_filter_search_df = Matcher.search_df.copy()
596
+ Matcher.search_df.loc[~(length_more_than_0), "Excluded from search"] = "Address length 0"
597
+
598
+ Matcher.excluded_df = Matcher.search_df[~(length_more_than_0)]
599
+ Matcher.search_df = Matcher.search_df[length_more_than_0]
600
+
601
+
602
+ Matcher.search_df = Matcher.search_df.drop("address_cols_joined", axis = 1, errors="ignore")
603
+ Matcher.excluded_df = Matcher.excluded_df.drop("address_cols_joined", axis = 1, errors="ignore")
604
+
605
+ Matcher.search_df_not_matched = Matcher.search_df
606
+
607
+
608
+ # If this is for an API call, we need to convert the search_df address columns to one column now. This is so the API call can be made and the reference dataframe created.
609
+ if in_api:
610
+
611
+ if in_file:
612
+ output_message, drop1, drop2, df, results_data_state = initial_data_load(in_file)
613
+
614
+ file_list = [string.name for string in in_file]
615
+ data_file_names = [string for string in file_list if "results_on_orig" not in string.lower()]
616
+
617
+ Matcher.file_name = get_file_name(data_file_names[0])
618
+
619
+ else:
620
+ if in_text:
621
+ Matcher.file_name = in_text
622
+ else:
623
+ Matcher.file_name = "API call"
624
+
625
+ # Exclude records that have already been matched separately, i.e. if the 'Matched with reference address' column exists and has True values in it
626
+ if in_existing:
627
+ print("Checking for previously matched records")
628
+ Matcher.pre_filter_search_df = Matcher.search_df.copy()
629
+ previously_matched = ~Matcher.pre_filter_search_df[in_existing].isnull()
630
+ Matcher.pre_filter_search_df.loc[previously_matched, "Excluded from search"] = "Previously matched"
631
+
632
+ Matcher.excluded_df = Matcher.search_df.copy()[~(length_more_than_0) | (previously_matched)]
633
+ Matcher.search_df = Matcher.search_df[(length_more_than_0) & ~(previously_matched)]
634
+
635
+ if type(Matcher.search_df) == str: search_df_cleaned, search_df_key_field, search_address_cols = prepare_search_address_string(Matcher.search_df)
636
+ else: search_df_cleaned = prepare_search_address(Matcher.search_df, Matcher.search_address_cols, Matcher.search_postcode_col, Matcher.search_df_key_field)
637
+
638
+
639
+ Matcher.search_df['full_address_postcode'] = search_df_cleaned["full_address"]
640
+ #Matcher.search_df = Matcher.search_df.reset_index(drop=True)
641
+ #Matcher.search_df.index.name = 'index'
642
+
643
+ return Matcher
644
+
645
+ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, Matcher, in_api, in_api_key):
646
+ '''
647
+ Load in user inputs from the Gradio interface. Convert all input types (single address, or csv input) into standardised data format that can be used downstream for the fuzzy matching.
648
+ '''
649
+ today_rev = datetime.now().strftime("%Y%m%d")
650
+
651
+ # Abort flag for if it's not even possible to attempt the first stage of the match for some reason
652
+ Matcher.abort_flag = False
653
+
654
+ ### ref_df FILES ###
655
+ # If not an API call, run this first
656
+ if not in_api:
657
+ Matcher = check_ref_data_exists(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
658
+
659
+ ### MATCH/SEARCH FILES ###
660
+ # If doing API calls, we need to know the search data before querying for specific addresses/postcodes
661
+ Matcher = check_match_data_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)
662
+
663
+
664
+ # If an API call, ref_df data is loaded after
665
+ if in_api:
666
+ Matcher = check_ref_data_exists(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
667
+
668
+ #print("Resetting index.")
669
+ # API-called data will often have duplicate indexes in it - drop them to avoid conflicts down the line
670
+ #Matcher.ref_df = Matcher.ref_df.reset_index(drop = True)
671
+
672
+ print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
673
+ print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
674
+
675
+ Matcher.match_outputs_name = "diagnostics_initial_" + today_rev + ".csv"
676
+ Matcher.results_orig_df_name = "results_initial_" + today_rev + ".csv"
677
+
678
+ #Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
679
+ #Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
680
+
681
+ return Matcher
682
+
683
+ # DF preparation functions
684
+
685
+ # Run batch of matches
686
+ def run_match_batch(InitialMatch, batch_n, total_batches, progress=gr.Progress()):
687
+ if run_fuzzy_match == True:
688
+
689
+ overall_tic = time.perf_counter()
690
+
691
+ progress(0, desc= "Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Fuzzy match - non-standardised dataset")
692
+ df_name = "Fuzzy not standardised"
693
+
694
+ ''' FUZZY MATCHING '''
695
+
696
+ ''' Run fuzzy match on non-standardised dataset '''
697
+
698
+ FuzzyNotStdMatch = orchestrate_match_run(Matcher = copy.copy(InitialMatch), standardise = False, nnet = False, file_stub= "not_std_", df_name = df_name)
699
+
700
+ if FuzzyNotStdMatch.abort_flag == True:
701
+ message = "Nothing to match! Aborting address check."
702
+ print(message)
703
+ return message, InitialMatch
704
+
705
+ FuzzyNotStdMatch = combine_two_matches(InitialMatch, FuzzyNotStdMatch, df_name)
706
+
707
+ if (len(FuzzyNotStdMatch.search_df_not_matched) == 0) | (sum(FuzzyNotStdMatch.match_results_output[FuzzyNotStdMatch.match_results_output['full_match']==False]['fuzzy_score'])==0):
708
+ overall_toc = time.perf_counter()
709
+ time_out = f"The fuzzy match script took {overall_toc - overall_tic:0.1f} seconds"
710
+ FuzzyNotStdMatch.output_summary = FuzzyNotStdMatch.output_summary + " Neural net match not attempted. "# + time_out
711
+ return FuzzyNotStdMatch.output_summary, FuzzyNotStdMatch
712
+
713
+ ''' Run fuzzy match on standardised dataset '''
714
+
715
+ progress(.25, desc="Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Fuzzy match - standardised dataset")
716
+ df_name = "Fuzzy standardised"
717
+
718
+ FuzzyStdMatch = orchestrate_match_run(Matcher = copy.copy(FuzzyNotStdMatch), standardise = True, nnet = False, file_stub= "std_", df_name = df_name)
719
+ FuzzyStdMatch = combine_two_matches(FuzzyNotStdMatch, FuzzyStdMatch, df_name)
720
+
721
+ ''' Continue if reference file in correct format, and neural net model exists. Also if data not too long '''
722
+ if ((len(FuzzyStdMatch.search_df_not_matched) == 0) | (FuzzyStdMatch.standard_llpg_format == False) |\
723
+ (os.path.exists(FuzzyStdMatch.model_dir_name + '/saved_model.zip') == False) | (run_nnet_match == False)):
724
+ overall_toc = time.perf_counter()
725
+ time_out = f"The fuzzy match script took {overall_toc - overall_tic:0.1f} seconds"
726
+ FuzzyStdMatch.output_summary = FuzzyStdMatch.output_summary + " Neural net match not attempted. "# + time_out
727
+ return FuzzyStdMatch.output_summary, FuzzyStdMatch
728
+
729
+ if run_nnet_match == True:
730
+
731
+ ''' NEURAL NET '''
732
+
733
+ if run_fuzzy_match == False:
734
+ FuzzyStdMatch = copy.copy(InitialMatch)
735
+ overall_tic = time.perf_counter()
736
+
737
+ ''' First on non-standardised addresses '''
738
+ progress(.50, desc="Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Neural net - non-standardised dataset")
739
+ df_name = "Neural net not standardised"
740
+
741
+ FuzzyNNetNotStdMatch = orchestrate_match_run(Matcher = copy.copy(FuzzyStdMatch), standardise = False, nnet = True, file_stub= "nnet_not_std_", df_name = df_name)
742
+ FuzzyNNetNotStdMatch = combine_two_matches(FuzzyStdMatch, FuzzyNNetNotStdMatch, df_name)
743
+
744
+ if (len(FuzzyNNetNotStdMatch.search_df_not_matched) == 0):
745
+ overall_toc = time.perf_counter()
746
+ time_out = f"The whole match script took {overall_toc - overall_tic:0.1f} seconds"
747
+ FuzzyNNetNotStdMatch.output_summary = FuzzyNNetNotStdMatch.output_summary# + time_out
748
+ return FuzzyNNetNotStdMatch.output_summary, FuzzyNNetNotStdMatch
749
+
750
+ ''' Next on standardised addresses '''
751
+ progress(.75, desc="Batch " + str(batch_n+1) + " of " + str(total_batches) + ". Neural net - standardised dataset")
752
+ df_name = "Neural net standardised"
753
+
754
+ FuzzyNNetStdMatch = orchestrate_match_run(Matcher = copy.copy(FuzzyNNetNotStdMatch), standardise = True, nnet = True, file_stub= "nnet_std_", df_name = df_name)
755
+ FuzzyNNetStdMatch = combine_two_matches(FuzzyNNetNotStdMatch, FuzzyNNetStdMatch, df_name)
756
+
757
+ if run_fuzzy_match == False:
758
+ overall_toc = time.perf_counter()
759
+ time_out = f"The neural net match script took {overall_toc - overall_tic:0.1f} seconds"
760
+ FuzzyNNetStdMatch.output_summary = FuzzyNNetStdMatch.output_summary + " Only Neural net match attempted. "# + time_out
761
+ return FuzzyNNetStdMatch.output_summary, FuzzyNNetStdMatch
762
+
763
+ overall_toc = time.perf_counter()
764
+ time_out = f"The whole match script took {overall_toc - overall_tic:0.1f} seconds"
765
+
766
+ summary_of_summaries = FuzzyNotStdMatch.output_summary + "\n" + FuzzyStdMatch.output_summary + "\n" + FuzzyNNetStdMatch.output_summary + "\n" + time_out
767
+
768
+ return summary_of_summaries, FuzzyNNetStdMatch
769
+
770
+ # Overarching functions
771
+ def orchestrate_match_run(Matcher, standardise = False, nnet = False, file_stub= "not_std_", df_name = "Fuzzy not standardised"):
772
+
773
+ today_rev = datetime.now().strftime("%Y%m%d")
774
+
775
+ #print(Matcher.standardise)
776
+ Matcher.standardise = standardise
777
+
778
+ if Matcher.search_df_not_matched.empty:
779
+ print("Nothing to match! At start of preparing run.")
780
+ return Matcher
781
+
782
+ if nnet == False:
783
+ diag_shortlist,\
784
+ diag_best_match,\
785
+ match_results_output,\
786
+ results_on_orig_df,\
787
+ summary,\
788
+ search_address_cols =\
789
+ full_fuzzy_match(Matcher.search_df_not_matched.copy(),
790
+ Matcher.standardise,
791
+ Matcher.search_df_key_field,
792
+ Matcher.search_address_cols,
793
+ Matcher.search_df_cleaned,
794
+ Matcher.search_df_after_stand,
795
+ Matcher.search_df_after_full_stand,
796
+ Matcher.ref_df_cleaned,
797
+ Matcher.ref_df_after_stand,
798
+ Matcher.ref_df_after_full_stand,
799
+ Matcher.fuzzy_match_limit,
800
+ Matcher.fuzzy_scorer_used)
801
+ if match_results_output.empty:
802
+ print("Match results empty")
803
+ Matcher.abort_flag = True
804
+ return Matcher
805
+
806
+ else:
807
+ Matcher.diag_shortlist = diag_shortlist
808
+ Matcher.diag_best_match = diag_best_match
809
+ Matcher.match_results_output = match_results_output
810
+
811
+ else:
812
+ match_results_output,\
813
+ results_on_orig_df,\
814
+ summary,\
815
+ predict_df_nnet =\
816
+ full_nn_match(
817
+ Matcher.ref_address_cols,
818
+ Matcher.search_df_not_matched.copy(),
819
+ Matcher.search_address_cols,
820
+ Matcher.search_df_key_field,
821
+ Matcher.standardise,
822
+ Matcher.exported_model[0],
823
+ Matcher.matching_variables,
824
+ Matcher.text_columns,
825
+ Matcher.weights,
826
+ Matcher.fuzzy_method,
827
+ Matcher.score_cut_off,
828
+ Matcher.match_results_output.copy(),
829
+ Matcher.filter_to_lambeth_pcodes,
830
+ Matcher.model_type,
831
+ Matcher.word_to_index,
832
+ Matcher.cat_to_idx,
833
+ Matcher.device,
834
+ Matcher.vocab,
835
+ Matcher.labels_list,
836
+ Matcher.search_df_cleaned,
837
+ Matcher.ref_df_after_stand,
838
+ Matcher.search_df_after_stand,
839
+ Matcher.search_df_after_full_stand)
840
+
841
+ if match_results_output.empty:
842
+ print("Match results empty")
843
+ Matcher.abort_flag = True
844
+ return Matcher
845
+ else:
846
+ Matcher.match_results_output = match_results_output
847
+ Matcher.predict_df_nnet = predict_df_nnet
848
+
849
+ # Save to file
850
+ Matcher.results_on_orig_df = results_on_orig_df
851
+
852
+ Matcher.summary = summary
853
+
854
+ Matcher.output_summary = create_match_summary(Matcher.match_results_output, df_name = df_name)
855
+
856
+ Matcher.match_outputs_name = "diagnostics_" + file_stub + today_rev + ".csv"
857
+ Matcher.results_orig_df_name = "results_" + file_stub + today_rev + ".csv"
858
+
859
+ Matcher.match_results_output.to_csv(Matcher.match_outputs_name, index = None)
860
+ Matcher.results_on_orig_df.to_csv(Matcher.results_orig_df_name, index = None)
861
+
862
+ return Matcher
863
+
864
+ # Overarching fuzzy match function
865
+ def full_fuzzy_match(search_df:PandasDataFrame,
866
+ standardise:bool,
867
+ search_df_key_field:str,
868
+ search_address_cols:List[str],
869
+ search_df_cleaned:PandasDataFrame,
870
+ search_df_after_stand:PandasDataFrame,
871
+ search_df_after_full_stand:PandasDataFrame,
872
+ ref_df_cleaned:PandasDataFrame,
873
+ ref_df_after_stand:PandasDataFrame,
874
+ ref_df_after_full_stand:PandasDataFrame,
875
+ fuzzy_match_limit:float,
876
+ fuzzy_scorer_used:str,
877
+ new_join_col:List[str]=["UPRN"],
878
+ fuzzy_search_addr_limit:float = 100,
879
+ filter_to_lambeth_pcodes:bool=False):
880
+
881
+ '''
882
+ Compare addresses in a 'search address' dataframe with a 'reference address' dataframe by using fuzzy matching from the rapidfuzz package, blocked by postcode and then street.
883
+ '''
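+ # Illustrative sketch only (the real scoring happens in string_match_by_post_code_multiple):
+ # scoring one search address against the reference addresses in its postcode block with rapidfuzz
+ # might look roughly like this:
+ #   from rapidfuzz import fuzz, process
+ #   candidates = ["5 HIGH STREET LONDON", "7 HIGH STREET LONDON"]  # addresses sharing the postcode block
+ #   process.extract("FLAT 2 5 HIGH ST LONDON", candidates, scorer=fuzz.token_sort_ratio, limit=100)
+ #   # -> [(candidate, score, index), ...]; matches are kept if the score beats fuzzy_match_limit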
884
+
885
+ # Break if search item has length 0
886
+ if search_df.empty:
887
+ out_error = "Nothing to match! Just started fuzzy match."
888
+ print(out_error)
889
+ return pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(), out_error,search_address_cols
890
+
891
+ # If standardise is true, replace relevant variables with standardised versions
892
+ if standardise == True:
893
+ df_name = "standardised address"
894
+ search_df_after_stand = search_df_after_full_stand
895
+ ref_df_after_stand = ref_df_after_full_stand
896
+ else:
897
+ df_name = "non-standardised address"
898
+
899
+ # RUN WITH POSTCODE AS A BLOCKER #
900
+ # Fuzzy match against reference addresses
901
+
902
+ # Remove rows from ref search series where postcode is not found in the search_df
903
+ search_df_after_stand_series = search_df_after_stand.copy().set_index('postcode_search')['search_address_stand'].sort_index()
904
+ ref_df_after_stand_series = ref_df_after_stand.copy().set_index('postcode_search')['ref_address_stand'].sort_index()
905
+
906
+ #print(search_df_after_stand_series.index.tolist())
907
+ #print(ref_df_after_stand_series.index.tolist())
908
+
909
+ ref_df_after_stand_series_checked = ref_df_after_stand_series.copy()[ref_df_after_stand_series.index.isin(search_df_after_stand_series.index.tolist())]
910
+
911
+ # pd.DataFrame(ref_df_after_stand_series_checked.to_csv("ref_df_after_stand_series_checked.csv"))
912
+
913
+ if len(ref_df_after_stand_series_checked) == 0:
914
+ print("Nothing relevant in reference data to match!")
915
+ return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),pd.DataFrame(),"Nothing relevant in reference data to match!",search_address_cols
916
+
917
+ # 'matched' is the list of search addresses, each of which is looked up against the reference list (the ref_df).
918
+
919
+ print("Starting the fuzzy match")
920
+
921
+ tic = time.perf_counter()
922
+ results = string_match_by_post_code_multiple(match_address_series = search_df_after_stand_series.copy(),
923
+ reference_address_series = ref_df_after_stand_series_checked,
924
+ search_limit = fuzzy_search_addr_limit,
925
+ scorer_name = fuzzy_scorer_used)
926
+
927
+ toc = time.perf_counter()
928
+ print(f"Performed the fuzzy match in {toc - tic:0.1f} seconds")
929
+
930
+
931
+ # Create result dfs
932
+ match_results_output, diag_shortlist, diag_best_match = _create_fuzzy_match_results_output(results, search_df_after_stand, ref_df_cleaned, ref_df_after_stand, fuzzy_match_limit, search_df_cleaned, search_df_key_field, new_join_col, standardise, blocker_col = "Postcode")
933
+
934
+ match_results_output['match_method'] = "Fuzzy match - postcode"
935
+
936
+ search_df_not_matched = filter_not_matched(match_results_output, search_df_after_stand, search_df_key_field)
937
+
938
+
939
+ # If nothing left to match, break
940
+ if (sum(match_results_output['full_match']==False) == 0) | (sum(match_results_output[match_results_output['full_match']==False]['fuzzy_score'])==0):
941
+ print("Nothing left to match!")
942
+
943
+ summary = create_match_summary(match_results_output, df_name)
944
+
945
+ if type(search_df) != str:
946
+ results_on_orig_df = join_to_orig_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
947
+ else: results_on_orig_df = match_results_output
948
+
949
+ return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols
950
+
951
+
952
+ # RUN WITH STREET AS A BLOCKER #
953
+
954
+ ### Redo with street as blocker
955
+ search_df_after_stand_street = search_df_not_matched.copy()
956
+ search_df_after_stand_street['search_address_stand_w_pcode'] = search_df_after_stand_street['search_address_stand'] + " " + search_df_after_stand_street['postcode_search']
957
+ ref_df_after_stand['ref_address_stand_w_pcode'] = ref_df_after_stand['ref_address_stand'] + " " + ref_df_after_stand['postcode_search']
958
+
959
+ search_df_after_stand_street['street']= search_df_after_stand_street['full_address_search'].apply(extract_street_name)
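+ # extract_street_name is defined elsewhere; for illustration it is assumed to behave like
+ # extract_street_name("12A EXAMPLE ROAD LONDON SW1A 1AA") -> "EXAMPLE ROAD", giving a street-level
+ # blocking key for records that the postcode blocker could not match.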
960
+ # Exclude non-postal addresses from street-blocked search
961
+ search_df_after_stand_street.loc[search_df_after_stand_street['Excluded from search'] == "Excluded - non-postal address", 'street'] = ""
962
+
963
+ ### Create lookup lists
964
+ search_df_match_series_street = search_df_after_stand_street.copy().set_index('street')['search_address_stand']
965
+ ref_df_after_stand_series_street = ref_df_after_stand.copy().set_index('Street')['ref_address_stand']
966
+
967
+ # Remove rows where the street is not in the ref_df
968
+ #index_check = ref_df_after_stand_series_street.index.isin(search_df_match_series_street.index)
969
+ #ref_df_after_stand_series_street_checked = ref_df_after_stand_series_street.copy()[index_check == True]
970
+
971
+ ref_df_after_stand_series_street_checked = ref_df_after_stand_series_street.copy()[ref_df_after_stand_series_street.index.isin(search_df_match_series_street.index.tolist())]
972
+
973
+ # If nothing left to match, break
974
+ if (len(ref_df_after_stand_series_street_checked) == 0) | ((len(search_df_match_series_street) == 0)):
975
+
976
+ summary = create_match_summary(match_results_output, df_name)
977
+
978
+ if type(search_df) != str:
979
+ results_on_orig_df = join_to_orig_df(match_results_output, search_df_after_stand, search_df_key_field, new_join_col)
980
+ else: results_on_orig_df = match_results_output
981
+
982
+ return diag_shortlist, diag_best_match,\
983
+ match_results_output, results_on_orig_df, summary, search_address_cols
984
+
985
+ print("Starting the fuzzy match with street as blocker")
986
+
987
+ tic = time.perf_counter()
988
+ results_st = string_match_by_post_code_multiple(match_address_series = search_df_match_series_street.copy(),
989
+ reference_address_series = ref_df_after_stand_series_street_checked.copy(),
990
+ search_limit = fuzzy_search_addr_limit,
991
+ scorer_name = fuzzy_scorer_used)
992
+
993
+ toc = time.perf_counter()
994
+
995
+ print(f"Performed the fuzzy match in {toc - tic:0.1f} seconds")
996
+
997
+ match_results_output_st, diag_shortlist_st, diag_best_match_st = _create_fuzzy_match_results_output(results_st, search_df_after_stand_street, ref_df_cleaned, ref_df_after_stand,\
998
+ fuzzy_match_limit, search_df_cleaned, search_df_key_field, new_join_col, standardise, blocker_col = "Street")
999
+ match_results_output_st['match_method'] = "Fuzzy match - street"
1000
+
1001
+ match_results_output_st_out = combine_std_df_remove_dups(match_results_output, match_results_output_st, orig_addr_col = search_df_key_field)
1002
+
1003
+ match_results_output = match_results_output_st_out
1004
+
1005
+ summary = create_match_summary(match_results_output, df_name)
1006
+
1007
+ ### Join UPRN back onto orig df
1008
+
1009
+ if type(search_df) != str:
1010
+ results_on_orig_df = join_to_orig_df(match_results_output, search_df_cleaned, search_df_key_field, new_join_col)
1011
+ else: results_on_orig_df = match_results_output
1012
+
1013
+ return diag_shortlist, diag_best_match, match_results_output, results_on_orig_df, summary, search_address_cols
1014
+
1015
+ # Overarching NN function
1016
+ def full_nn_match(ref_address_cols:List[str],
1017
+ search_df:PandasDataFrame,
1018
+ search_address_cols:List[str],
1019
+ search_df_key_field:str,
1020
+ standardise:bool,
1021
+ exported_model:list,
1022
+ matching_variables:List[str],
1023
+ text_columns:List[str],
1024
+ weights:dict,
1025
+ fuzzy_method:str,
1026
+ score_cut_off:float,
1027
+ match_results:PandasDataFrame,
1028
+ filter_to_lambeth_pcodes:bool,
1029
+ model_type:str,
1030
+ word_to_index:dict,
1031
+ cat_to_idx:dict,
1032
+ device:str,
1033
+ vocab:List[str],
1034
+ labels_list:List[str],
1035
+ search_df_cleaned:PandasDataFrame,
1036
+ ref_df_after_stand:PandasDataFrame,
1037
+ search_df_after_stand:PandasDataFrame,
1038
+ search_df_after_full_stand:PandasDataFrame,
1039
+ new_join_col:List=["UPRN"]):
1040
+ '''
1041
+ Use a neural network model to partition 'search addresses' into constituent parts in the format of UK Ordnance Survey Land Property Identifier (LPI) addresses. These address components are compared individually against reference addresses in the same format to give an overall match score using the recordlinkage package.
1042
+ '''
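+ # Illustrative sketch only (not the exact code used in score_based_match): comparing parsed LPI
+ # components with the recordlinkage package, blocked on Postcode, might look roughly like this:
+ #   import recordlinkage
+ #   indexer = recordlinkage.Index()
+ #   indexer.block("Postcode")
+ #   pairs = indexer.index(predict_df, ref_df)
+ #   compare = recordlinkage.Compare()
+ #   compare.string("PaoStartNumber", "PaoStartNumber", method="jarowinkler")
+ #   compare.string("Street", "Street", method="jarowinkler")
+ #   scores = compare.compute(pairs, predict_df, ref_df)  # weighted and thresholded downstream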
1043
+
1044
+ # Break if search item has length 0
1045
+ if search_df.empty:
1046
+ out_error = "Nothing to match!"
1047
+ print(out_error)
1048
+ return pd.DataFrame(), pd.DataFrame(), out_error, pd.DataFrame() # match the normal (match_results, results_on_orig_df, summary, predict_df) return shape
1049
+
1050
+ # If it is the standardisation step, or you have come from the fuzzy match area
1051
+ if (standardise == True): # | (run_fuzzy_match == True & standardise == False):
1052
+ df_name = "standardised address"
1053
+
1054
+ search_df_after_stand = search_df_after_full_stand
1055
+
1056
+ else:
1057
+ df_name = "non-standardised address"
1058
+
1059
+ print(search_df_after_stand.shape[0])
1060
+ print(ref_df_after_stand.shape[0])
1061
+
1062
+ # Predict on search data to extract LPI address components
1063
+
1064
+ #predict_len = len(search_df_cleaned["full_address"])
1065
+ all_columns = list(search_df_cleaned) # Creates list of all column headers
1066
+ search_df_cleaned[all_columns] = search_df_cleaned[all_columns].astype(str)
1067
+ predict_data = list(search_df_after_stand['search_address_stand'])
1068
+
1069
+ ### Run predict function
1070
+ print("Starting neural net prediction for " + str(len(predict_data)) + " addresses")
1071
+
1072
+ tic = time.perf_counter()
1073
+
1074
+ # Determine the number of chunks
1075
+ num_chunks = math.ceil(len(predict_data) / max_predict_len)
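+ # For example (max_predict_len is a constant defined elsewhere in this module): with
+ # max_predict_len = 10000, a list of 25000 addresses is predicted in ceil(25000 / 10000) = 3 chunks.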
1076
+ list_out_all = []
1077
+ predict_df_all = []
1078
+
1079
+ for i in range(num_chunks):
1080
+ print("Starting to predict batch " + str(i+ 1) + " of " + str(num_chunks) + " batches.")
1081
+
1082
+ start_idx = i * max_predict_len
1083
+ end_idx = start_idx + max_predict_len
1084
+
1085
+ # Extract the current chunk of data
1086
+ chunk_data = predict_data[start_idx:end_idx]
1087
+
1088
+ # Replace blank strings with a single space
1089
+ chunk_data = [" " if s in ("") else s for s in chunk_data]
1090
+
1091
+ if (model_type == "gru") | (model_type == "lstm"):
1092
+ list_out, predict_df = full_predict_torch(model=exported_model, model_type=model_type,
1093
+ input_text=chunk_data, word_to_index=word_to_index,
1094
+ cat_to_idx=cat_to_idx, device=device)
1095
+ else:
1096
+ list_out, predict_df = full_predict_func(chunk_data, exported_model, vocab, labels_list)
1097
+
1098
+ # Append the results
1099
+ list_out_all.extend(list_out)
1100
+ predict_df_all.append(predict_df)
1101
+
1102
+ # Concatenate all the results dataframes
1103
+ predict_df_all = pd.concat(predict_df_all, ignore_index=True)
1104
+
1105
+ toc = time.perf_counter()
1106
+
1107
+ print(f"Performed the NN prediction in {toc - tic:0.1f} seconds")
1108
+
1109
+ predict_df = post_predict_clean(predict_df=predict_df_all, orig_search_df=search_df_cleaned,
1110
+ ref_address_cols=ref_address_cols, search_df_key_field=search_df_key_field)
1111
+
1112
+ # Score-based matching between neural net predictions and fuzzy match results
1113
+
1114
+ # Example of recordlinkage package in use: https://towardsdatascience.com/how-to-perform-fuzzy-dataframe-row-matching-with-recordlinkage-b53ca0cb944c
1115
+
1116
+ ## Run with Postcode as blocker column
1117
+
1118
+ blocker_column = ["Postcode"]
1119
+
1120
+ scoresSBM_best_pc, matched_output_SBM_pc = score_based_match(predict_df_search = predict_df.copy(), ref_search = ref_df_after_stand.copy(),
1121
+ orig_search_df = search_df_after_stand, matching_variables = matching_variables,
1122
+ text_columns = text_columns, blocker_column = blocker_column, weights = weights, fuzzy_method = fuzzy_method, score_cut_off = score_cut_off, search_df_key_field=search_df_key_field, standardise=standardise, new_join_col=new_join_col)
1123
+
1124
+ if matched_output_SBM_pc.empty:
1125
+ error_message = "Match results empty. Exiting neural net match."
1126
+ print(error_message)
1127
+
1128
+ return pd.DataFrame(),pd.DataFrame(), error_message, predict_df
1129
+
1130
+ else:
1131
+ matched_output_SBM_pc["match_method"] = "Neural net - Postcode"
1132
+
1133
+ match_results_output_final_pc = combine_std_df_remove_dups(match_results, matched_output_SBM_pc, orig_addr_col = search_df_key_field)
1134
+
1135
+ summary_pc = create_match_summary(match_results_output_final_pc, df_name = "NNet blocked by Postcode " + df_name)
1136
+ print(summary_pc)
1137
+
1138
+ ## Run with Street as blocker column
1139
+
1140
+ blocker_column = ["Street"]
1141
+
1142
+ scoresSBM_best_st, matched_output_SBM_st = score_based_match(predict_df_search = predict_df.copy(), ref_search = ref_df_after_stand.copy(),
1143
+ orig_search_df = search_df_after_stand, matching_variables = matching_variables,
1144
+ text_columns = text_columns, blocker_column = blocker_column, weights = weights, fuzzy_method = fuzzy_method, score_cut_off = score_cut_off, search_df_key_field=search_df_key_field, standardise=standardise, new_join_col=new_join_col)
1145
+
1146
+ # If no matching pairs are found in the function above then it returns 0 - below we replace these values with the postcode blocker values (which should almost always find at least one pair unless it's a very unusual situation)
1147
+ if isinstance(matched_output_SBM_st, int) or matched_output_SBM_st.empty:
1148
+ print("Nothing to match for street block")
1149
+
1150
+ matched_output_SBM_st = matched_output_SBM_pc
1151
+ matched_output_SBM_st["match_method"] = "Neural net - Postcode" #+ standard_label
1152
+ else: matched_output_SBM_st["match_method"] = "Neural net - Street" #+ standard_label
1153
+
1154
+ ### Join together old match df with new (model) match df
1155
+
1156
+ match_results_output_final_st = combine_std_df_remove_dups(match_results_output_final_pc,matched_output_SBM_st, orig_addr_col = search_df_key_field)
1157
+
1158
+ summary_street = create_match_summary(match_results_output_final_st, df_name = "NNet blocked by Street " + df_name)
1159
+ print(summary_street)
1160
+
1161
+ # I decided in the end not to use PaoStartNumber as a blocker column. I get only a couple more matches in general for a big increase in processing time
1162
+
1163
+ matched_output_SBM_po = matched_output_SBM_st
1164
+ matched_output_SBM_po["match_method"] = "Neural net - Street" #+ standard_label
1165
+
1166
+ match_results_output_final_po = match_results_output_final_st
1167
+ match_results_output_final_three = match_results_output_final_po
1168
+
1169
+ summary_three = create_match_summary(match_results_output_final_three, df_name = "fuzzy and nn model street + postcode " + df_name)
1170
+
1171
+ ### Join UPRN back onto orig df
1172
+
1173
+ if type(search_df) != str:
1174
+ results_on_orig_df = join_to_orig_df(match_results_output_final_three, search_df_after_stand, search_df_key_field, new_join_col)
1175
+ else: results_on_orig_df = match_results_output_final_three
1176
+
1177
+ return match_results_output_final_three, results_on_orig_df, summary_three, predict_df
1178
+
1179
+
1180
+ # Combiner/summary functions
1181
+ def combine_std_df_remove_dups(df_not_std, df_std, orig_addr_col = "search_orig_address", match_address_series = "full_match", keep_only_duplicated = False):
1182
+
1183
+ if (df_not_std.empty) & (df_std.empty):
1184
+ return df_not_std
1185
+
1186
+ combined_std_not_matches = pd.concat([df_not_std, df_std])#, ignore_index=True)
1187
+
1188
+ if combined_std_not_matches.empty: #| ~(match_address_series in combined_std_not_matches.columns) | ~(orig_addr_col in combined_std_not_matches.columns):
1189
+ combined_std_not_matches[match_address_series] = False
1190
+
1191
+ if "full_address" in combined_std_not_matches.columns:
1192
+ combined_std_not_matches[orig_addr_col] = combined_std_not_matches["full_address"]
1193
+ combined_std_not_matches["fuzzy_score"] = 0
1194
+ return combined_std_not_matches
1195
+
1196
+ combined_std_not_matches = combined_std_not_matches.sort_values([orig_addr_col, match_address_series], ascending=False)
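+ # For example: if the same search record appears in both inputs, once with full_match == True and
+ # once with False, sorting descending puts the matched row first, so the drop_duplicates below keeps it.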
1197
+
1198
+ if keep_only_duplicated == True:
1199
+ combined_std_not_matches = combined_std_not_matches[combined_std_not_matches.duplicated(orig_addr_col)]
1200
+
1201
+ combined_std_not_matches_no_dups = combined_std_not_matches.drop_duplicates(orig_addr_col).sort_index()
1202
+
1203
+ return combined_std_not_matches_no_dups
1204
+
1205
+ def combine_two_matches(OrigMatchClass, NewMatchClass, df_name):
1206
+
1207
+ today_rev = datetime.now().strftime("%Y%m%d")
1208
+
1209
+ NewMatchClass.match_results_output = combine_std_df_remove_dups(OrigMatchClass.match_results_output, NewMatchClass.match_results_output, orig_addr_col = NewMatchClass.search_df_key_field)
1210
+
1211
+ NewMatchClass.results_on_orig_df = combine_std_df_remove_dups(OrigMatchClass.pre_filter_search_df, NewMatchClass.results_on_orig_df, orig_addr_col = NewMatchClass.search_df_key_field, match_address_series = 'Matched with reference address')
1212
+
1213
+
1214
+ # Filter out search results where a match was found
1215
+ NewMatchClass.pre_filter_search_df = NewMatchClass.results_on_orig_df
1216
+
1217
+ found_index = NewMatchClass.results_on_orig_df.loc[NewMatchClass.results_on_orig_df["Matched with reference address"] == True, NewMatchClass.search_df_key_field].astype(int)
1218
+ #print(found_index)[NewMatchClass.search_df_key_field]
1219
+
1220
+ key_field_values = NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].astype(int) # Assuming list conversion is suitable
1221
+ rows_to_drop = key_field_values[key_field_values.isin(found_index)].tolist()
1222
+ NewMatchClass.search_df_not_matched = NewMatchClass.search_df_not_matched.loc[~NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].isin(rows_to_drop),:]#.drop(rows_to_drop, axis = 0)
1223
+
1224
+ # Filter out rows from NewMatchClass.search_df_cleaned
1225
+
1226
+ filtered_rows_to_keep = NewMatchClass.search_df_cleaned[NewMatchClass.search_df_key_field].astype(int).isin(NewMatchClass.search_df_not_matched[NewMatchClass.search_df_key_field].astype(int)).to_list()
1227
+
1228
+ NewMatchClass.search_df_cleaned = NewMatchClass.search_df_cleaned.loc[filtered_rows_to_keep,:]#.drop(rows_to_drop, axis = 0)
1229
+ NewMatchClass.search_df_after_stand = NewMatchClass.search_df_after_stand.loc[filtered_rows_to_keep,:]#.drop(rows_to_drop)
1230
+ NewMatchClass.search_df_after_full_stand = NewMatchClass.search_df_after_full_stand.loc[filtered_rows_to_keep,:]#.drop(rows_to_drop)
1231
+
1232
+ ### Create lookup lists
1233
+ NewMatchClass.search_df_after_stand_series = NewMatchClass.search_df_after_stand.copy().set_index('postcode_search')['search_address_stand'].str.lower().str.strip()
1234
+ NewMatchClass.search_df_after_stand_series_full_stand = NewMatchClass.search_df_after_full_stand.copy().set_index('postcode_search')['search_address_stand'].str.lower().str.strip()
1235
+
1236
+
1237
+ match_results_output_match_score_is_0 = NewMatchClass.match_results_output[NewMatchClass.match_results_output['fuzzy_score']==0.0][["index", "fuzzy_score"]].drop_duplicates(subset='index')
1238
+ match_results_output_match_score_is_0["index"] = match_results_output_match_score_is_0["index"].astype(str)
1239
+ #NewMatchClass.results_on_orig_df["index"] = NewMatchClass.results_on_orig_df["index"].astype(str)
1240
+ NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.merge(match_results_output_match_score_is_0, on = "index", how = "left")
1241
+
1242
+ NewMatchClass.results_on_orig_df.loc[NewMatchClass.results_on_orig_df["fuzzy_score"] == 0.0, "Excluded from search"] = "Match score is 0"
1243
+ NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.drop("fuzzy_score", axis = 1)
1244
+
1245
+ # Drop any duplicates, prioritise any matches
1246
+ NewMatchClass.results_on_orig_df = NewMatchClass.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
1247
+
1248
+ NewMatchClass.output_summary = create_match_summary(NewMatchClass.match_results_output, df_name = df_name)
1249
+ print(NewMatchClass.output_summary)
1250
+
1251
+
1252
+ NewMatchClass.search_df_not_matched = filter_not_matched(NewMatchClass.match_results_output, NewMatchClass.search_df, NewMatchClass.search_df_key_field)
1253
+
1254
+ ### Rejoin the excluded matches onto the output file
1255
+ #NewMatchClass.results_on_orig_df = pd.concat([NewMatchClass.results_on_orig_df, NewMatchClass.excluded_df])
1256
+
1257
+ NewMatchClass.match_outputs_name = "match_results_output_std_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
1258
+ NewMatchClass.results_orig_df_name = "results_on_orig_df_std_" + today_rev + ".csv" # + NewMatchClass.file_name + "_"
1259
+
1260
+ # Only keep essential columns
1261
+ essential_results_cols = [NewMatchClass.search_df_key_field, "Excluded from search", "Matched with reference address", "ref_index", "Reference matched address", "Reference file"]
1262
+ essential_results_cols.extend(NewMatchClass.new_join_col)
1263
+
1264
+ NewMatchClass.match_results_output.to_csv(NewMatchClass.match_outputs_name, index = None)
1265
+ NewMatchClass.results_on_orig_df[essential_results_cols].to_csv(NewMatchClass.results_orig_df_name, index = None)
1266
+
1267
+ return NewMatchClass
1268
+
1269
+ def create_match_summary(match_results_output:PandasDataFrame, df_name:str):
1270
+
1271
+ # Check if match_results_output is a dictionary-like object and has the key 'full_match'
1272
+
1273
+ if not isinstance(match_results_output, pd.DataFrame) or 'full_match' not in match_results_output.columns or match_results_output.empty:
1274
+ "Nothing in match_results_output"
1275
+ full_match_count = 0
1276
+ match_fail_count = 0
1277
+ records_attempted = 0
1278
+ dataset_length = 0
1279
+ records_not_attempted = 0
1280
+ match_rate = 0
1281
+ match_fail_count_without_excluded = 0
1282
+ match_fail_rate = 0
1283
+ not_attempted_rate = 0
1284
+
1285
+ ''' Create a summary paragraph '''
1286
+ full_match_count = match_results_output['full_match'][match_results_output['full_match'] == True].count()
1287
+ match_fail_count = match_results_output['full_match'][match_results_output['full_match'] == False].count()
1288
+ records_attempted = int(sum((match_results_output['fuzzy_score']!=0.0) & ~(match_results_output['fuzzy_score'].isna())))
1289
+ dataset_length = len(match_results_output["full_match"])
1290
+ records_not_attempted = int(dataset_length - records_attempted)
1291
+ match_rate = str(round((full_match_count / dataset_length) * 100,1))
1292
+ match_fail_count_without_excluded = match_fail_count - records_not_attempted
1293
+ match_fail_rate = str(round(((match_fail_count_without_excluded) / dataset_length) * 100,1))
1294
+ not_attempted_rate = str(round((records_not_attempted / dataset_length) * 100,1))
1295
+
1296
+ summary = ("For the " + df_name + " dataset (" + str(dataset_length) + " records), the fuzzy matching algorithm successfully matched " + str(full_match_count) +
1297
+ " records (" + match_rate + "%). The algorithm could not attempt to match " + str(records_not_attempted) +
1298
+ " records (" + not_attempted_rate + "%). There are " + str(match_fail_count_without_excluded) + " records left to potentially match.")
1299
+
1300
+ return summary
tools/model_predict.py ADDED
@@ -0,0 +1,318 @@
1
+ #import tensorflow as tf # Tensorflow use deprecated
2
+ import torch
3
+ import pandas as pd
4
+ import numpy as np
5
+ from typing import Type, Dict, List, Tuple
6
+ from datetime import datetime
7
+
8
+ PandasDataFrame = Type[pd.DataFrame]
9
+ PandasSeries = Type[pd.Series]
10
+ MatchedResults = Dict[str,Tuple[str,int]]
11
+ array = List[str]
12
+
13
+ today = datetime.now().strftime("%d%m%Y")
14
+ today_rev = datetime.now().strftime("%Y%m%d")
15
+
16
+ # # Neural net functions
17
+
18
+
19
+
20
+
21
+
22
+ def vocab_lookup(characters: str, vocab) -> Tuple[int, np.ndarray]:
23
+ """
24
+ Adapted from a function in the addressnet package by Jason Rigby
25
+
26
+ Converts a string into a list of vocab indices
27
+ :param characters: the string to convert
28
+ :param vocab: the vocabulary (list of known characters) used to index the string
29
+ :return: the string length and an array of vocab indices
30
+ """
31
+ result = list()
32
+ for c in characters.lower():
33
+ try:
34
+ result.append(vocab.index(c) + 1)
35
+ except ValueError:
36
+ result.append(0)
37
+ return len(characters), np.array(result, dtype=np.int64)
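+ # Assumed usage, for illustration: with vocab = list("abc "), vocab_lookup("cab", vocab)
+ # returns (3, array([3, 1, 2])); characters not in the vocab are encoded as 0.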
38
+
39
+
40
+ # ## Neural net predictor functions
41
+
42
+ def text_to_model_input_local(in_text, vocab, model_type = "estimator"):
43
+ addresses_out = []
44
+ model_input_out = []
45
+ encoded_text = []
46
+
47
+ # Calculate longest string length
48
+ import heapq
49
+
50
+ # get the index of the largest element in the list
51
+ index = heapq.nlargest(1, range(len(in_text)), key=lambda x: len(in_text[x]))[0]
52
+
53
+ # use the index to get the corresponding string
54
+ longest_string = len(in_text[index])
55
+
56
+ #print("Longest string is: " + str(longest_string))
57
+
58
+ for x in range(0, len(in_text)):
59
+
60
+ out = vocab_lookup(in_text[x], vocab)
61
+ addresses_out.append(out)
62
+
63
+ #print(out)
64
+
65
+ # Tensorflow model use deprecated
66
+ # if model_type == "estimator":
67
+ # model_input_add= tf.train.Example(features=tf.train.Features(feature={
68
+ # 'lengths': tf.train.Feature(int64_list=tf.train.Int64List(value=[out[0]])),
69
+ # 'encoded_text': tf.train.Feature(int64_list=tf.train.Int64List(value=out[1].tolist()))
70
+ # })).SerializeToString()
71
+
72
+ # model_input_out.append(model_input_add)
73
+
74
+ if model_type == "keras":
75
+ encoded_text.append(out[1])
76
+
77
+ # Tensorflow model use deprecated
78
+ # if model_type == "keras":
79
+ # # Pad out the strings so they're all the same length. 69 seems to be the value for spaces
80
+ # model_input_out = tf.keras.utils.pad_sequences(encoded_text, maxlen=longest_string, padding="post", truncating="post", value=0)#69)
81
+
82
+
83
+ return addresses_out, model_input_out
84
+
85
+
86
+ def reformat_predictions_local(predict_out):
87
+
88
+ predictions_list_reformat = []
89
+
90
+ for x in range(0,len(predict_out['pred_output_classes'])):
91
+
92
+ new_entry = {'class_ids': predict_out['pred_output_classes'][x], 'probabilities': predict_out['probabilities'][x]}
93
+ predictions_list_reformat.append(new_entry)
94
+
95
+ return predictions_list_reformat
96
+
97
+
98
+ def predict_serve_conv_local(in_text:List[str], labels_list, predictions) -> List[Dict[str, str]]:
99
+
100
+ class_names = [l.replace("_code", "") for l in labels_list]
101
+ class_names = [l.replace("_abbreviation", "") for l in class_names]
102
+
103
+ #print(input_text)
104
+
105
+ #print(list(zip(input_text, predictions)))
106
+
107
+ for addr, res in zip(in_text, predictions):
108
+
109
+ #print(zip(input_text, predictions))
110
+
111
+ mappings = dict()
112
+
113
+
114
+ #print(addr.upper())
115
+ #print(res['class_ids'])
116
+
117
+ for char, class_id in zip(addr.upper(), res['class_ids']):
118
+ #print(char)
119
+ if class_id == 0:
120
+ continue
121
+ cls = class_names[class_id - 1]
122
+ mappings[cls] = mappings.get(cls, "") + char
123
+
124
+
125
+ #print(mappings)
126
+ yield mappings
127
+ #return mappings
128
+
129
+
130
+ def prep_predict_export(prediction_outputs, in_text):
131
+
132
+ out_list = list(prediction_outputs)
133
+
134
+ df_out = pd.DataFrame(out_list)
135
+
136
+ #print(in_text)
137
+ #print(df_out)
138
+
139
+ df_out["address"] = in_text
140
+
141
+ return out_list, df_out
142
+
143
+
144
+
145
+ def full_predict_func(list_to_predict, model, vocab, labels_list):
146
+
147
+ if hasattr(model, "summary"): # Indicates this is a keras model rather than an estimator
148
+ model_type = "keras"
149
+ else: model_type = "estimator"
150
+
151
+ list_to_predict = [x.upper() for x in list_to_predict]
152
+
153
+ addresses_out, model_input = text_to_model_input_local(list_to_predict, vocab, model_type)
154
+
155
+ if hasattr(model, "summary"):
156
+ probs = model.predict(model_input, use_multiprocessing=True)
157
+
158
+ classes = probs.argmax(axis=-1)
159
+
160
+ predictions = {'pred_output_classes':classes, 'probabilities':probs}
161
+
162
+ else:
163
+ print("Tensorflow use deprecated")
164
+ #predictions = model.signatures["predict_output"](predictor_inputs=tf.constant(model_input)) # This was for when using the contrib module
165
+ #predictions = model.signatures["serving_default"](predictor_inputs=tf.constant(model_input))
166
+
167
+ predictions_list_reformat = reformat_predictions_local(predictions)
168
+
169
+
170
+ #### Final output as list or dataframe
171
+
172
+ output = predict_serve_conv_local(list(list_to_predict), labels_list, predictions_list_reformat)
173
+
174
+ list_out, predict_df = prep_predict_export(output, list_to_predict)
175
+
176
+ # Add organisation as a column if it doesn't already exist
177
+ if 'Organisation' not in predict_df.columns:
178
+ predict_df['Organisation'] = ""
179
+
180
+ return list_out, predict_df
181
+
182
+ # -
183
+
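# --- Illustrative usage sketch (not part of this commit) ---
# A hedged example of how full_predict_func might be driven. `model`, `vocab` and
# `labels_list` are assumed to be loaded elsewhere in the app (placeholders here, not
# real artefact names), and the address string is invented.
def example_full_predict(model, vocab, labels_list):
    addresses = ["FLAT 2, 10 EXAMPLE ROAD, LONDON SE1 1AA"]
    list_out, predict_df = full_predict_func(addresses, model, vocab, labels_list)
    # predict_df has one row per input address, one column per predicted label class,
    # plus the original "address" column added by prep_predict_export
    return predict_df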
184
+ def predict_torch(model, model_type, input_text, word_to_index, device):
185
+ #print(device)
186
+
187
+ # Convert input_text to tensor of character indices
188
+ indexed_texts = [[word_to_index.get(char, word_to_index['<UNK>']) for char in text] for text in input_text]
189
+
190
+ # Calculate max_len based on indexed_texts
191
+ max_len = max(len(text) for text in indexed_texts)
192
+
193
+ # Pad sequences and convert to tensor
194
+ padded_texts = torch.tensor([text + [word_to_index['<pad>']] * (max_len - len(text)) for text in indexed_texts])
195
+
196
+ with torch.no_grad():
197
+ texts = padded_texts.to(device)
198
+
199
+ if (model_type == "lstm") | (model_type == "gru"):
200
+ text_lengths = texts.ne(word_to_index['<pad>']).sum(dim=1)
201
+ predictions = model(texts, text_lengths)
202
+
203
+ if model_type == "transformer":
204
+ # Call model with texts and pad_idx
205
+ predictions = model(texts, word_to_index['<pad>'])
206
+
207
+ # Convert predictions to most likely category indices
208
+ _, predicted_indices = predictions.max(2)
209
+ return predicted_indices
210
+
211
+
212
+ def torch_predictions_to_dicts(input_text, predicted_indices, index_to_category):
213
+ results = []
214
+ for i, text in enumerate(input_text):
215
+ # Treat each character in the input text as a "token"
216
+ tokens = list(text) # Convert string to a list of characters
217
+
218
+ # Create a dictionary for the current text
219
+ curr_dict = {}
220
+
221
+ # Iterate over the predicted categories and the tokens together
222
+ for category_index, token in zip(predicted_indices[i], tokens):
223
+ # Convert the category index to its name
224
+ category_name = index_to_category[category_index.item()]
225
+
226
+ # Append the token to the category in the dictionary (or create the category if it doesn't exist)
227
+ if category_name in curr_dict:
228
+ curr_dict[category_name] += token # No space needed between characters
229
+ else:
230
+ curr_dict[category_name] = token
231
+
232
+ results.append(curr_dict)
233
+
234
+ return results
235
+
236
+
237
+ def torch_prep_predict_export(prediction_outputs, in_text):
238
+
239
+ #out_list = list(prediction_outputs)
240
+
241
+ df_out = pd.DataFrame(prediction_outputs).drop("IGNORE", axis = 1)
242
+
243
+ #print(in_text)
244
+ #print(df_out)
245
+
246
+ df_out["address"] = in_text
247
+
248
+ return df_out
249
+
250
+
251
+ def full_predict_torch(model, model_type, input_text, word_to_index, cat_to_idx, device):
252
+
253
+ input_text = [x.upper() for x in input_text]
254
+
255
+ predicted_indices = predict_torch(model, model_type, input_text, word_to_index, device)
256
+
257
+ index_to_category = {v: k for k, v in cat_to_idx.items()}
258
+
259
+ results_dict = torch_predictions_to_dicts(input_text, predicted_indices, index_to_category)
260
+
261
+ df_out = torch_prep_predict_export(results_dict, input_text)
262
+
263
+ return results_dict, df_out
264
+
265
+
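# --- Illustrative usage sketch (not part of this commit) ---
# A hedged example of the PyTorch prediction path. `model`, `word_to_index` (which must
# contain '<pad>' and '<UNK>' keys) and `cat_to_idx` are assumed to be loaded from the
# saved model artefacts elsewhere; the names and the address below are placeholders.
def example_full_predict_torch(model, word_to_index, cat_to_idx):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    addresses = ["flat 2, 10 example road, london se1 1aa"]
    results_dict, df_out = full_predict_torch(model, "gru", addresses, word_to_index, cat_to_idx, device)
    # results_dict is a list of {category: text} dicts; df_out is the same data as a DataFrame
    return df_out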
266
+ def post_predict_clean(predict_df, orig_search_df, ref_address_cols, search_df_key_field):
267
+
268
+
269
+ # Add address to ref_address_cols
270
+ ref_address_cols_add = ref_address_cols.copy()
271
+ ref_address_cols_add.extend(['address'])
272
+
273
+ # Create column if it doesn't exist
274
+ for x in ref_address_cols:
275
+
276
+ predict_df[x] = predict_df.get(x, np.nan)
277
+
278
+ predict_df = predict_df[ref_address_cols_add]
279
+
280
+ #Columns that are in the ref and model, but are not matched in this instance, need to be filled in with blanks
281
+
282
+ predict_cols_match = list(predict_df.drop(["address"],axis=1).columns)
283
+ predict_cols_match_uprn = predict_cols_match.copy()
284
+ predict_cols_match_uprn.append("UPRN")
285
+
286
+ pred_output_missing_cols = list(set(ref_address_cols) - set(predict_cols_match))
287
+ predict_df[pred_output_missing_cols] = np.nan
288
+ predict_df = predict_df.fillna("").infer_objects(copy=False)
289
+
290
+ #Convert all columns to string
291
+
292
+ all_columns = list(predict_df) # Creates list of all column headers
293
+ predict_df[all_columns] = predict_df[all_columns].astype(str)
294
+
295
+ predict_df = predict_df.replace("\.0","",regex=True)
296
+
297
+ #When comparing with ref, the postcode existing in the data will be used to compare rather than the postcode predicted by the model. This is to minimise errors in matching
298
+
299
+ predict_df = predict_df.rename(columns={"Postcode":"Postcode_predict"})
300
+
301
+ #orig_search_df.to_csv("orig_search_df_pre_predict.csv")
302
+
303
+ orig_search_df_pc = orig_search_df[[search_df_key_field, "postcode"]].rename(columns={"postcode":"Postcode"}).reset_index(drop=True)
304
+ predict_df = predict_df.merge(orig_search_df_pc, left_index=True, right_index=True, how = "left")
305
+
306
+ #predict_df = pd.concat([predict_df, orig_search_df_pc], axis = 1)
307
+
308
+ #predict_df[search_df_key_field] = orig_search_df[search_df_key_field]
309
+
310
+ #predict_df = predict_df.drop("index", axis=1)
311
+
312
+ #predict_df['index'] = predict_df.index
313
+ predict_df[search_df_key_field] = predict_df[search_df_key_field].astype(str)
314
+
315
+ #predict_df.to_csv("predict_end_of_clean.csv")
316
+
317
+ return predict_df
318
+
tools/preparation.py ADDED
@@ -0,0 +1,456 @@
1
+ import pandas as pd
2
+ from typing import Type, Dict, List, Tuple
3
+ from datetime import datetime
4
+ #import polars as pl
5
+ import re
6
+
7
+ PandasDataFrame = Type[pd.DataFrame]
8
+ PandasSeries = Type[pd.Series]
9
+ MatchedResults = Dict[str,Tuple[str,int]]
10
+ array = List[str]
11
+
12
+ today = datetime.now().strftime("%d%m%Y")
13
+ today_rev = datetime.now().strftime("%Y%m%d")
14
+
15
+
16
+ def prepare_search_address_string(
17
+ search_str: str
18
+ ) -> Tuple[pd.DataFrame, str, List[str], List[str]]:
19
+ """Extracts address and postcode from search_str into new DataFrame"""
20
+
21
+ # Validate input
22
+ if not isinstance(search_str, str):
23
+ raise TypeError("search_str must be a string")
24
+
25
+ search_df = pd.DataFrame(data={"full_address":[search_str]})
26
+
27
+ #print(search_df)
28
+
29
+ # Extract postcode
30
+ postcode_series = extract_postcode(search_df, "full_address").dropna(axis=1)[0]
31
+
32
+ # Remove postcode from address
33
+ address_series = remove_postcode(search_df, "full_address")
34
+
35
+ # Construct output DataFrame
36
+ search_df_out = pd.DataFrame()
37
+ search_df_out["full_address"] = address_series
38
+ search_df_out["postcode"] = postcode_series
39
+
40
+ # Set key field for joining
41
+ key_field = "index"
42
+
43
+ # Reset index to use as key field
44
+ search_df_out = search_df_out.reset_index()
45
+
46
+ # Define column names to return
47
+ address_cols = ["full_address"]
48
+ postcode_col = ["postcode"]
49
+
50
+ return search_df_out, key_field, address_cols, postcode_col
51
+
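# --- Illustrative usage sketch (not part of this commit) ---
# A hedged example of the single-string path above; the address is invented.
def example_prepare_search_address_string():
    search_df_out, key_field, address_cols, postcode_col = prepare_search_address_string(
        "Flat 2, 10 Example Road, London SE1 1AA")
    # search_df_out has 'index', 'full_address' (postcode stripped) and 'postcode' columns
    return search_df_out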
52
+ # def prepare_search_address(
53
+ # search_df: pd.DataFrame,
54
+ # address_cols: list,
55
+ # postcode_col: list,
56
+ # key_col: str
57
+ # ) -> Tuple[pd.DataFrame, str]:
58
+
59
+ # # Validate inputs
60
+ # if not isinstance(search_df, pd.DataFrame):
61
+ # raise TypeError("search_df must be a Pandas DataFrame")
62
+
63
+ # if not isinstance(address_cols, list):
64
+ # raise TypeError("address_cols must be a list")
65
+
66
+ # if not isinstance(postcode_col, list):
67
+ # raise TypeError("postcode_col must be a list")
68
+
69
+ # if not isinstance(key_col, str):
70
+ # raise TypeError("key_col must be a string")
71
+
72
+ # # Clean address columns
73
+ # clean_addresses = _clean_columns(search_df, address_cols)
74
+
75
+ # # Join address columns into one
76
+ # full_addresses = _join_address(clean_addresses, address_cols)
77
+
78
+ # # Add postcode column
79
+ # full_df = _add_postcode_column(full_addresses, postcode_col)
80
+
81
+ # # Remove postcode from main address if there was only one column in the input
82
+ # if postcode_col == "full_address_postcode":
83
+ # # Remove postcode from address
84
+ # address_series = remove_postcode(search_df, "full_address")
85
+ # search_df["full_address"] == address_series
86
+
87
+ # # Ensure index column
88
+ # final_df = _ensure_index(full_df, key_col)
89
+
90
+ # #print(final_df)
91
+
92
+
93
+ # return final_df, key_col
94
+
95
+ def prepare_search_address(
96
+ search_df: pd.DataFrame,
97
+ address_cols: list,
98
+ postcode_col: list,
99
+ key_col: str
100
+ ) -> Tuple[pd.DataFrame, str]:
101
+
102
+ # Validate inputs
103
+ if not isinstance(search_df, pd.DataFrame):
104
+ raise TypeError("search_df must be a Pandas DataFrame")
105
+
106
+ if not isinstance(address_cols, list):
107
+ raise TypeError("address_cols must be a list")
108
+
109
+ if not isinstance(postcode_col, list):
110
+ raise TypeError("postcode_col must be a list")
111
+
112
+ if not isinstance(key_col, str):
113
+ raise TypeError("key_col must be a string")
114
+
115
+ # Clean address columns
116
+ #search_df_polars = pl.from_dataframe(search_df)
117
+ clean_addresses = _clean_columns(search_df, address_cols)
118
+
119
+ # Join address columns into one
120
+ full_addresses = _join_address(clean_addresses, address_cols)
121
+
122
+ # Add postcode column
123
+ full_df = _add_postcode_column(full_addresses, postcode_col)
124
+
125
+ # Remove postcode from main address if there was only one column in the input
126
+ if postcode_col[0] == "full_address_postcode":
127
+ # Remove postcode from address
128
+ address_series = remove_postcode(search_df, "full_address")
129
+ search_df["full_address"] == address_series
130
+
131
+ # Ensure index column
132
+ final_df = _ensure_index(full_df, key_col)
133
+
134
+ #print(final_df)
135
+
136
+
137
+ return final_df
138
+
139
+ # Helper functions
140
+ def _clean_columns(df, cols):
141
+ # Cleaning logic
142
+ def clean_col(col):
143
+ return col.astype(str).fillna("").infer_objects(copy=False).str.replace("nan","").str.replace("\s{2,}", " ", regex=True).str.replace(","," ").str.strip()
144
+
145
+ df[cols] = df[cols].apply(clean_col)
146
+
147
+ return df
148
+
149
+ # def _clean_columns(df, cols):
150
+ # # Cleaning logic
151
+ # #print(df)
152
+
153
+ # #if isinstance(df, pl.DataFrame):
154
+ # # print("It's a Polars DataFrame")
155
+
156
+ # def clean_col(col):
157
+ # col = col.str.replace("nan", "")
158
+ # col = col.apply(lambda x: re.sub(r'\s{2,}', ' ', str(x)), skip_nulls=False, return_dtype=str) # replace any spaces greater than one with one
159
+ # return col.str.replace(",", " ").str.strip() # replace commas with a space
160
+
161
+ # for col in cols:
162
+ # df = df.with_columns(clean_col(df[col]).alias(col))
163
+
164
+ # return df
165
+
166
+
167
+ def _join_address(df, cols):
168
+ # Joining logic
169
+ full_address = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
170
+ df["full_address"] = full_address.str.replace("\s{2,}", " ", regex=True).str.strip()
171
+
172
+ return df
173
+
174
+ def _add_postcode_column(df, postcodes):
175
+ # Add postcode column
176
+ if isinstance(postcodes, list):
177
+ postcodes = postcodes[0]
178
+
179
+ if postcodes != "full_address_postcode":
180
+ df = df.rename(columns={postcodes:"postcode"})
181
+ else:
182
+ #print(df["full_address_postcode"])
183
+ #print(extract_postcode(df,"full_address_postcode"))
184
+ df["full_address_postcode"] = extract_postcode(df,"full_address_postcode")[0] #
185
+ df = df.rename(columns={postcodes:"postcode"})
186
+ #print(df)
187
+
188
+ return df
189
+
190
+ def _ensure_index(df, index_col):
191
+ # Ensure index column exists
192
+ if (index_col == "index") and ("index" not in df.columns):
193
+ print("Resetting index in _ensure_index function")
194
+ df = df.reset_index()
195
+
196
+ df[index_col] = df[index_col].astype(str)
197
+
198
+ return df
199
+
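# --- Illustrative usage sketch (not part of this commit) ---
# A hedged example of preparing a multi-column search file with prepare_search_address and
# the helpers above. The column names and addresses are invented for illustration only.
def example_prepare_search_address():
    search_df = pd.DataFrame({
        "line1": ["Flat 2", "10a"],
        "line2": ["10 Example Road", "Sample Street"],
        "pc": ["SE1 1AA", "SW2 2BB"],
    })
    final_df = prepare_search_address(search_df, ["line1", "line2"], ["pc"], "index")
    # final_df gains a joined 'full_address' column, a 'postcode' column (renamed from 'pc')
    # and a string 'index' key column
    return final_df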
200
+ def create_full_address(df):
201
+
202
+ df = df.fillna("").infer_objects(copy=False)
203
+
204
+ if "Organisation" not in df.columns:
205
+ df["Organisation"] = ""
206
+
207
+ df["full_address"] = df['Organisation'] + " " + df['SaoText'].str.replace(" - ", " REPL ").str.replace("- ", " REPLEFT ").str.replace(" -", " REPLRIGHT ") + " " + df["SaoStartNumber"].astype(str) + df["SaoStartSuffix"] + "-" + df["SaoEndNumber"].astype(str) + df["SaoEndSuffix"] + " " + df["PaoText"].str.replace(" - ", " REPL ").str.replace("- ", " REPLEFT ").str.replace(" -", " REPLRIGHT ") + " " + df["PaoStartNumber"].astype(str) + df["PaoStartSuffix"] + "-" + df["PaoEndNumber"].astype(str) + df["PaoEndSuffix"] + " " + df["Street"] + " " + df["PostTown"] + " " + df["Postcode"]
208
+
209
+ #.str.replace(r'(?<=[a-zA-Z])-(?![a-zA-Z])|(?<![a-zA-Z])-(?=[a-zA-Z])', ' ', regex=True)\
210
+
211
+ #.str.replace(".0","", regex=False)\
212
+
213
+ df["full_address"] = df["full_address"]\
214
+ .str.replace("-999","")\
215
+ .str.replace(" -"," ")\
216
+ .str.replace("- "," ")\
217
+ .str.replace(" REPL "," - ")\
218
+ .str.replace(" REPLEFT ","- ")\
219
+ .str.replace(" REPLRIGHT "," -")\
220
+ .str.replace("\s+"," ", regex=True)\
221
+ .str.strip()
222
+ #.str.replace(" "," ")\
223
+
224
+ return df["full_address"]
225
+
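# --- Illustrative usage sketch (not part of this commit) ---
# A hedged single-row example of the LPI-style columns create_full_address expects; the
# values are invented. The exact output depends on the clean-up replacements above, but for
# this row it is roughly "2 10 EXAMPLE ROAD LONDON SE1 1AA".
def example_create_full_address():
    ref_row = pd.DataFrame([{
        "Organisation": "", "SaoText": "", "SaoStartNumber": "2", "SaoStartSuffix": "",
        "SaoEndNumber": "", "SaoEndSuffix": "", "PaoText": "", "PaoStartNumber": "10",
        "PaoStartSuffix": "", "PaoEndNumber": "", "PaoEndSuffix": "",
        "Street": "EXAMPLE ROAD", "PostTown": "LONDON", "Postcode": "SE1 1AA",
    }])
    return create_full_address(ref_row)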
226
+ def prepare_ref_address(ref_df, ref_address_cols, new_join_col = ['UPRN'], standard_cols = True):
227
+
228
+ if ('SaoText' in ref_df.columns) | ("Secondary_Name_LPI" in ref_df.columns): standard_cols = True
229
+ else: standard_cols = False
230
+
231
+ ref_address_cols_uprn = ref_address_cols.copy()
232
+
233
+ ref_address_cols_uprn.extend(new_join_col)
234
+ ref_address_cols_uprn_w_ref = ref_address_cols_uprn.copy()
235
+ ref_address_cols_uprn_w_ref.extend(["Reference file"])
236
+
237
+ ref_df_cleaned = ref_df.copy()
238
+
239
+ # In on-prem LPI db street has been excluded, so put this back in
240
+ if ('Street' not in ref_df_cleaned.columns) & ('Address_LPI' in ref_df_cleaned.columns):
241
+ ref_df_cleaned['Street'] = ref_df_cleaned['Address_LPI'].str.replace("\\n", " ", regex = True).apply(extract_street_name)#
242
+
243
+ if ('Organisation' not in ref_df_cleaned.columns) & ('SaoText' in ref_df_cleaned.columns):
244
+ ref_df_cleaned['Organisation'] = ""
245
+
246
+ ref_df_cleaned = ref_df_cleaned[ref_address_cols_uprn_w_ref]
247
+
248
+ ref_df_cleaned = ref_df_cleaned.fillna("").infer_objects(copy=False)
249
+
250
+ all_columns = list(ref_df_cleaned) # Creates list of all column headers
251
+ ref_df_cleaned[all_columns] = ref_df_cleaned[all_columns].astype(str).fillna('').infer_objects(copy=False).replace('nan','')
252
+
253
+ ref_df_cleaned = ref_df_cleaned.replace("\.0","",regex=True)
254
+
255
+ # Create full address
256
+
257
+ all_columns = list(ref_df_cleaned) # Creates list of all column headers
258
+ ref_df_cleaned[all_columns] = ref_df_cleaned[all_columns].astype(str)
259
+
260
+ ref_df_cleaned = ref_df_cleaned.replace("nan","")
261
+ ref_df_cleaned = ref_df_cleaned.replace("\.0","",regex=True)
262
+
263
+ if standard_cols == True:
264
+ ref_df_cleaned= ref_df_cleaned[ref_address_cols_uprn_w_ref].fillna('').infer_objects(copy=False)
265
+
266
+ ref_df_cleaned["fulladdress"] = create_full_address(ref_df_cleaned[ref_address_cols_uprn_w_ref])
267
+
268
+ else:
269
+ ref_df_cleaned= ref_df_cleaned[ref_address_cols_uprn_w_ref].fillna('').infer_objects(copy=False)
270
+
271
+ full_address = ref_df_cleaned[ref_address_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
272
+ ref_df_cleaned["fulladdress"] = full_address
273
+
274
+ ref_df_cleaned["fulladdress"] = ref_df_cleaned["fulladdress"]\
275
+ .str.replace("-999","")\
276
+ .str.replace(" -"," ")\
277
+ .str.replace("- "," ")\
278
+ .str.replace(".0","", regex=False)\
279
+ .str.replace("\s{2,}", " ", regex=True)\
280
+ .str.strip()
281
+
282
+ # Create a street column if it doesn't exist by extracting street from the full address
283
+
284
+ if 'Street' not in ref_df_cleaned.columns:
285
+ ref_df_cleaned['Street'] = ref_df_cleaned["fulladdress"].apply(extract_street_name)
286
+
287
+ # Add index column
288
+ ref_df_cleaned['ref_index'] = ref_df_cleaned.index
289
+
290
+ return ref_df_cleaned
291
+
292
+ # def prepare_ref_address(ref_df:pl.DataFrame, ref_address_cols, new_join_col = ['UPRN'], standard_cols = True):
293
+
294
+ # if ('SaoText' in ref_df.columns) | ("Secondary_Name_LPI" in ref_df.columns):
295
+ # standard_cols = True
296
+ # else:
297
+ # standard_cols = False
298
+
299
+ # ref_address_cols_uprn = list(ref_address_cols) + new_join_col
300
+ # ref_df_cleaned = ref_df[ref_address_cols_uprn].fill_null("")
301
+
302
+ # # In on-prem LPI db street has been excluded, so put this back in
303
+ # if ('Street' not in ref_df_cleaned.columns) & ('Address_LPI' in ref_df_cleaned.columns):
304
+ # ref_df_cleaned = ref_df_cleaned.with_column(pl.col('Address_LPI').apply(lambda x: extract_street_name(x)).alias('Street'))
305
+
306
+ # if ('Organisation' not in ref_df_cleaned.columns) & ('SaoText' in ref_df_cleaned.columns):
307
+ # ref_df_cleaned = ref_df_cleaned.with_column(pl.lit("").alias('Organisation'))
308
+
309
+ # #ref_df_cleaned['fulladdress'] =
310
+
311
+ # if standard_cols:
312
+ # pass
313
+ # # I can not write the full address code here as it depends on your extract_street_name and create_full_address function implementations.
314
+ # # However, you might need to convert string types to object type for full address creation which may require more than just a few lines of codes.
315
+ # else:
316
+ # pass
317
+
318
+ # # I can not write the full address code here as it depends on your extract_street_name and create_full_address function implementations.
319
+
320
+ # if 'Street' not in ref_df_cleaned.columns:
321
+ # ref_df_cleaned = ref_df_cleaned.with_column(pl.col('fulladdress').apply(extract_street_name).alias("Street"))
322
+
323
+ # # Add index column
324
+ # ref_df_cleaned = ref_df_cleaned.with_column(pl.lit('').alias('ref_index'))
325
+
326
+ # return ref_df_cleaned
327
+
328
+
329
+ def extract_postcode(df, col:str) -> PandasSeries:
330
+ '''
331
+ Extract a postcode from a string column in a dataframe
332
+ '''
333
+ postcode_series = df[col].str.upper().str.extract(pat = \
334
+ "(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$)")
335
+
336
+ return postcode_series
337
+
338
+
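# --- Illustrative usage sketch (not part of this commit) ---
# A hedged example: str.extract above returns one column per capture group, so callers
# such as prepare_search_address_string drop the all-NaN groups and keep the first one.
def example_extract_postcode():
    df = pd.DataFrame({"full_address": ["10 Example Road London SE1 1AA"]})
    postcode_series = extract_postcode(df, "full_address").dropna(axis=1)[0]
    # postcode_series holds "SE1 1AA" for the row above
    return postcode_series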
339
+ # Remove addresses with no numbers in at all - too high a risk of badly assigning an address
340
+ def check_no_number_addresses(df, in_address_series) -> PandasDataFrame:
341
+ '''
342
+ Highlight addresses from a pandas df where there are no numbers in the address.
343
+ '''
344
+ df["in_address_series_temp"] = df[in_address_series].str.lower()
345
+
346
+ no_numbers_series = df["in_address_series_temp"].str.contains("^(?!.*\d+).*$", regex=True)
347
+
348
+ df.loc[no_numbers_series == True, 'Excluded from search'] = "Excluded - no numbers in address"
349
+
350
+ df = df.drop("in_address_series_temp", axis = 1)
351
+
352
+ #print(df[["full_address", "Excluded from search"]])
353
+
354
+ return df
355
+
356
+
357
+ def remove_postcode(df, col:str) -> PandasSeries:
358
+ '''
359
+ Remove a postcode from a string column in a dataframe
360
+ '''
361
+ address_series_no_pcode = df[col].str.upper().str.replace(\
362
+ "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","", regex=True).str.lower()
363
+
364
+ return address_series_no_pcode
365
+
366
+ def extract_street_name(address:str) -> str:
367
+ """
368
+ Extracts the street name from the given address.
369
+
370
+ Args:
371
+ address (str): The input address string.
372
+
373
+ Returns:
374
+ str: The extracted street name, or an empty string if no match is found.
375
+
376
+ Examples:
377
+ >>> address1 = "1 Ash Park Road SE54 3HB"
378
+ >>> extract_street_name(address1)
379
+ 'Ash Park Road'
380
+
381
+ >>> address2 = "Flat 14 1 Ash Park Road SE54 3HB"
382
+ >>> extract_street_name(address2)
383
+ 'Ash Park Road'
384
+
385
+ >>> address3 = "123 Main Blvd"
386
+ >>> extract_street_name(address3)
387
+ 'Main Blvd'
388
+
389
+ >>> address4 = "456 Maple AvEnUe"
390
+ >>> extract_street_name(address4)
391
+ 'Maple AvEnUe'
392
+
393
+ >>> address5 = "789 Oak Street"
394
+ >>> extract_street_name(address5)
395
+ 'Oak Street'
396
+ """
397
+
398
+
399
+ street_types = [
400
+ 'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
401
+ 'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
402
+ 'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
403
+ 'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
404
+ 'Alley', 'Arcade','Avenue', 'Ave','Bay','Bend','Brae','Byway','Close','Corner','Cove',
405
+ 'Crescent', 'Cres','Cul-de-sac','Dell','Drive', 'Dr','Esplanade','Glen','Green','Grove','Heights', 'Hts',
406
+ 'Mews','Parade','Path','Piazza','Promenade','Quay','Ridge','Row','Terrace', 'Ter','Track','Trail','View','Villas',
407
+ 'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
408
+ ]
409
+
410
+ # Dynamically construct the regex pattern with all possible street types
411
+ street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)
412
+
413
+ # The overall regex pattern to capture the street name
414
+ pattern = rf'(?:\d+\s+|\w+\s+\d+\s+|.*\d+[a-z]+\s+|.*\d+\s+)*(?P<street_name>[\w\s]+(?:{street_types_pattern}))'
415
+
416
+ def replace_postcode(address):
417
+ pattern = r'\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$'
418
+ return re.sub(pattern, "", address)
419
+
420
+
421
+ modified_address = replace_postcode(address.upper())
422
+ #print(modified_address)
423
+ #print(address)
424
+
425
+ # Perform a case-insensitive search
426
+ match = re.search(pattern, modified_address, re.IGNORECASE)
427
+
428
+ if match:
429
+ street_name = match.group('street_name')
430
+ return street_name.strip()
431
+ else:
432
+ return ""
433
+
434
+
435
+ # Exclude non-postal addresses
436
+
437
+ def remove_non_postal(df, in_address_series):
438
+ '''
439
+ Highlight non-postal addresses in a pandas df where a string series contains specific substrings
440
+ indicating non-postal addresses like 'garage', 'parking', 'shed', etc.
441
+ '''
442
+ df["in_address_series_temp"] = df[in_address_series].str.lower()
443
+
444
+ garage_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bgarage\\b|\\bgarages\\b)", regex=True)
445
+ parking_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bparking\\b)", regex=True)
446
+ shed_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bshed\\b|\\bsheds\\b)", regex=True)
447
+ bike_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbike\\b|\\bbikes\\b)", regex=True)
448
+ bicycle_store_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbicycle store\\b|\\bbicycle stores\\b)", regex=True)
449
+
450
+ non_postal_series = (garage_address_series | parking_address_series | shed_address_series | bike_address_series | bicycle_store_address_series)
451
+
452
+ df.loc[non_postal_series == True, 'Excluded from search'] = "Excluded - non-postal address"
453
+
454
+ df = df.drop("in_address_series_temp", axis = 1)
455
+
456
+ return df
tools/pytorch_models.py ADDED
@@ -0,0 +1,155 @@
1
+ import math
+ import torch
+ import torch.nn as nn
2
+
3
+ class TextClassifier(nn.Module):
4
+ def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
5
+ dropout, pad_idx):
6
+ super(TextClassifier, self).__init__()
7
+
8
+ # Embedding layer
9
+ self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
10
+
11
+ # GRU layers
12
+ self.rnn = nn.GRU(embedding_dim,
13
+ hidden_dim,
14
+ num_layers=n_layers,
15
+ bidirectional=True,
16
+ dropout=dropout,
17
+ batch_first=True)
18
+
19
+ # Fully connected layer
20
+ self.fc = nn.Linear(hidden_dim * 2, output_dim) # Multiply by 2 for bidirection
21
+
22
+ # Dropout layer
23
+ self.dropout = nn.Dropout(dropout)
24
+
25
+ def forward(self, text, text_lengths):
26
+ embedded = self.dropout(self.embedding(text))
27
+
28
+ # Pack sequence
29
+ packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
30
+ packed_output, _ = self.rnn(packed_embedded)
31
+
32
+ # Unpack sequence
33
+ output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
34
+
35
+ # Pass the entire output tensor to the FC layer for token-level classification
36
+ return self.fc(output)
37
+
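# --- Illustrative shape-check sketch (not part of this commit) ---
# A hedged example instantiating the bidirectional GRU tagger above with made-up
# hyperparameters, to show the expected input and output shapes.
def example_gru_shapes():
    model = TextClassifier(vocab_size=50, embedding_dim=16, hidden_dim=32,
                           output_dim=10, n_layers=2, dropout=0.1, pad_idx=0)
    texts = torch.randint(1, 50, (4, 30))               # (batch, seq_len) character indices
    lengths = torch.full((4,), 30, dtype=torch.int64)   # true (unpadded) length per sequence
    out = model(texts, lengths)
    # out has shape (batch, seq_len, output_dim): one class distribution per character
    return out.shape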
38
+ class LSTMTextClassifier(nn.Module):
39
+ def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
40
+ dropout, pad_idx):
41
+ super(LSTMTextClassifier, self).__init__()
42
+
43
+ # Embedding layer
44
+ self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
45
+
46
+ # LSTM layers
47
+ self.rnn = nn.LSTM(embedding_dim,
48
+ hidden_dim,
49
+ num_layers=n_layers,
50
+ bidirectional=True,
51
+ dropout=dropout,
52
+ batch_first=True)
53
+
54
+ # Fully connected layer
55
+ self.fc = nn.Linear(hidden_dim * 2, output_dim) # Multiply by 2 for bidirection
56
+
57
+ # Dropout layer
58
+ self.dropout = nn.Dropout(dropout)
59
+
60
+ def forward(self, text, text_lengths):
61
+ embedded = self.dropout(self.embedding(text))
62
+
63
+ # Pack sequence
64
+ packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
65
+
66
+ # Note: LSTM returns both the output and a tuple of (hidden state, cell state)
67
+ packed_output, (hidden, cell) = self.rnn(packed_embedded)
68
+
69
+ # Unpack sequence
70
+ output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
71
+
72
+ # Pass the entire output tensor to the FC layer for token-level classification
73
+ return self.fc(output)
74
+
75
+ class PositionalEncoding(nn.Module):
76
+ def __init__(self, d_model, max_len=120):
77
+ super(PositionalEncoding, self).__init__()
78
+ self.d_model = d_model
79
+
80
+ def forward(self, x):
81
+ # If pe doesn't exist or its sequence length is different from x's sequence length
82
+ if not hasattr(self, 'pe') or self.pe.size(0) != x.size(1):
83
+ max_len = x.size(1)
84
+ pe = torch.zeros(max_len, self.d_model)
85
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
86
+ div_term = torch.exp(torch.arange(0, self.d_model, 2).float() * (-math.log(10000.0) / self.d_model))
87
+ pe[:, 0::2] = torch.sin(position * div_term)
88
+ pe[:, 1::2] = torch.cos(position * div_term)
89
+ pe = pe.unsqueeze(0)
90
+ self.register_buffer('pe', pe.to(x.device))
91
+
92
+ return x + self.pe[:, :x.size(1), :]
93
+
94
+ import torch.nn as nn
95
+ import torch.nn.init as init
96
+
97
+ def weights_init_kaiming(m):
98
+ if isinstance(m, nn.Linear):
99
+ init.kaiming_uniform_(m.weight, nonlinearity='relu')
100
+ if m.bias is not None:
101
+ init.zeros_(m.bias)
102
+ elif isinstance(m, nn.Embedding):
103
+ init.kaiming_uniform_(m.weight, nonlinearity='relu')
104
+
105
+ class TransformerClassifier(nn.Module):
106
+ def __init__(self, vocab_size, embedding_dim, nhead, num_encoder_layers,
107
+ num_classes, dropout, pad_idx):
108
+ super(TransformerClassifier, self).__init__()
109
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
110
+
111
+
112
+
113
+ # Embedding layer
114
+ self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
115
+
116
+ # Positional encoding
117
+ self.pos_encoder = PositionalEncoding(embedding_dim)
118
+
119
+ # Transformer with dropout
120
+ transformer_encoder = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=nhead, dropout=dropout, activation="gelu")
121
+ self.transformer = nn.TransformerEncoder(transformer_encoder, num_layers=num_encoder_layers)
122
+
123
+ # Classifier with dropout
124
+ self.classifier = nn.Sequential(
125
+ nn.Dropout(dropout),
126
+ nn.Linear(embedding_dim, num_classes)
127
+ )
128
+
129
+ def create_attention_mask(self, src, pad_idx):
130
+ return (src == pad_idx)
131
+
132
+ def forward(self, src, pad_idx):
133
+
134
+ # Check pad_idx
135
+ if isinstance(pad_idx, torch.Tensor) and torch.numel(pad_idx) > 1:
136
+ raise ValueError("Expected pad_idx to be a scalar value, but got a tensor with multiple elements.")
137
+
138
+ # Transpose src to have shape (seq_len, batch_size)
139
+ src = src.transpose(0, 1)
140
+
141
+ # Embedding
142
+ x = self.embedding(src)
143
+
144
+ # Positional Encoding
145
+ x = self.pos_encoder(x.to(self.device))
146
+
147
+ # Create attention mask
148
+ src_key_padding_mask = self.create_attention_mask(src.transpose(0, 1), pad_idx) # Transpose back to (batch_size, sequence_length)
149
+
150
+ # Transformer
151
+ x = self.transformer(x, src_key_padding_mask=src_key_padding_mask)
152
+
153
+ #print(model.state_dict())
154
+ # Classification
155
+ return self.classifier(x)
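# --- Illustrative shape-check sketch (not part of this commit) ---
# A hedged example running the transformer tagger above with made-up hyperparameters.
# forward() moves activations to model.device itself, so the module and inputs are moved
# there first; embedding_dim must be divisible by nhead.
def example_transformer_shapes():
    model = TransformerClassifier(vocab_size=50, embedding_dim=16, nhead=4,
                                  num_encoder_layers=2, num_classes=10, dropout=0.1, pad_idx=0)
    model = model.to(model.device)
    src = torch.randint(1, 50, (4, 30)).to(model.device)  # (batch, seq_len) character indices
    out = model(src, 0)                                    # second argument is the pad index
    # the class dimension is last; predict_torch takes the argmax over dim 2
    return out.shape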
tools/recordlinkage_funcs.py ADDED
@@ -0,0 +1,384 @@
1
+ import pandas as pd
2
+ from typing import Type, Dict, List, Tuple
3
+ import recordlinkage
4
+ from datetime import datetime
5
+
6
+ PandasDataFrame = Type[pd.DataFrame]
7
+ PandasSeries = Type[pd.Series]
8
+ MatchedResults = Dict[str,Tuple[str,int]]
9
+ array = List[str]
10
+
11
+ today = datetime.now().strftime("%d%m%Y")
12
+ today_rev = datetime.now().strftime("%Y%m%d")
13
+
14
+ from tools.constants import score_cut_off_nnet_street
15
+
16
+ # ## Recordlinkage matching functions
17
+ def compute_match(predict_df_search, ref_search, orig_search_df, matching_variables,
18
+ text_columns, blocker_column, weights, fuzzy_method):
19
+ # Use the merge command to match group1 and group2
20
+ predict_df_search[matching_variables] = predict_df_search[matching_variables].astype(str)
21
+ ref_search[matching_variables] = ref_search[matching_variables].astype(str).replace("-999","")
22
+
23
+ # SaoText needs to be exactly the same to get a 'full' match. So I moved that to the exact match group
24
+ exact_columns = list(set(matching_variables) - set(text_columns))
25
+
26
+ # Replace all blanks with a space, so they can be included in the fuzzy match searches
27
+ for column in text_columns:
28
+ predict_df_search.loc[predict_df_search[column] == '', column] = ' '
29
+ ref_search.loc[ref_search[column] == '', column] = ' '
30
+
31
+ # Score based match functions
32
+
33
+ # Create an index of all pairs
34
+ indexer = recordlinkage.Index()
35
+
36
+ # Block on selected blocker column
37
+
38
+ ## Remove all NAs from predict_df blocker column
39
+ if blocker_column[0] == "PaoStartNumber":
40
+ predict_df_search = predict_df_search[~(predict_df_search[blocker_column[0]].isna()) & ~(predict_df_search[blocker_column[0]] == '')& ~(predict_df_search[blocker_column[0]].str.contains(r'^\s*$', na=False))]
41
+
42
+
43
+ indexer.block(blocker_column) #matchkey.block(["Postcode", "PaoStartNumber"])
44
+
45
+ # Generate candidate pairs
46
+
47
+ pairsSBM = indexer.index(predict_df_search,ref_search)
48
+
49
+ print('Running with ' + blocker_column[0] + ' as blocker has created', len(pairsSBM), 'pairs.')
50
+
51
+ # If no pairs are found, break
52
+ if len(pairsSBM) == 0: return pd.DataFrame()
53
+
54
+ # Call the compare class from the toolkit
55
+ compareSBM = recordlinkage.Compare()
56
+
57
+ # Assign variables to matching technique - exact
58
+ for columns in exact_columns:
59
+ compareSBM.exact(columns, columns, label = columns, missing_value = 0)
60
+
61
+ # Assign variables to matching technique - fuzzy
62
+ for columns in text_columns:
63
+ if columns == "Postcode":
64
+ compareSBM.string(columns, columns, label = columns, missing_value = 0, method = "levenshtein")
65
+ else:
66
+ compareSBM.string(columns, columns, label = columns, missing_value = 0, method = fuzzy_method)
67
+
68
+ ## Run the match - compare each column within the blocks according to exact or fuzzy matching (defined in cells above)
69
+
70
+ scoresSBM = compareSBM.compute(pairs = pairsSBM, x = predict_df_search, x_link = ref_search)
71
+
72
+ return scoresSBM
73
+
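# --- Illustrative usage sketch (not part of this commit) ---
# A hedged example of driving compute_match with tiny invented frames. The column set is a
# small subset of the real matching variables, purely to show the blocking/compare pattern.
def example_compute_match():
    predict_df_search = pd.DataFrame({"Postcode": ["SE1 1AA"], "PaoStartNumber": ["10"],
                                      "Street": ["EXAMPLE ROAD"]})
    ref_search = pd.DataFrame({"Postcode": ["SE1 1AA", "SE1 1AB"], "PaoStartNumber": ["10", "12"],
                               "Street": ["EXAMPLE ROAD", "EXAMPLE ROAD"]})
    scores = compute_match(predict_df_search, ref_search, None,
                           matching_variables=["Postcode", "PaoStartNumber", "Street"],
                           text_columns=["Street"], blocker_column=["Postcode"],
                           weights=None, fuzzy_method="jarowinkler")
    # scores is indexed by (search row, reference row) pairs, one similarity column per variable
    return scores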
74
+ def calc_final_nnet_scores(scoresSBM, weights, matching_variables):
75
+ #Modify the output scores by the weights set at the start of the code
76
+ scoresSBM_w = scoresSBM*weights
77
+
78
+ ### Determine matched roles that score above a threshold
79
+
80
+ # Sum all columns
81
+ scoresSBM_r = scoresSBM_w
82
+
83
+ scoresSBM_r['score'] = scoresSBM_r[matching_variables].sum(axis = 1)
84
+ scoresSBM_r['score_max'] = sum(weights.values()) # maximum possible weighted score
85
+ scoresSBM_r['score_perc'] = (scoresSBM_r['score'] / scoresSBM_r['score_max'])*100
86
+
87
+ scoresSBM_r = scoresSBM_r.reset_index()
88
+
89
+ # Rename the index if misnamed
90
+ scoresSBM_r = scoresSBM_r.rename(columns={"index":"level_1"}, errors = "ignore")
91
+
92
+ # Sort all comparisons by score in descending order
93
+ scoresSBM_r = scoresSBM_r.sort_values(by=["level_0","score_perc"], ascending = False)
94
+
95
+ # Within each search address, remove anything below the max
96
+ #scoresSBM_r.to_csv("scoresSBM_r.csv")
97
+ scoresSBM_g = scoresSBM_r.reset_index()
98
+
99
+ # Get maximum score to join on
100
+ scoresSBM_g = scoresSBM_g.groupby("level_0").max("score_perc").reset_index()[["level_0", "score_perc"]]
101
+ scoresSBM_g =scoresSBM_g.rename(columns={"score_perc":"score_perc_max"})
102
+ scoresSBM_search = scoresSBM_r.merge(scoresSBM_g, on = "level_0", how="left")
103
+
104
+ scoresSBM_search['score_perc'] = round(scoresSBM_search['score_perc'],1).astype(float)
105
+ scoresSBM_search['score_perc_max'] = round(scoresSBM_search['score_perc_max'],1).astype(float)
106
+
107
+ return scoresSBM_search
108
+
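# --- Illustrative worked example (not part of this commit) ---
# A hedged sketch of the weighting above: with weights {"Postcode": 2, "Street": 1} and raw
# per-column similarities of 1.0 and 0.8, the weighted score is 2*1.0 + 1*0.8 = 2.8 out of a
# maximum of 3, i.e. score_perc is about 93.3. The invented frame below reproduces that.
def example_weighted_score():
    weights = {"Postcode": 2, "Street": 1}
    scores = pd.DataFrame({"Postcode": [1.0], "Street": [0.8]},
                          index=pd.MultiIndex.from_tuples([(0, 0)]))
    return calc_final_nnet_scores(scores, weights, ["Postcode", "Street"])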
109
+ def join_on_pred_ref_details(scoresSBM_search_m, ref_search, predict_df_search):
110
+ ## Join back search and ref_df address details onto matching df
111
+ scoresSBM_search_m_j = scoresSBM_search_m.merge(ref_search, left_on="level_1", right_index=True, how = "left", suffixes=("", "_ref"))
112
+
113
+ scoresSBM_search_m_j = scoresSBM_search_m_j.merge(predict_df_search, left_on="level_0", right_index=True,how="left", suffixes=("", "_pred"))
114
+
115
+ scoresSBM_search_m_j = scoresSBM_search_m_j.reindex(sorted(scoresSBM_search_m_j.columns), axis=1)
116
+
117
+ #scoresSBM_search_m_j.to_csv("scoresSBM_search_m_j.csv")
118
+
119
+ return scoresSBM_search_m_j
120
+
121
+ def rearrange_columns(scoresSBM_search_m_j, new_join_col, search_df_key_field, blocker_column, standardise):
122
+
123
+ start_columns = new_join_col.copy()
124
+
125
+ start_columns.extend(["address", "fulladdress", "level_0", "level_1","score","score_max","score_perc","score_perc_max"])
126
+
127
+ other_columns = list(set(scoresSBM_search_m_j.columns) - set(start_columns))
128
+
129
+ all_columns_order = start_columns.copy()
130
+ all_columns_order.extend(sorted(other_columns))
131
+
132
+
133
+ # Place important columns at start
134
+
135
+ scoresSBM_search_m_j = scoresSBM_search_m_j.reindex(all_columns_order, axis=1)
136
+
137
+ scoresSBM_search_m_j = scoresSBM_search_m_j.rename(columns={'address':'address_pred',
138
+ 'fulladdress':'address_ref',
139
+ 'level_0':'index_pred',
140
+ 'level_1':'index_ref',
141
+ 'score':'match_score',
142
+ 'score_max':'max_possible_score',
143
+ 'score_perc':'perc_weighted_columns_matched',
144
+ 'score_perc_max':'perc_weighted_columns_matched_max_for_pred_address'})
145
+
146
+ scoresSBM_search_m_j = scoresSBM_search_m_j.sort_values("index_pred", ascending = True)
147
+
148
+ # ref_index is just a duplicate of index_ref, needed for outputs
149
+ scoresSBM_search_m_j["ref_index"] = scoresSBM_search_m_j["index_ref"]
150
+
151
+ #search_df_j = orig_search_df[["full_address_search", search_df_key_field]]
152
+
153
+ #scoresSBM_out = scoresSBM_search_m_j.merge(search_df_j, left_on = "address_pred", right_on = "full_address_search", how = "left")
154
+
155
+ final_cols = new_join_col.copy()
156
+ final_cols.extend([search_df_key_field, 'full_match_score_based', 'address_pred', 'address_ref',\
157
+ 'match_score', 'max_possible_score', 'perc_weighted_columns_matched',\
158
+ 'perc_weighted_columns_matched_max_for_pred_address',\
159
+ 'Organisation', 'Organisation_ref', 'Organisation_pred',\
160
+ 'SaoText', 'SaoText_ref', 'SaoText_pred',\
161
+ 'SaoStartNumber', 'SaoStartNumber_ref', 'SaoStartNumber_pred',\
162
+ 'SaoStartSuffix', 'SaoStartSuffix_ref', 'SaoStartSuffix_pred',\
163
+ 'SaoEndNumber', 'SaoEndNumber_ref', 'SaoEndNumber_pred',\
164
+ 'SaoEndSuffix', 'SaoEndSuffix_ref', 'SaoEndSuffix_pred',\
165
+ 'PaoStartNumber', 'PaoStartNumber_ref', 'PaoStartNumber_pred',\
166
+ 'PaoStartSuffix', 'PaoStartSuffix_ref', 'PaoStartSuffix_pred',\
167
+ 'PaoEndNumber', 'PaoEndNumber_ref', 'PaoEndNumber_pred',\
168
+ 'PaoEndSuffix', 'PaoEndSuffix_ref', 'PaoEndSuffix_pred',\
169
+ 'PaoText', 'PaoText_ref', 'PaoText_pred',\
170
+ 'Street', 'Street_ref', 'Street_pred',\
171
+ 'PostTown', 'PostTown_ref', 'PostTown_pred',\
172
+ 'Postcode', 'Postcode_ref', 'Postcode_pred', 'Postcode_predict',\
173
+ 'index_pred', 'index_ref', 'Reference file'
174
+ ])
175
+
176
+ scoresSBM_out = scoresSBM_search_m_j[final_cols]
177
+
178
+ #scoresSBM_out.to_csv("scoresSBM_out" + "_" + blocker_column[0] + "_" + str(standardise) + ".csv")
179
+
180
+ return scoresSBM_out, start_columns
181
+
182
+ def create_matched_results_nnet(scoresSBM_best, search_df_key_field, orig_search_df, new_join_col, standardise, ref_search, blocker_column, score_cut_off):
183
+
184
+ #scoresSBM_best.to_csv("scores_sbm_best_" + str(standardise) + ".csv")
185
+
186
+ ### Make the final 'matched output' file
187
+ scoresSBM_best_pred_cols = scoresSBM_best.filter(regex='_pred$').iloc[:,1:-1]
188
+ scoresSBM_best["search_orig_address"] = (scoresSBM_best_pred_cols.agg(' '.join, axis=1)).str.strip().str.replace("\s{2,}", " ", regex=True)
189
+
190
+ scoresSBM_best_ref_cols = scoresSBM_best.filter(regex='_ref$').iloc[:,1:-1]
191
+ scoresSBM_best['reference_mod_address'] = (scoresSBM_best_ref_cols.agg(' '.join, axis=1)).str.strip().str.replace("\s{2,}", " ", regex=True)
192
+
193
+ ## Create matched output df
194
+ matched_output_SBM = orig_search_df[[search_df_key_field, "full_address", "postcode", "property_number", "prop_number", "flat_number", "apart_number", "block_number", 'unit_number', "room_number", "house_court_name"]].replace(r"\bnan\b", "", regex=True).infer_objects(copy=False)
195
+ matched_output_SBM[search_df_key_field] = matched_output_SBM[search_df_key_field].astype(str)
196
+
197
+ ###
198
+ matched_output_SBM = matched_output_SBM.merge(scoresSBM_best[[search_df_key_field, 'index_ref','address_ref',
199
+ 'full_match_score_based', 'Reference file']], on = search_df_key_field, how = "left").\
200
+ rename(columns={"full_address":"search_orig_address"})
201
+
202
+ #ref_search.to_csv("ref_search.csv")
203
+
204
+ if 'index' not in ref_search.columns:
205
+ ref_search['ref_index'] = ref_search.index
206
+
207
+ matched_output_SBM = matched_output_SBM.merge(ref_search.drop_duplicates("fulladdress")[["ref_index", "fulladdress", "Postcode", "property_number", "prop_number", "flat_number", "apart_number", "block_number", 'unit_number', "room_number", "house_court_name", "ref_address_stand"]], left_on = "address_ref", right_on = "fulladdress", how = "left", suffixes=('_search', '_reference')).rename(columns={"fulladdress":"reference_orig_address", "ref_address_stand":"reference_list_address"})
208
+
209
+ #matched_output_SBM.to_csv("matched_output_SBM_earlier_" + str(standardise) + ".csv")
210
+
211
+ # To replace with number check
212
+
213
+
214
+ matched_output_SBM = matched_output_SBM.rename(columns={"full_match_score_based":"full_match"})
215
+
216
+ matched_output_SBM['property_number_match'] = matched_output_SBM['full_match']
217
+ #
218
+
219
+ scores_SBM_best_cols = [search_df_key_field, 'full_match_score_based', 'perc_weighted_columns_matched', 'address_pred']#, "reference_mod_address"]
220
+ scores_SBM_best_cols.extend(new_join_col)
221
+
222
+ matched_output_SBM_b = scoresSBM_best[scores_SBM_best_cols]
223
+
224
+ matched_output_SBM = matched_output_SBM.merge(matched_output_SBM_b.drop_duplicates(search_df_key_field), on = search_df_key_field, how = "left")
225
+
226
+ #matched_output_SBM.to_csv("matched_output_SBM_later_" + str(standardise) + ".csv")
227
+
228
+ from tools.fuzzy_match import create_diag_shortlist
229
+ matched_output_SBM = create_diag_shortlist(matched_output_SBM, "search_orig_address", score_cut_off, blocker_column, fuzzy_col='perc_weighted_columns_matched', search_mod_address="address_pred", resolve_tie_breaks=False)
230
+
231
+ #matched_output_SBM.to_csv("matched_output_after.csv")
232
+
233
+ #matched_output_SBM["UPRN"] = scoresSBM_best['UPRN']
234
+
235
+ matched_output_SBM['standardised_address'] = standardise
236
+
237
+ matched_output_SBM = matched_output_SBM.rename(columns={"address_pred":"search_mod_address",
238
+ #"address_ref":"reference_orig_address",
239
+ #"full_match_score_based":"fuzzy_score_match",
240
+ 'perc_weighted_columns_matched':"fuzzy_score"})
241
+
242
+ matched_output_SBM_cols = [search_df_key_field, 'search_orig_address','reference_orig_address',
243
+ 'full_match',
244
+ 'full_number_match',
245
+ 'flat_number_match',
246
+ 'room_number_match',
247
+ 'block_number_match',
248
+ 'property_number_match',
249
+ 'close_postcode_match',
250
+ 'house_court_name_match',
251
+ 'fuzzy_score_match',
252
+ "fuzzy_score",
253
+ 'property_number_search', 'property_number_reference',
254
+ 'flat_number_search', 'flat_number_reference',
255
+ 'room_number_search', 'room_number_reference',
256
+ 'block_number_search', 'block_number_reference',
257
+ "unit_number_search","unit_number_reference",
258
+ 'house_court_name_search', 'house_court_name_reference',
259
+ "search_mod_address", 'reference_mod_address','Postcode', 'postcode', 'ref_index', 'Reference file']
260
+
261
+ #matched_output_SBM_cols = [search_df_key_field, 'search_orig_address', 'reference_orig_address',
262
+ #'full_match', 'fuzzy_score_match', 'property_number_match', 'full_number_match',
263
+ #'fuzzy_score', 'search_mod_address', 'reference_mod_address', 'Reference file']
264
+
265
+ matched_output_SBM_cols.extend(new_join_col)
266
+ matched_output_SBM_cols.extend(['standardised_address'])
267
+ matched_output_SBM = matched_output_SBM[matched_output_SBM_cols]
268
+
269
+ matched_output_SBM = matched_output_SBM.sort_values(search_df_key_field, ascending=True)
270
+
271
+ #matched_output_SBM.to_csv("matched_output_SBM_out.csv")
272
+
273
+ return matched_output_SBM
274
+
275
+ def score_based_match(predict_df_search, ref_search, orig_search_df, matching_variables, text_columns, blocker_column, weights, fuzzy_method, score_cut_off, search_df_key_field, standardise, new_join_col, score_cut_off_nnet_street=score_cut_off_nnet_street):
276
+
277
+ scoresSBM = compute_match(predict_df_search, ref_search, orig_search_df, matching_variables, text_columns, blocker_column, weights, fuzzy_method)
278
+
279
+ if scoresSBM.empty:
280
+ # If no pairs are found, break
281
+ return pd.DataFrame(), pd.DataFrame()
282
+
283
+ scoresSBM_search = calc_final_nnet_scores(scoresSBM, weights, matching_variables)
284
+
285
+ # Filter potential matched address scores to those with highest scores only
286
+ scoresSBM_search_m = scoresSBM_search[scoresSBM_search["score_perc"] == scoresSBM_search["score_perc_max"]]
287
+
288
+ scoresSBM_search_m_j = join_on_pred_ref_details(scoresSBM_search_m, ref_search, predict_df_search)
289
+
290
+ #scoresSBM_search_m_j.to_csv("scoresSBM_search_m_j.csv")
291
+
292
+ # When blocking by street, it may be necessary to have an increased threshold as this is more prone to making mistakes
293
+ if blocker_column[0] == "Street": scoresSBM_search_m_j['full_match_score_based'] = (scoresSBM_search_m_j['score_perc'] >= score_cut_off_nnet_street)
294
+
295
+ else: scoresSBM_search_m_j['full_match_score_based'] = (scoresSBM_search_m_j['score_perc'] >= score_cut_off)
296
+
297
+ ### Reorder some columns
298
+ scoresSBM_out, start_columns = rearrange_columns(scoresSBM_search_m_j, new_join_col, search_df_key_field, blocker_column, standardise)
299
+
300
+ #scoresSBM_out.to_csv("scoresSBM_out.csv")
301
+
302
+ matched_output_SBM = create_matched_results_nnet(scoresSBM_out, search_df_key_field, orig_search_df, new_join_col, standardise, ref_search, blocker_column, score_cut_off)
303
+
304
+ matched_output_SBM_best = matched_output_SBM.sort_values([search_df_key_field, "full_match"], ascending = [True, False]).drop_duplicates(search_df_key_field)
305
+
306
+ #matched_output_SBM.to_csv("matched_output_SBM.csv")
307
+ #matched_output_SBM_best.to_csv("matched_output_SBM_best.csv")
308
+
309
+ scoresSBM_best = scoresSBM_out[scoresSBM_out[search_df_key_field].isin(matched_output_SBM_best[search_df_key_field])]
310
+
311
+ return scoresSBM_best, matched_output_SBM_best
312
+
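# --- Illustrative usage sketch (not part of this commit) ---
# A hedged example of wiring score_based_match together. The input frames are assumed to
# have been prepared upstream (tools/preparation.py and the model prediction step); the
# weights, column lists and cut-offs below are placeholders, not the app's real configuration.
def example_score_based_match(predict_df_search, ref_search, orig_search_df):
    matching_variables = ["Organisation", "SaoText", "PaoStartNumber", "Street", "PostTown", "Postcode"]
    text_columns = ["Organisation", "SaoText", "Street", "PostTown", "Postcode"]
    weights = {col: 1 for col in matching_variables}
    weights["Postcode"] = 2
    scores_best, matched_best = score_based_match(
        predict_df_search, ref_search, orig_search_df,
        matching_variables, text_columns, blocker_column=["Postcode"],
        weights=weights, fuzzy_method="jarowinkler", score_cut_off=90,
        search_df_key_field="index", standardise=False, new_join_col=["UPRN"])
    return scores_best, matched_best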
313
+ def check_matches_against_fuzzy(match_results, scoresSBM, search_df_key_field):
314
+
315
+ if not match_results.empty:
316
+
317
+ if 'fuzz_full_match' not in match_results.columns:
318
+ match_results['fuzz_full_match'] = False
319
+
320
+ match_results = match_results.add_prefix("fuzz_").rename(columns={"fuzz_"+search_df_key_field:search_df_key_field})
321
+
322
+ #Merge fuzzy match full matches onto model data
323
+
324
+ scoresSBM_m = scoresSBM.merge(match_results.drop_duplicates(search_df_key_field), on = search_df_key_field, how = "left")
325
+
326
+ else:
327
+ scoresSBM_m = scoresSBM
328
+ scoresSBM_m["fuzz_full_match"] = False
329
+ scoresSBM_m['fuzz_fuzzy_score_match'] = False
330
+ scoresSBM_m['fuzz_property_number_match'] = False
331
+ scoresSBM_m['fuzz_fuzzy_score'] = 0
332
+ scoresSBM_m['fuzz_reference_orig_address'] = ""
333
+
334
+ scoresSBM_t = scoresSBM[scoresSBM["full_match_score_based"]==True]
335
+
336
+ ### Create a df of matches the model finds that the fuzzy matching work did not
337
+
338
+ scoresSBM_m_model_add_matches = scoresSBM_m[(scoresSBM_m["full_match_score_based"] == True) &\
339
+ (scoresSBM_m["fuzz_full_match"] == False)]
340
+
341
+ # Drop some irrelevant columns
342
+
343
+ first_cols = ['UPRN', search_df_key_field, 'full_match_score_based', 'fuzz_full_match', 'fuzz_fuzzy_score_match', 'fuzz_property_number_match',\
344
+ 'fuzz_fuzzy_score', 'match_score', 'max_possible_score', 'perc_weighted_columns_matched',\
345
+ 'perc_weighted_columns_matched_max_for_pred_address', 'address_pred',\
346
+ 'address_ref', 'fuzz_reference_orig_address']
347
+
348
+ last_cols = [col for col in scoresSBM_m_model_add_matches.columns if col not in first_cols]
349
+
350
+ scoresSBM_m_model_add_matches = scoresSBM_m_model_add_matches[first_cols+last_cols].drop(['fuzz_search_mod_address',
351
+ 'fuzz_reference_mod_address', 'fuzz_fulladdress', 'fuzz_UPRN'], axis=1, errors="ignore")
352
+
353
+ ### Create a df for matches the fuzzy matching found that the neural net model does not
354
+
355
+ if not match_results.empty:
356
+ scoresSBM_t_model_failed = match_results[(~match_results[search_df_key_field].isin(scoresSBM_t[search_df_key_field])) &\
357
+ (match_results["fuzz_full_match"] == True)]
358
+
359
+ scoresSBM_t_model_failed = scoresSBM_t_model_failed.\
360
+ merge(scoresSBM.drop_duplicates(search_df_key_field), on = search_df_key_field, how = "left")
361
+
362
+ scoresSBM_t_model_failed = scoresSBM_t_model_failed[first_cols+last_cols].drop(['fuzz_search_mod_address',
363
+ 'fuzz_reference_mod_address', 'fuzz_fulladdress', 'fuzz_UPRN'], axis=1, errors="ignore")
364
+ else:
365
+ scoresSBM_t_model_failed = pd.DataFrame()
366
+
367
+ ## Join back onto original results file and export
368
+
369
+ scoresSBM_new_matches_from_model = scoresSBM_m_model_add_matches.drop_duplicates(search_df_key_field)
370
+
371
+ if not match_results.empty:
372
+ match_results_out = match_results.merge(scoresSBM_new_matches_from_model[[search_df_key_field, 'full_match_score_based', 'address_pred',
373
+ 'address_ref']], on = search_df_key_field, how = "left")
374
+
375
+ match_results_out.loc[match_results_out['full_match_score_based'].isna(),'full_match_score_based'] = False
376
+
377
+ #match_results_out['full_match_score_based'].value_counts()
378
+
379
+ match_results_out["full_match_fuzzy_or_score_based"] = (match_results_out["fuzz_full_match"] == True) |\
380
+ (match_results_out["full_match_score_based"] == True)
381
+ else: match_results_out = match_results
382
+
383
+ return scoresSBM_m_model_add_matches, scoresSBM_t_model_failed, match_results_out
384
+
tools/standardise.py ADDED
@@ -0,0 +1,722 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import re
4
+ from typing import Type, Dict, List, Tuple
5
+ from datetime import datetime
6
+ import warnings
7
+ warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression')
8
+
9
+ PandasDataFrame = Type[pd.DataFrame]
10
+ PandasSeries = Type[pd.Series]
11
+ MatchedResults = Dict[str,Tuple[str,int]]
12
+ array = List[str]
13
+
14
+ today = datetime.now().strftime("%d%m%Y")
15
+ today_rev = datetime.now().strftime("%Y%m%d")
16
+
17
+ # # Standardisation functions
18
+
19
+ def standardise_wrapper_func(search_df_cleaned:PandasDataFrame, ref_df_cleaned:PandasDataFrame,\
20
+ standardise = False, filter_to_lambeth_pcodes = True, match_task = "fuzzy"):
21
+ '''
22
+ Initial standardisation of search and reference dataframes before passing addresses and postcodes to the main standardisation function
23
+ '''
24
+
25
+ ## Search df - lower case addresses, replace spaces in postcode
26
+ search_df_cleaned["full_address_search"] = search_df_cleaned["full_address"].str.lower().str.strip()
27
+ search_df_cleaned['postcode_search'] = search_df_cleaned['postcode'].str.lower().str.strip().str.replace("\s+", "", regex=True)
28
+
29
+ # Filter out records where 'Excluded from search' is not a postal address by making the postcode blank
30
+ search_df_cleaned.loc[search_df_cleaned['Excluded from search'] == "Excluded - non-postal address", 'postcode_search'] = ""
31
+
32
+ # Remove nulls from ref_df postcode
33
+ ref_df_cleaned = ref_df_cleaned[ref_df_cleaned['Postcode'].notna()]
34
+
35
+ ref_df_cleaned["full_address_search"] = ref_df_cleaned["fulladdress"].str.lower().str.strip()
36
+ ref_df_cleaned['postcode_search'] = ref_df_cleaned['Postcode'].str.lower().str.strip().str.replace("\s+", "", regex=True)
37
+
38
+ # Block only on first 5 characters of postcode string - Doesn't give more matches and makes everything a bit slower
39
+ #search_df_cleaned['postcode_search'] = search_df_cleaned['postcode_search'].str[:-1]
40
+ #ref_df_cleaned['postcode_search'] = ref_df_cleaned['postcode_search'].str[:-1]
41
+
42
+
43
+ ### Use standardise function
44
+
45
+ ### Remove 'non-housing' places from the list - not included as want to check all
46
+ #search_df_after_stand = remove_non_housing(search_df_cleaned, 'full_address_search')
47
+ search_df_after_stand = standardise_address(search_df_cleaned, "full_address_search", "search_address_stand", standardise = standardise, out_london = True)
48
+
49
+ ## Standardise ref_df addresses
50
+
51
+ if match_task == "fuzzy":
52
+ ref_df_after_stand = standardise_address(ref_df_cleaned, "full_address_search", "ref_address_stand", standardise = standardise, out_london = True)
53
+ else:
54
+ # For the neural net matching, I didn't find that standardising the reference addresses helped at all, in fact it made things worse. So reference addresses are not standardised at this step.
55
+ ref_df_after_stand = standardise_address(ref_df_cleaned, "full_address_search", "ref_address_stand", standardise = False, out_london = True)
56
+
57
+
58
+ return search_df_after_stand, ref_df_after_stand#, search_df_after_stand_series, ref_df_after_stand_series
59
+
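# --- Illustrative usage sketch (not part of this commit) ---
# A hedged example call: `search_df_cleaned` and `ref_df_cleaned` are assumed to be the
# outputs of the preparation step (tools/preparation.py), i.e. they already carry
# 'full_address', 'postcode', 'Excluded from search', 'fulladdress' and 'Postcode' columns.
def example_standardise(search_df_cleaned, ref_df_cleaned):
    search_stand, ref_stand = standardise_wrapper_func(
        search_df_cleaned, ref_df_cleaned, standardise=True, match_task="fuzzy")
    # both frames gain lower-cased '*_search' columns and a standardised address column
    # ('search_address_stand' / 'ref_address_stand') used by the later matching steps
    return search_stand, ref_stand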
60
+ def standardise_address(df:PandasDataFrame, col:str, out_col:str, standardise:bool = True, out_london = True) -> PandasDataFrame:
61
+
62
+ '''
63
+ This function takes a 'full address' column and then standardises so that extraneous
64
+ information is removed (i.e. postcodes & London, as this algorithm is used for London
65
+ addresses only), and so that room/flat/property numbers can be accurately extracted. The
66
+ standardised addresses can then be used for the fuzzy matching functions later in this
67
+ notebook.
68
+
69
+ The function does the following:
70
+
71
+ - Removes the post code and 'london' (if not dealing with addresses outside of london)
72
+ from the address to reduce the text the algorithm has to search.
73
+ Postcode removal uses regex to extract a UK postcode.
74
+
75
+ - Remove the word 'flat' or 'apartment' from an address that has only one number in it
76
+
77
+ - Add 'flat' to the start of any address that contains 'house' or 'court' (which are generally housing association buildings)
78
+ This is because in the housing list, these addresses never have the word flat in front of them
79
+
80
+ - Insert a space after any comma that is not followed by one, and collapse double spaces
81
+
82
+ - Replace 'number / number' and 'number-number' with the first number of the pair
83
+
84
+ - Add 'flat' to the start of addresses that refer to a ground floor/first floor etc. flat
85
+ in the text, replacing the floor reference with flat a, b, c etc.
86
+
87
+ - Pull out property, flat, and room numbers from the address text
88
+
89
+ - Return the data frame with the new columns included
90
+
91
+ '''
92
+
93
+ df_copy = df.copy(deep=True)
94
+
95
+ # Trim the address to remove leading and tailing spaces
96
+ df_copy[col] = df_copy[col].str.strip()
97
+
98
+ ''' Remove the postcode and 'london' from the address to reduce the text the algorithm has to search.
99
+ A regex is used to extract a UK postcode; the pattern is adapted from the link below (their \b needs to be written as \\b here):
100
+ https://stackoverflow.com/questions/51828712/r-regular-expression-for-extracting-uk-postcode-from-an-address-is-not-ordered
101
+
102
+ The following will pick up whole postcodes, postcodes with just the first part, and postcodes with the first
103
+ part and the first number of the second half
104
+ '''
105
+
106
+
107
+ df_copy['add_no_pcode'] = remove_postcode(df_copy, col)
108
+
109
+ if out_london == False:
110
+ df_copy['add_no_pcode'] = df_copy['add_no_pcode'].str.replace("london","").str.replace(r",,|, ,","", regex=True)
111
+
112
+ # If the user wants to standardise the address
113
+ if standardise:
114
+
115
+ df_copy['add_no_pcode'] = df_copy['add_no_pcode'].str.lower()
116
+
117
+ # If there are dates at the start of the address, change this
118
+ df_copy['add_no_pcode'] = replace_mistaken_dates(df_copy, 'add_no_pcode')
119
+
120
+ # Replace flat name variations with flat, abbreviations with full name of item (e.g. rd to road)
121
+ df_copy['add_no_pcode'] = df_copy['add_no_pcode'].str.replace(r"\brd\b","road", regex=True).\
122
+ str.replace(r"\bst\b","street", regex=True).\
123
+ str.replace(r"\bave\b","avenue", regex=True).\
124
+ str.replace("'", "", regex=False).\
125
+ str.replace(r"\bat\b ", " ",regex=True).\
126
+ str.replace("apartment", "flat",regex=False).\
127
+ str.replace("studio flat", "flat",regex=False).\
128
+ str.replace("cluster flat", "flats",regex=False).\
129
+ str.replace(r"\bflr\b", "floor", regex=True).\
130
+ str.replace(r"\bflrs\b", "floors", regex=True).\
131
+ str.replace(r"\blwr\b", "lower", regex=True).\
132
+ str.replace(r"\bgnd\b", "ground", regex=True).\
133
+ str.replace(r"\blgnd\b", "lower ground", regex=True).\
134
+ str.replace(r"\bgrd\b", "ground", regex=True).\
135
+ str.replace(r"\bmais\b", "flat", regex=True).\
136
+ str.replace(r"\bmaisonette\b", "flat", regex=True).\
137
+ str.replace(r"\bpt\b", "penthouse", regex=True).\
138
+ str.replace(r"\bbst\b","basement", regex=True).\
139
+ str.replace(r"\bbsmt\b","basement", regex=True)
140
+
141
+ df_copy["add_no_pcode_house"] = move_flat_house_court(df_copy)
142
+
143
+ # Insert a space after commas that are not followed by one, and collapse double spaces
144
+ df_copy['add_no_pcode_house_comma'] = df_copy['add_no_pcode_house'].str.replace(r',(\w)', r', \1', regex=True).str.replace('  ', ' ', regex=False)
145
+
146
+ # Replace number / number and number-number with number
147
+ df_copy['add_no_pcode_house_comma_no'] = df_copy['add_no_pcode_house_comma'].str.replace(r'(\d+)\/(\d+)', r'\1', regex=True\
148
+ ).str.replace(r'(\d+)-(\d+)', r'\1', regex=True\
149
+ ).str.replace(r'(\d+) - (\d+)', r'\1', regex=True)
150
+
151
+ # Add 'flat' to the start of addresses that include ground/first/second etc. floor flat in the text
152
+ df_copy['floor_replacement'] = replace_floor_flat(df_copy, 'add_no_pcode_house_comma_no')
153
+ df_copy['flat_added_to_start_addresses_begin_letter'] = add_flat_addresses_start_with_letter(df_copy, 'floor_replacement')
154
+
155
+ df_copy[out_col] = merge_series(df_copy['add_no_pcode_house_comma_no'], df_copy['flat_added_to_start_addresses_begin_letter'])
156
+
157
+ # Write stuff back to the original df
158
+ df[out_col] = df_copy[out_col]
159
+
160
+ else:
161
+ df_copy[out_col] = df_copy['add_no_pcode']
162
+ df[out_col] = df_copy['add_no_pcode']
163
+
164
+ ## POST STANDARDISATION CLEANING AND INFORMATION EXTRACTION
165
+ # Remove leading and trailing spaces
166
+ df[out_col] = df[out_col].str.strip()
167
+
168
+ # Pull out property, flat, and room numbers from the address text
169
+ df['property_number'] = extract_prop_no(df_copy, out_col)
170
+
171
+ # Extract flat, apartment numbers
172
+ df = extract_flat_and_other_no(df, out_col)
173
+
174
+ df['flat_number'] = merge_series(df['flat_number'], df['apart_number'])
175
+ df['flat_number'] = merge_series(df['flat_number'], df['prop_number'])
176
+ df['flat_number'] = merge_series(df['flat_number'], df['first_sec_number'])
177
+ df['flat_number'] = merge_series(df['flat_number'], df['first_letter_flat_number'])
178
+ df['flat_number'] = merge_series(df['flat_number'], df['first_letter_no_more_numbers'])
179
+
180
+ # Extract room numbers
181
+ df['room_number'] = extract_room_no(df, out_col)
182
+
183
+ # Extract block and unit names
184
+ df = extract_block_and_unit_name(df, out_col)
185
+
186
+ # Extract house or court name
187
+ df['house_court_name'] = extract_house_or_court_name(df, out_col)
188
+
189
+ return df
190
+
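+ # Rough illustration of standardise_address (hypothetical input; exact spacing depends on the helpers below):
+ # with standardise=True, "Flat 2, 10 Oak Court SE1 2AB" is expected to come out as roughly "flat 2, 10 oak court",
+ # with flat_number "2", property_number "10" and house_court_name "oak" extracted into new columns.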
191
+ def move_flat_house_court(df:PandasDataFrame):
192
+ ''' Move 'flat' to the start of any address that contains 'house', 'court', or 'terrace'
193
+ The word 'flat' is first stripped from the address and then re-added at the front for addresses containing 'house', 'court', or 'terrace'.
194
+ This is because in the housing list, these addresses never have the word flat in front of them
195
+ '''
196
+
197
+ # Remove the word flat or apartment from addresses that have only one number in it. 'Flat' will be re-added later to relevant addresses
198
+ # that need it (replace_floor_flat)
199
+ df['flat_removed'] = remove_flat_one_number_address(df, 'add_no_pcode')
200
+
201
+
202
+
203
+ remove_flat_house = df['flat_removed'].str.lower().str.contains(r"\bhouse\b")#(?=\bhouse\b)(?!.*house road)")
204
+ remove_flat_court = df['flat_removed'].str.lower().str.contains(r"\bcourt\b")#(?=\bcourt\b)(?!.*court road)")
205
+ remove_flat_terrace = df['flat_removed'].str.lower().str.contains(r"\bterrace\b")#(?=\bterrace\b)(?!.*terrace road)")
206
+ remove_flat_house_or_court = (remove_flat_house | remove_flat_court | remove_flat_terrace)
207
+
208
+ df['remove_flat_house_or_court'] = remove_flat_house_or_court
209
+
210
+ # Drop duplicated index values so the masked assignments below align correctly
211
+ df = df[~df.index.duplicated(keep='first')]
212
+
213
+ df['house_court_replacement'] = "flat " + df.loc[df['remove_flat_house_or_court'] == True, 'flat_removed'].str.replace(r"\bflat\b","", regex=True).str.strip().map(str)
214
+
215
+ #df["add_no_pcode_house"] = merge_columns(df, "add_no_pcode_house", 'flat_removed', "house_court_replacement")
216
+
217
+ #merge_columns(df, "new_col", col1, 'letter_after_number')
218
+ df["add_no_pcode_house"] = merge_series(df['flat_removed'], df["house_court_replacement"])
219
+
220
+ return df["add_no_pcode_house"]
221
+
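+ # For example (hypothetical input): "1 oak house" is expected to come out as "flat 1 oak house",
+ # while addresses without 'house', 'court' or 'terrace' pass through unchanged.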
222
+ def extract_street_name(address:str) -> str:
223
+ """
224
+ Extracts the street name from the given address.
225
+
226
+ Args:
227
+ address (str): The input address string.
228
+
229
+ Returns:
230
+ str: The extracted street name, or an empty string if no match is found.
231
+
232
+ Examples:
233
+ >>> address1 = "1 Ash Park Road SE54 3HB"
234
+ >>> extract_street_name(address1)
235
+ 'Ash Park Road'
236
+
237
+ >>> address2 = "Flat 14 1 Ash Park Road SE54 3HB"
238
+ >>> extract_street_name(address2)
239
+ 'Ash Park Road'
240
+
241
+ >>> address3 = "123 Main Blvd"
242
+ >>> extract_street_name(address3)
243
+ 'Main Blvd'
244
+
245
+ >>> address4 = "456 Maple AvEnUe"
246
+ >>> extract_street_name(address4)
247
+ 'Maple AvEnUe'
248
+
249
+ >>> address5 = "789 Oak Street"
250
+ >>> extract_street_name(address5)
251
+ 'Oak Street'
252
+ """
253
+
254
+
255
+
256
+ street_types = [
257
+ 'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
258
+ 'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
259
+ 'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
260
+ 'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
261
+ 'Alley', 'Arcade','Bay','Bend','Brae','Byway','Close','Corner',
262
+ 'Crescent', 'Cres','Cul-de-sac','Dell','Esplanade','Glen','Green','Grove','Heights', 'Hts',
263
+ 'Mews','Path','Piazza','Promenade','Quay','Ridge','Row','Terrace', 'Ter','Track','Trail','View','Villas',
264
+ 'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
265
+ ]
266
+
267
+ # Dynamically construct the regex pattern with all possible street types
268
+ street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)
269
+
270
+ # The overall regex pattern to capture the street name
271
+ pattern = rf'(?:\d+\s+|\w+\s+\d+\s+|.*\d+[a-z]+\s+|.*\d+\s+)*(?P<street_name>[\w\s]+(?:{street_types_pattern}))'
272
+
273
+ def replace_postcode(address):
274
+ pattern = r'\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$'
275
+ return re.sub(pattern, "", address)
276
+
277
+
278
+ modified_address = replace_postcode(address.upper())
279
+ #print(modified_address)
280
+ #print(address)
281
+
282
+ # Perform a case-insensitive search
283
+ match = re.search(pattern, modified_address, re.IGNORECASE)
284
+
285
+ if match:
286
+ street_name = match.group('street_name')
287
+ return street_name.strip()
288
+ else:
289
+ return ""
290
+
291
+ def remove_flat_one_number_address(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
292
+
293
+ '''
294
+ If there is only one number in the address, and there is no letter after the number,
295
+ remove the word flat from the address
296
+ '''
297
+
298
+ df['contains_letter_after_number'] = df[col1].str.lower().str.contains(r"\d+(?:[a-z]|[A-Z])(?!.*\d+)", regex = True)
299
+ df['contains_single_letter_before_number'] = df[col1].str.lower().str.contains(r'\b[A-Za-z]\b[^\d]* \d', regex = True)
300
+ df['two_numbers_in_address'] = df[col1].str.lower().str.contains(r"(?:\d+.*?)[^a-zA-Z0-9_].*?\d+", regex = True)
301
+ df['contains_apartment'] = df[col1].str.lower().str.contains(r"\bapartment\b \w+|\bapartments\b \w+", regex = True)
302
+ df['contains_flat'] = df[col1].str.lower().str.contains(r"\bflat\b \w+|\bflats\b \w+", regex = True)
303
+ df['contains_room'] = df[col1].str.lower().str.contains(r"\broom\b \w+|\brooms\b \w+", regex = True)
304
+
305
+
306
+ df['selected_rows'] = (df['contains_letter_after_number'] == False) &\
307
+ (df['two_numbers_in_address'] == False) &\
308
+ (df['contains_single_letter_before_number'] == False) &\
309
+ ((df['contains_flat'] == True) |\
310
+ (df['contains_apartment'] == True) |\
311
+ (df['contains_room'] == True))
312
+
313
+ df['one_number_no_flat'] = df[df['selected_rows'] == True][col1]
314
+ df['one_number_no_flat'] = df['one_number_no_flat'].str.replace(r"(\bapartment\b)|(\bapartments\b)", "", regex=True).str.replace(r"(\bflat\b)|(\bflats\b)", "", regex=True).str.replace(r"(\broom\b)|(\brooms\b)", "", regex=True)
315
+
316
+ df["new_col"] = merge_series(df[col1], df["one_number_no_flat"])
317
+
318
+ return df['new_col']
319
+
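+ # Illustrative behaviour (hypothetical input): "flat 5 birch road" has a single number and no letter
+ # directly after it, so 'flat' is expected to be dropped, giving roughly "5 birch road" (modulo extra spaces).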
320
+ def add_flat_addresses_start_with_letter(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
321
+ '''
322
+ Add the word flat to addresses that start with a letter.
323
+ '''
324
+
325
+ df['contains_single_letter_at_start_before_number'] = df[col1].str.lower().str.contains(r'^\b[A-Za-z]\b[^\d]* \d', regex = True)
326
+
327
+ df['selected_rows'] = (df['contains_single_letter_at_start_before_number'] == True)
328
+ df['flat_added_to_string_start'] = "flat " + df[df['selected_rows'] == True][col1]
329
+
330
+ #merge_columns(df, "new_col", col1, 'flat_added_to_string_start')
331
+ df["new_col"] = merge_series(df[col1], df['flat_added_to_string_start'])
332
+
333
+
334
+ return df['new_col']
335
+
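+ # e.g. (hypothetical input): "a 12 rose street" starts with a single letter before a number, so it is
+ # expected to become "flat a 12 rose street"; other addresses are left untouched.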
336
+ def extract_letter_one_number_address(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
337
+ '''
338
+ This function looks for addresses that have a letter after a number, but ONLY one number
339
+ in the string, and doesn't already have a flat, apartment, or room number.
340
+
341
+ It then extracts this letter and returns this.
342
+
343
+ This is for addresses such as '2b sycamore road', changes it to
344
+ flat b 2 sycamore road so that 'b' is selected as the flat number
345
+
346
+
347
+ '''
348
+
349
+ df['contains_no_numbers_without_letter'] = df[col1].str.lower().str.contains(r"^(?:(?!\d+ ).)*$")
350
+ df['contains_letter_after_number'] = df[col1].str.lower().str.contains(r"\d+(?:[a-z]|[A-Z])(?!.*\d+)")
351
+ df['contains_apartment'] = df[col1].str.lower().str.contains(r"\bapartment\b \w+|\bapartments\b \w+")
352
+ df['contains_flat'] = df[col1].str.lower().str.contains(r"\bflat\b \w+|\bflats\b \w+")
353
+ df['contains_room'] = df[col1].str.lower().str.contains(r"\broom\b \w+|\brooms\b \w+")
354
+
355
+ df['selected_rows'] = (df['contains_no_numbers_without_letter'] == True) &\
356
+ (df['contains_letter_after_number'] == True) &\
357
+ (df['contains_flat'] == False) &\
358
+ (df['contains_apartment'] == False) &\
359
+ (df['contains_room'] == False)
360
+
361
+ df['extract_letter'] = df[(df['selected_rows'] == True)\
362
+ ][col1].str.extract(r"\d+([a-z]|[A-Z])")
363
+
364
+ df['extract_number'] = df[(df['selected_rows'] == True)\
365
+ ][col1].str.extract(r"(\d+)[a-z]|[A-Z]")
366
+
367
+
368
+ df['letter_after_number'] = "flat " +\
369
+ df[(df['selected_rows'] == True)\
370
+ ]['extract_letter'] +\
371
+ " " +\
372
+ df[(df['selected_rows'] == True)\
373
+ ]['extract_number'] +\
374
+ " " +\
375
+ df[(df['selected_rows'])\
376
+ ][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\d+([a-z]|[A-Z])","", regex=True).map(str)
377
+
378
+ #merge_columns(df, "new_col", col1, 'letter_after_number')
379
+ df["new_col"] = merge_series(df[col1], df['letter_after_number'])
380
+
381
+ return df['new_col']
382
+
383
+ # def extract_letter_one_number_address(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
384
+ # '''
385
+ # This function extracts a letter after a single number in an address, excluding cases with existing flat, apartment, or room numbers.
386
+ # It transforms addresses like '2b sycamore road' to 'flat b 2 sycamore road' to designate 'b' as the flat number.
387
+ # '''
388
+
389
+ # df['selected_rows'] = (df[col1].str.lower().str.contains(r"^(?:(?!\d+ ).)*$") & \
390
+ # df[col1].str.lower().str.contains(r"\d+(?:[a-z]|[A-Z])(?!.*\d+)") & \
391
+ # ~df[col1].str.lower().str.contains(r"\bflat\b \w+|\bflats\b \w+|\bapartment\b \w+|\bapartments\b \w+|\broom\b \w+|\brooms\b \w+"))
392
+
393
+ # df['extract_letter'] = df.loc[df['selected_rows'], col1].str.extract(r"\d+([a-z]|[A-Z])")
394
+ # df['extract_number'] = df.loc[df['selected_rows'], col1].str.extract(r"(\d+)[a-z]|[A-Z]")
395
+
396
+ # df['letter_after_number'] = "flat " + df['extract_letter'] + " " + df['extract_number'] + " " + \
397
+ # df.loc[df['selected_rows'], col1].str.replace(r"\bflat\b", "", regex=True).str.replace(r"\d+([a-z]|[A-Z])", "", regex=True).map(str)
398
+
399
+ # df["new_col"] = df[col1].copy()
400
+ # df.loc[df['selected_rows'], "new_col"] = df['letter_after_number']
401
+
402
+ # return df['new_col']
403
+
404
+ def replace_floor_flat(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
405
+ ''' In references to basement, ground floor, first floor, second floor, and top floor
406
+ flats, this function moves the word 'flat' to the front of the address. This is so that the
407
+ following word (e.g. basement, ground floor) is recognised as the flat number in the 'extract_flat_and_other_no' function
408
+ '''
409
+
410
+ df['letter_after_number'] = extract_letter_one_number_address(df, col1)
411
+
412
+
413
+ df['basement'] = "flat basement" + df[df[col1].str.lower().str.contains(r"basement"\
414
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bbasement\b","", regex=True).map(str)
415
+
416
+
417
+ df['ground_floor'] = "flat a " + df[df[col1].str.lower().str.contains(r"\bground floor\b"\
418
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bground floor\b","", regex=True).map(str)
419
+
420
+ df['first_floor'] = "flat b " + df[df[col1].str.lower().str.contains(r"\bfirst floor\b"\
421
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bfirst floor\b","", regex=True).map(str)
422
+
423
+ df['ground_and_first_floor'] = "flat ab " + df[df[col1].str.lower().str.contains(r"\bground and first floor\b"\
424
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bground and first floor\b","", regex=True).map(str)
425
+
426
+ df['basement_ground_and_first_floor'] = "flat basementab " + df[df[col1].str.lower().str.contains(r"\bbasement and ground and first floors\b"\
427
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bbasement and ground and first floors\b","", regex=True).map(str)
428
+
429
+ df['basement_ground_and_first_floor2'] = "flat basementab " + df[df[col1].str.lower().str.contains(r"\bbasement ground and first floors\b"\
430
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bbasement ground and first floors\b","", regex=True).map(str)
431
+
432
+ df['second_floor'] = "flat c " + df[df[col1].str.lower().str.contains(r"\bsecond floor\b"\
433
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bsecond floor\b","", regex=True).map(str)
434
+
435
+ df['first_and_second_floor'] = "flat bc " + df[df[col1].str.lower().str.contains(r"\bfirst and second floor\b"\
436
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bfirst and second floor\b","", regex=True).map(str)
437
+
438
+ df['first1_floor'] = "flat b " + df[df[col1].str.lower().str.contains(r"\b1st floor\b"\
439
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\b1st floor\b","", regex=True).map(str)
440
+
441
+ df['second2_floor'] = "flat c " + df[df[col1].str.lower().str.contains(r"\b2nd floor\b"\
442
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\b2nd floor\b","", regex=True).map(str)
443
+
444
+ df['ground_first_second_floor'] = "flat abc " + df[df[col1].str.lower().str.contains(r"\bground and first and second floor\b"\
445
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bground and first and second floor\b","", regex=True).map(str)
446
+
447
+ df['third_floor'] = "flat d " + df[df[col1].str.lower().str.contains(r"\bthird floor\b"\
448
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\bthird floor\b","", regex=True).map(str)
449
+
450
+ df['third3_floor'] = "flat d " + df[df[col1].str.lower().str.contains(r"\b3rd floor\b"\
451
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\b3rd floor\b","", regex=True).map(str)
452
+
453
+ df['top_floor'] = "flat top " + df[df[col1].str.lower().str.contains(r"\btop floor\b"\
454
+ )][col1].str.replace(r"\bflat\b","", regex=True).str.replace(r"\btop floor\b","", regex=True).map(str)
455
+
456
+ #merge_columns(df, "new_col", col1, 'letter_after_number')
457
+ df["new_col"] = merge_series(df[col1], df['letter_after_number'])
458
+ df["new_col"] = merge_series(df["new_col"], df['basement'])
459
+ df["new_col"] = merge_series(df["new_col"], df['ground_floor'])
460
+ df["new_col"] = merge_series(df["new_col"], df['first_floor'])
461
+ df["new_col"] = merge_series(df["new_col"], df['first1_floor'])
462
+ df["new_col"] = merge_series(df["new_col"], df['ground_and_first_floor'])
463
+ df["new_col"] = merge_series(df["new_col"], df['basement_ground_and_first_floor'])
464
+ df["new_col"] = merge_series(df["new_col"], df['basement_ground_and_first_floor2'])
465
+ df["new_col"] = merge_series(df["new_col"], df['second_floor'])
466
+ df["new_col"] = merge_series(df["new_col"], df['second2_floor'])
467
+ df["new_col"] = merge_series(df["new_col"], df['first_and_second_floor'])
468
+ df["new_col"] = merge_series(df["new_col"], df['ground_first_second_floor'])
469
+ df["new_col"] = merge_series(df["new_col"], df['third_floor'])
470
+ df["new_col"] = merge_series(df["new_col"], df['third3_floor'])
471
+ df["new_col"] = merge_series(df["new_col"], df['top_floor'])
472
+
473
+ return df['new_col']
474
+
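+ # Illustrative behaviour (hypothetical inputs): "ground floor flat 5 oak road" is expected to become
+ # roughly "flat a 5 oak road" and "second floor flat 7 elm road" roughly "flat c 7 elm road"
+ # (modulo extra internal whitespace).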
475
+ # def replace_floor_flat(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
476
+ # '''Moves the word 'flat' to the front of addresses with floor references like basement, ground floor, etc.'''
477
+
478
+ # floor_mapping = {
479
+ # 'basement': 'basement',
480
+ # 'ground floor': 'a',
481
+ # 'first floor': 'b',
482
+ # 'ground and first floor': 'ab',
483
+ # 'basement ground and first floors': 'basementab',
484
+ # 'second floor': 'c',
485
+ # 'first and second floor': 'bc',
486
+ # '1st floor': 'b',
487
+ # '2nd floor': 'c',
488
+ # 'ground and first and second floor': 'abc',
489
+ # 'third floor': 'd',
490
+ # '3rd floor': 'd',
491
+ # 'top floor': 'top'
492
+ # }
493
+
494
+ # for key, value in floor_mapping.items():
495
+ # df[key] = f"flat {value} " + df[df[col1].str.lower().str.contains(fr"\b{key}\b")][col1].str.replace(r"\bflat\b", "", regex=True).str.replace(fr"\b{key}\b", "", regex=True).map(str)
496
+
497
+ # df["new_col"] = df[col1].copy()
498
+
499
+ # for key in floor_mapping.keys():
500
+ # df["new_col"] = merge_series(df["new_col"], df[key])
501
+
502
+ # return df["new_col"]
503
+
504
+
505
+ def remove_non_housing(df:PandasDataFrame, col1:PandasSeries) -> PandasDataFrame:
506
+ '''
507
+ Remove items from the housing list that are not housing. Includes addresses including
508
+ the text 'parking', 'garage', 'store', 'visitor bay', 'visitors room', and 'bike rack',
509
+ 'yard', 'workshop'
510
+ '''
511
+ df_copy = df.copy()[~df[col1].str.lower().str.contains(\
512
+ r"parking|garage|\bstore\b|\bstores\b|\bvisitor bay\b|visitors room|\bbike rack\b|\byard\b|\bworkshop\b")]
513
+
514
+ return df_copy
515
+
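+ # For example (hypothetical input): a row with the address "garage 5 smith street" would be dropped
+ # from the returned dataframe, while ordinary residential addresses are kept.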
516
+ def extract_prop_no(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
517
+ '''
518
+ Extract property number from an address. Remove flat/apartment/room numbers,
519
+ then extract the last number/number + letter in the string.
520
+ '''
521
+ try:
522
+ prop_no = df[col1].str.replace(r"(^\bapartment\b \w+)|(^\bapartments\b \w+)", "", regex=True\
523
+ ).str.replace(r"(^\bflat\b \w+)|(^\bflats\b \w+)", "", regex=True\
524
+ ).str.replace(r"(^\broom\b \w+)|(^\brooms\b \w+)", "", regex=True\
525
+ ).str.replace(",", "", regex=True\
526
+ ).str.extract(r"(\d+\w+|\d+)(?!.*\d+)") #"(\d+\w+|\d+)(?!.*\d+)"
527
+ except Exception:
528
+ prop_no = np.nan
529
+
530
+ return prop_no
531
+
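+ # Illustrative behaviour (hypothetical input): for "flat 3 victoria house 10 high street" the leading
+ # "flat 3" is stripped first, so the extracted property number is expected to be "10".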
532
+ def extract_room_no(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
533
+ '''
534
+ Extract room number from an address. Find rows where the address contains 'room', then extract
535
+ the next word after 'room' in the string.
536
+ '''
537
+ try:
538
+ room_no = df[df[col1].str.lower().str.contains(r"\broom\b|\brooms\b",regex=True\
539
+ )][col1].str.replace("no.","", regex=False).str.extract(r'(?i)\brooms?\b\.?\s*(\w+)'\
540
+ ).rename(columns = {0:"room_number"})
541
+ except Exception:
542
+ room_no = np.nan
543
+
544
+ return room_no
545
+
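+ # Illustrative behaviour (hypothetical input): for "room 5, 2 college road" the word after 'room'
+ # is expected to be extracted, giving a room_number of "5".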
546
+ def extract_flat_and_other_no(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
547
+ '''
548
+ Extract flat number from an address.
549
+ It looks for a letter after a property number (only if there are no more numbers in the string),
550
+ the word following 'flat' or 'apartment', or,
551
+ when there are two numbers in the address, the characters of the first word containing a digit
552
+ (the pattern ^\d+([a-z]|[A-Z]) covers the first of these cases).
553
+ '''
554
+
555
+ # the regex essentially matches strings that satisfy any of the following conditions:
556
+
557
+ # Start with a number followed by a single letter (either lowercase or uppercase) and not followed by any other number.
558
+ # Contain the word "flat" or "apartment".
559
+ # Start with a number, followed by any characters that are not alphanumeric (denoted by [^a-zA-Z0-9_]), and then another number.
560
+
561
+ replaced_series = df[df[col1].str.lower().str.replace(r"^\bflats\b","flat", regex=True).\
562
+ str.contains(\
563
+ r"^\d+([a-z]|[A-Z])(?!.*\d+)|^([a-z] |[A-Z] )(?!.*\d+)|\bflat\b|\bapartment\b|(\d+.*?)[^a-zA-Z0-9_].*?\d+")][col1].str.replace("no.","", regex=True)
564
+
565
+ extracted_series = replaced_series.str.extract(r'^\d+([a-z]|[A-Z])(?!.*\d+)')[0]
566
+
567
+ extracted_series = extracted_series[~extracted_series.index.duplicated()]
568
+ df = df[~df.index.duplicated(keep='first')]
569
+
570
+ df["prop_number"] = extracted_series
571
+
572
+ extracted_series = replaced_series.str.extract(r'(?i)(?:flat|flats) (\w+)')
573
+ if 1 in extracted_series.columns:
574
+ df["flat_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
575
+ else:
576
+ df["flat_number"] = extracted_series[0]
577
+
578
+ extracted_series = replaced_series.str.extract(r'(?i)(?:apartment|apartments) (\w+)')
579
+ if 1 in extracted_series.columns:
580
+ df["apart_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
581
+ else:
582
+ df["apart_number"] = extracted_series[0]
583
+
584
+ df["first_sec_number"] = replaced_series.str.extract(r'(\d+.*?)[^a-zA-Z0-9_].*?\d+')
585
+ df["first_letter_flat_number"] = replaced_series.str.extract(r'\b([A-Za-z])\b[^\d]* \d')
586
+ df["first_letter_no_more_numbers"] = replaced_series.str.extract(r'^([a-z] |[A-Z] )(?!.*\d+)')
587
+
588
+ return df
589
+
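+ # Illustrative behaviour (hypothetical inputs): "flat 4 rosemary court" is expected to give flat_number "4",
+ # while "10b cherry lane" is expected to give prop_number "b" (the letter after the house number).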
590
+ def extract_house_or_court_name(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
591
+ '''
592
+ Extract house or court name. Extended to include estate, buildings, and mansions
593
+ '''
594
+ extracted_series = df[col1].str.extract(r"(\w+)\s+(house|court|estate|buildings|mansions)")
595
+ if 1 in extracted_series.columns:
596
+ df["house_court_name"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
597
+ else:
598
+ df["house_court_name"] = extracted_series[0]
599
+
600
+ return df["house_court_name"]
601
+
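+ # For example (hypothetical input): "flat 2 jasmine house high street" is expected to give a
+ # house_court_name of "jasmine".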
602
+ def extract_block_and_unit_name(df:PandasDataFrame, col1:PandasSeries) -> PandasSeries:
603
+ '''
604
+ Extract block and unit numbers from an address, e.g. the 'b' in 'block b' or the '7' in 'unit 7'
605
+ '''
606
+
607
+ extracted_series = df[col1].str.extract(r'(?i)(?:block|blocks) (\w+)')
608
+ if 1 in extracted_series.columns:
609
+ df["block_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
610
+ else:
611
+ df["block_number"] = extracted_series[0]
612
+
613
+ extracted_series = df[col1].str.extract(r'(?i)(?:unit|units) (\w+)')
614
+ if 1 in extracted_series.columns:
615
+ df["unit_number"] = extracted_series[0].fillna(extracted_series[1]).infer_objects(copy=False)
616
+ else:
617
+ df["unit_number"] = extracted_series[0]
618
+
619
+ return df
620
+
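+ # Illustrative behaviour (hypothetical inputs): "block b somerset estate" is expected to give block_number "b",
+ # and "unit 7 riverside works" unit_number "7".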
621
+ def extract_postcode(df:PandasDataFrame, col:str) -> PandasSeries:
622
+ '''
623
+ Extract a postcode from a string column in a dataframe
624
+ '''
625
+ postcode_series = df[col].str.upper().str.extract(pat = \
626
+ "(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$)")
627
+
628
+ return postcode_series
629
+
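+ # Illustrative behaviour (hypothetical input): for "10 oak road london se1 2ab" the address is upper-cased
+ # and the first capture group is expected to contain "SE1 2AB".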
630
+ def remove_postcode(df:PandasDataFrame, col:str) -> PandasSeries:
631
+ '''
632
+ Remove a postcode from a string column in a dataframe
633
+ '''
634
+
635
+
636
+ address_series_no_pcode = df[col].str.upper().str.replace(\
637
+ "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","",\
638
+ regex=True
639
+ ).str.lower()
640
+
641
+ return address_series_no_pcode
642
+
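+ # For example (hypothetical input): "1 Ash Park Road SE54 3HB" is expected to come back as roughly
+ # "1 ash park road" (lower-cased, postcode stripped; any trailing space is removed later).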
643
+ # Remove addresses with no numbers in at all - too high a risk of badly assigning an address
644
+ def check_no_number_addresses(df:PandasDataFrame, in_address_series:PandasSeries) -> PandasSeries:
645
+ '''
646
+ Highlight addresses from a pandas df where there are no numbers in the address.
647
+ '''
648
+ df["in_address_series_temp"] = df[in_address_series].str.lower()
649
+
650
+ no_numbers_series = df["in_address_series_temp"].str.contains(r"^(?!.*\d+).*$", regex=True)
651
+
652
+ df.loc[no_numbers_series == True, 'Excluded from search'] = "Excluded - no numbers in address"
653
+
654
+ df = df.drop("in_address_series_temp", axis = 1)
655
+
656
+ print(df[["full_address", "Excluded from search"]])
657
+
658
+ return df
659
+
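+ # Illustrative behaviour (hypothetical input): "rose cottage high street" contains no numbers, so its
+ # 'Excluded from search' value is expected to be set to "Excluded - no numbers in address".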
660
+ # Exclude non-postal addresses
661
+ def remove_non_postal(df, in_address_series):
662
+ '''
663
+ Highlight non-postal addresses in a pandas df where the given string series contains specific substrings
664
+ indicating non-postal addresses like 'garage', 'parking', 'shed', etc.
665
+ '''
666
+ df["in_address_series_temp"] = df[in_address_series].str.lower()
667
+
668
+ garage_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bgarage\\b|\\bgarages\\b)", regex=True)
669
+ parking_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bparking\\b)", regex=True)
670
+ shed_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bshed\\b|\\bsheds\\b)", regex=True)
671
+ bike_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbike\\b|\\bbikes\\b)", regex=True)
672
+ bicycle_store_address_series = df["in_address_series_temp"].str.contains("(?i)(?:\\bbicycle store\\b|\\bbicycle stores\\b)", regex=True)
673
+
674
+ non_postal_series = (garage_address_series | parking_address_series | shed_address_series | bike_address_series | bicycle_store_address_series)
675
+
676
+ df.loc[non_postal_series == True, 'Excluded from search'] = "Excluded - non-postal address"
677
+
678
+ df = df.drop("in_address_series_temp", axis = 1)
679
+
680
+ return df
681
+
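+ # For example (hypothetical input): "garage 5 smith street" is expected to be flagged with
+ # 'Excluded from search' = "Excluded - non-postal address".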
682
+ def replace_mistaken_dates(df:PandasDataFrame, col:str) -> PandasSeries:
683
+ '''
684
+ Identify addresses that mistakenly have dates in them and replace these dates with number values
685
+ '''
686
+ # Regex pattern to identify the date-month format
687
+ pattern = r'(\d{2})-(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
688
+
689
+ # Dictionary to map month abbreviations to numbers
690
+ month_map = {
691
+ 'jan': '1', 'feb': '2', 'mar': '3', 'apr': '4', 'may': '5', 'jun': '6',
692
+ 'jul': '7', 'aug': '8', 'sep': '9', 'oct': '10', 'nov': '11', 'dec': '12'
693
+ }
694
+
695
+ # Custom replacement function
696
+ def replace_month(match):
697
+ day = match.group(1).lstrip('0') # Get the day and remove leading zeros
698
+ month = month_map[match.group(2)] # Convert month abbreviation to number
699
+ return f"{day}-{month}"
700
+
701
+ # Apply the regex replacement
702
+ corrected_addresses = df[col].str.replace(pattern, replace_month, regex = True)
703
+
704
+ return corrected_addresses
705
+
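+ # Illustrative behaviour (hypothetical inputs): "12-jan maple road" is expected to become "12-1 maple road",
+ # and "02-feb acacia avenue" to become "2-2 acacia avenue".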
706
+ def merge_series(full_series: pd.Series, partially_filled_series: pd.Series) -> pd.Series:
707
+ '''
708
+ Merge two series. 'full_series' is the series whose values you want to replace;
709
+ 'partially_filled_series' is the replacer series.
710
+ '''
711
+ replacer_series_is_null = partially_filled_series.isnull()
712
+
713
+ # Start with full_series values
714
+ merged_series = full_series.copy()
715
+
716
+ # Replace values in merged_series where partially_filled_series is not null
717
+ merged_series[~replacer_series_is_null] = partially_filled_series.dropna()
718
+
719
+ return merged_series
720
+
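+ # Illustrative behaviour (hypothetical values): merging full ["a", "b", "c"] with partial [NaN, "x", NaN]
+ # (matching indexes) is expected to return ["a", "x", "c"].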
721
+ def clean_cols(col:str) -> str:
722
+ return col.lower().strip().replace(" ", "_")
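+ # For example (hypothetical input): clean_cols("Property Address ") is expected to return "property_address".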