DrishtiSharma committed on
Commit
eb569a5
·
verified ·
1 Parent(s): 843ceeb

Upload preprocess_data.py

Browse files
Files changed (1) hide show
  1. patentwiz/preprocess_data.py +260 -0
patentwiz/preprocess_data.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import zipfile
4
+ import xml.etree.ElementTree as ET
5
+ import pickle
6
+
7
+
8
def download_weekly_patents(year, month, day, logging):
    """
    Download and extract a weekly USPTO patent-application ZIP for a given date.

    Parameters:
        year (int): The year of the patent release (e.g. 2023).
        month (int): The month of the patent release.
        day (int): The day of the patent release.
        logging (bool): When True, print progress messages.

    Returns:
        bool: True if the download succeeded (or the data is already present),
        False otherwise.
    """

    # Check if the "data" folder exists and create one if it doesn't.
    # exist_ok avoids a race between the existence check and the mkdir call.
    data_folder = os.path.join(os.getcwd(), "data")
    if not os.path.exists(data_folder):
        if logging:
            print("Data folder not found. Creating a new 'data' folder.")
        os.makedirs(data_folder, exist_ok=True)

    directory = os.path.join(
        os.getcwd(), "data", "ipa" + str(year)[2:] + f"{month:02d}" + f"{day:02d}"
    )

    # The per-date directory is created by the extraction step; if it exists,
    # this week's data has already been fetched.
    if os.path.exists(directory):
        print(f"File {directory} already exists. Skipping download.")
        return True

    if logging:
        print("Building the URL...")
    base_url = "https://bulkdata.uspto.gov/data/patent/application/redbook/fulltext"
    file_url = (
        base_url
        + "/"
        + str(year)
        + "/ipa"
        + str(year)[2:]
        + f"{month:02d}"
        + f"{day:02d}"
        + ".zip"
    )

    if logging:
        print(f"URL constructed: {file_url}")

    # Stream the download so the whole ZIP is never held in memory.  The
    # timeout prevents an unresponsive server from hanging forever, and the
    # context manager guarantees the connection is released (bug fix: the
    # original never closed the streamed response).
    with requests.get(file_url, stream=True, timeout=60) as r:
        if logging:
            print("Requesting the file...")
        if r.status_code != 200:
            print(
                "File could not be downloaded. Please make sure the year, month, and day are correct."
            )
            return False

        if logging:
            print("File retrieved successfully. Starting download...")
        local_path = os.path.join(os.getcwd(), "data", "patents.zip")

        with open(local_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)

    if logging:
        print("File downloaded successfully. Starting extraction...")
    with zipfile.ZipFile(local_path, "r") as zip_ref:
        zip_ref.extractall(os.path.join(os.getcwd(), "data"))

    if logging:
        print("File extracted successfully.")
    # Deleting the ZIP file after extraction
    os.remove(local_path)
    if logging:
        print(f"ZIP file {local_path} deleted after extraction.")

    return True
84
+
85
+
86
def extract_patents(year, month, day, logging):
    """
    Split a weekly USPTO XML dump into individual patent text files.

    Reads the weekly patent file in XML format, splits it into individual
    patent documents, parses each one, and saves the description of every
    patent whose IPCR classification includes section 'C' as a separate
    .txt file in a date-named directory under 'data'.

    Parameters:
        year (int): The year of the patent file to process.
        month (int): The month of the patent file to process.
        day (int): The day of the patent file to process.
        logging (bool): When True, print progress messages.

    Returns:
        list: Names of the saved patent text files.  If the date directory
        already exists, the previously pickled list is loaded and returned
        instead of re-extracting.
    """

    directory = os.path.join(
        os.getcwd(), "data", "ipa" + str(year)[2:] + f"{month:02d}" + f"{day:02d}"
    )
    saved_patent_names_path = os.path.join(directory, 'saved_patent_names.pkl')

    if os.path.exists(directory):
        print(f"File {directory} already exists. Skipping extract.")

        # Load saved_patent_names from file
        with open(saved_patent_names_path, 'rb') as f:
            saved_patent_names = pickle.load(f)

        return saved_patent_names
    else:
        os.mkdir(directory)

    if logging:
        print("Locating the patent file...")
    file_path = os.path.join(
        os.getcwd(),
        "data",
        "ipa" + str(year)[2:] + f"{month:02d}" + f"{day:02d}" + ".xml",
    )

    if logging:
        print("Reading the patent file...")
    # The dump declares UTF-8 in its XML prolog, so decode it explicitly
    # instead of relying on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        contents = f.read()

    if logging:
        print("Splitting the XMl file into individual XMLs...")
    temp = contents.split('<?xml version="1.0" encoding="UTF-8"?>')
    allXmls = [
        '<?xml version="1.0" encoding="UTF-8"?>' + s.replace("\n", "") for s in temp
    ]

    # saving only the XMLs that contain a patent (identified by their
    # us-patent-application DOCTYPE declaration)
    patents = []
    for xml_string in allXmls:
        start_index = xml_string.find("<!DOCTYPE")
        end_index = xml_string.find(">", start_index)

        if start_index != -1 and end_index != -1:
            doctype_declaration = xml_string[start_index : end_index + 1]
            # Extract only the name of the DOCTYPE
            doctype_name = doctype_declaration.split()[1]
            if doctype_name == "us-patent-application":
                patents.append(xml_string)

    if logging:
        print(f"Total patents found: {len(patents)}")
        print("Writing individual patents to separate txt files...")

    saved_patent_names = []
    for patent in patents:
        # Pre-initialise so the except handler below cannot raise a
        # NameError when parsing fails before the id is extracted
        # (bug fix: patent_id was previously unbound in that case).
        patent_id = None
        try:
            root = ET.fromstring(patent)

            patent_id = root.find(
                ".//publication-reference/document-id/doc-number"
            ).text
            file_id = root.attrib["file"]

            ipcr_classifications = root.findall(".//classification-ipcr")

            if any(ipcr.find("./section").text == "C" for ipcr in ipcr_classifications):
                description_element = root.find(".//description")
                description_text = get_full_text(description_element)
                description_string = " ".join(description_text)

                output_file_path = os.path.join(directory, f"{file_id}.txt")
                with open(output_file_path, "w", encoding="utf-8") as f:
                    f.write(description_string)
                saved_patent_names.append(f"{file_id}.txt")

            elif logging:
                print(
                    f"Patent {patent_id} does not belong to section 'C'. Skipping this patent."
                )
        except ET.ParseError as e:
            print(f"Error while parsing patent: {patent_id}. Skipping this patent.")
            print(f"Error message: {e}")

    # Save saved_patent_names to file
    with open(saved_patent_names_path, 'wb') as f:
        pickle.dump(saved_patent_names, f)

    if logging:
        print("Patent extraction complete.")

    # Deleting the main XML file after extraction
    os.remove(file_path)

    if logging:
        print(f"Main XML file {file_path} deleted after extraction.")
    return saved_patent_names
204
+
205
+
206
def get_full_text(element):
    """
    Collect all stripped text fragments from an XML tree, depth-first.

    Parameters:
        element (xml.etree.ElementTree.Element): Root element to traverse.

    Returns:
        list: Non-empty, whitespace-stripped text fragments in document order,
        taken from each element's text and each child's tail.
    """

    fragments = []

    def _walk(node):
        # An element's own text precedes its children; a child's tail is the
        # text that follows that child within the parent.
        if node.text and node.text.strip():
            fragments.append(node.text.strip())
        for sub in node:
            _walk(sub)
            if sub.tail and sub.tail.strip():
                fragments.append(sub.tail.strip())

    _walk(element)
    return fragments
225
+
226
+
227
def parse_and_save_patents(year, month, day, logging=False):
    """
    Fetch the weekly USPTO release for a date and save its patents as text.

    Downloads the weekly patent file from the USPTO website, extracts the
    individual patents from it, parses each patent's content, and writes the
    extracted information as separate text files under 'data', one file per
    patent (named by its file id).

    Parameters:
        year (int): The year of the weekly patents to download and process.
        month (int): The month of the weekly patents to download and process.
        day (int): The day of the weekly patents to download and process.
        logging (bool): When True, print progress messages.

    Returns:
        list: Names of the saved patent text files (None when the download
        step fails).
    """

    if logging:
        print("### Downloading weekly patent files...")
    # Bail out early if the download step reports failure.
    if not download_weekly_patents(year, month, day, logging):
        print("Failed to download the weekly patents.")
        return

    if logging:
        print("### Extracting individual patents...")
    return extract_patents(year, month, day, logging)