Update patent_downloader.py
Browse files- patent_downloader.py +46 -85
patent_downloader.py
CHANGED
@@ -3,63 +3,42 @@ import os
|
|
3 |
import requests
|
4 |
import re
|
5 |
import time
|
6 |
-
import shutil
|
7 |
-
import subprocess
|
8 |
import pandas as pd
|
9 |
from urllib.parse import urljoin
|
10 |
import tempfile
|
11 |
-
from PyPDF2 import PdfReader
|
12 |
from selenium import webdriver
|
13 |
-
from selenium.webdriver.common.keys import Keys
|
14 |
from selenium.webdriver.chrome.service import Service
|
15 |
from selenium.webdriver.chrome.options import Options
|
16 |
-
from
|
17 |
-
from selenium.webdriver.support.ui import WebDriverWait
|
18 |
-
from selenium.webdriver.support import expected_conditions as EC
|
19 |
import chromedriver_autoinstaller
|
20 |
-
from bs4 import BeautifulSoup
|
21 |
-
|
22 |
|
23 |
|
24 |
class PatentDownloader:
|
25 |
url = "https://patents.google.com"
|
26 |
|
27 |
-
def __init__(self, verbose: bool =
|
28 |
"""
|
29 |
-
Initialize the Patent Downloader.
|
30 |
-
Parameters:
|
31 |
-
----------
|
32 |
-
verbose : bool
|
33 |
-
Print additional debug information.
|
34 |
"""
|
35 |
self.verbose = verbose
|
36 |
self.chrome_path = self.install_chrome()
|
37 |
|
38 |
def install_chrome(self) -> str:
|
39 |
"""
|
40 |
-
|
41 |
"""
|
42 |
chrome_path = "/usr/bin/google-chrome"
|
43 |
-
if not
|
44 |
-
print("
|
45 |
-
|
46 |
-
|
47 |
-
shell=True, check=True,
|
48 |
-
)
|
49 |
-
subprocess.run(
|
50 |
-
"apt-get update && apt-get install -y ./chrome.deb",
|
51 |
-
shell=True, check=True,
|
52 |
-
)
|
53 |
os.remove("chrome.deb")
|
54 |
-
|
55 |
-
if not shutil.which("google-chrome"):
|
56 |
-
raise ValueError("Google Chrome installation failed!")
|
57 |
return chrome_path
|
58 |
|
59 |
-
def download(self, patent: Union[str, List[str]], output_path: str = None,
|
60 |
-
waiting_time: int = 10) -> None:
|
61 |
"""
|
62 |
-
|
63 |
"""
|
64 |
if not output_path:
|
65 |
output_path = tempfile.gettempdir()
|
@@ -76,56 +55,41 @@ class PatentDownloader:
|
|
76 |
"""
|
77 |
chromedriver_autoinstaller.install()
|
78 |
|
79 |
-
# Set up Chrome options
|
80 |
chrome_options = Options()
|
81 |
chrome_options.binary_location = self.chrome_path
|
82 |
chrome_options.add_argument("--headless")
|
83 |
chrome_options.add_argument("--no-sandbox")
|
84 |
chrome_options.add_argument("--disable-dev-shm-usage")
|
85 |
|
86 |
-
# Initialize WebDriver
|
87 |
service = Service()
|
88 |
driver = webdriver.Chrome(service=service, options=chrome_options)
|
89 |
|
90 |
file_path = os.path.join(output_path, f"{patent}.pdf")
|
91 |
-
pdf_link = None
|
92 |
|
93 |
try:
|
94 |
-
print(f"Navigating to Google Patents...")
|
95 |
driver.get(self.url)
|
96 |
|
97 |
-
print("Entering patent number...")
|
98 |
search_input_xpath = "//input[@aria-label='Search patents']"
|
99 |
-
WebDriverWait(driver, 20).until(
|
100 |
-
search_input = driver.find_element(
|
101 |
search_input.send_keys(patent)
|
102 |
-
search_input.send_keys(
|
103 |
-
|
104 |
-
print("Waiting for the 'Download PDF' button...")
|
105 |
-
time.sleep(waiting_time) # Allow full load
|
106 |
-
page_source = driver.page_source
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
f.write(page_source)
|
111 |
-
print("Page source saved as 'page_source.html' for debugging.")
|
112 |
|
113 |
-
|
114 |
-
|
115 |
-
# First, look for the 'Download PDF' link explicitly
|
116 |
pdf_link = self.get_pdf_link(soup)
|
117 |
-
|
118 |
if not pdf_link:
|
119 |
-
raise FileNotFoundError("Could not find a valid PDF link.")
|
120 |
-
|
121 |
-
print(f"PDF link extracted: {pdf_link}")
|
122 |
|
123 |
# Download and validate the PDF
|
124 |
self.download_and_validate_pdf(pdf_link, file_path)
|
125 |
|
126 |
except Exception as e:
|
127 |
print(f"Error: {e}")
|
128 |
-
raise FileNotFoundError("
|
129 |
finally:
|
130 |
driver.quit()
|
131 |
|
@@ -134,9 +98,9 @@ class PatentDownloader:
|
|
134 |
Download multiple patent PDFs.
|
135 |
"""
|
136 |
if isinstance(patents, str):
|
137 |
-
if patents.
|
138 |
-
patents = pd.read_csv(patents)['patent_number'].
|
139 |
-
elif patents.
|
140 |
with open(patents, 'r') as file:
|
141 |
patents = file.read().splitlines()
|
142 |
else:
|
@@ -149,45 +113,42 @@ class PatentDownloader:
|
|
149 |
@staticmethod
|
150 |
def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
|
151 |
"""
|
152 |
-
Extract the PDF link from the
|
153 |
"""
|
154 |
-
# Search
|
155 |
-
|
156 |
-
if download_button and download_button.get("href"):
|
157 |
-
return download_button["href"]
|
158 |
-
|
159 |
-
# Fallback: Find any links containing 'patentimages.storage.googleapis.com'
|
160 |
-
pdf_links = [link['href'] for link in soup.find_all('a', href=True)
|
161 |
-
if 'patentimages.storage.googleapis.com' in link['href']]
|
162 |
if pdf_links:
|
163 |
-
print(f"
|
164 |
return pdf_links[0]
|
165 |
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
return None
|
168 |
|
169 |
def download_and_validate_pdf(self, pdf_link: str, file_path: str):
|
170 |
"""
|
171 |
-
Download
|
172 |
"""
|
173 |
-
print("Downloading PDF...")
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
self.validate_pdf(file_path)
|
184 |
-
except Exception as e:
|
185 |
-
raise FileNotFoundError(f"Failed to download or validate PDF: {e}")
|
186 |
|
187 |
@staticmethod
|
188 |
def validate_pdf(file_path):
|
189 |
"""
|
190 |
-
Validate
|
191 |
"""
|
192 |
try:
|
193 |
with open(file_path, "rb") as f:
|
|
|
3 |
import requests
|
4 |
import re
|
5 |
import time
|
|
|
|
|
6 |
import pandas as pd
|
7 |
from urllib.parse import urljoin
|
8 |
import tempfile
|
9 |
+
from PyPDF2 import PdfReader # PDF validation
|
10 |
from selenium import webdriver
|
|
|
11 |
from selenium.webdriver.chrome.service import Service
|
12 |
from selenium.webdriver.chrome.options import Options
|
13 |
+
from bs4 import BeautifulSoup
|
|
|
|
|
14 |
import chromedriver_autoinstaller
|
|
|
|
|
15 |
|
16 |
|
17 |
class PatentDownloader:
|
18 |
url = "https://patents.google.com"
|
19 |
|
20 |
+
def __init__(self, verbose: bool = False):
    """Create a downloader backed by a headless Chrome installation.

    Parameters
    ----------
    verbose : bool
        When True, emit extra diagnostic output during downloads.
    """
    # Remember the verbosity flag, then make sure Chrome is present
    # (install_chrome downloads the browser if it is missing).
    self.verbose = verbose
    self.chrome_path = self.install_chrome()
|
26 |
|
27 |
def install_chrome(self) -> str:
    """Ensure a Google Chrome binary exists and return its path.

    If ``/usr/bin/google-chrome`` is missing, download the stable .deb
    package and install it via apt-get (Debian/Ubuntu only; requires
    root privileges).

    Returns
    -------
    str
        Path to the Chrome binary.

    Raises
    ------
    ValueError
        If the download or installation fails.
    """
    chrome_path = "/usr/bin/google-chrome"
    if not os.path.exists(chrome_path):
        print("Installing Google Chrome...")
        # Check each shell step's exit status instead of ignoring it:
        # a failed download would otherwise silently fall through.
        status = os.system(
            "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -O chrome.deb"
        )
        if status != 0:
            raise ValueError("Failed to download the Google Chrome package!")
        status = os.system("apt-get update && apt-get install -y ./chrome.deb")
        if status != 0:
            raise ValueError("Google Chrome installation failed!")
        # Clean up the installer only after a successful install.
        os.remove("chrome.deb")
    # Verify the binary actually exists before reporting success.
    if not os.path.exists(chrome_path):
        raise ValueError("Google Chrome installation failed!")
    return chrome_path
|
38 |
|
39 |
+
def download(self, patent: Union[str, List[str]], output_path: str = None, waiting_time: int = 10) -> None:
|
|
|
40 |
"""
|
41 |
+
Main entry to download one or multiple patents.
|
42 |
"""
|
43 |
if not output_path:
|
44 |
output_path = tempfile.gettempdir()
|
|
|
55 |
"""
|
56 |
chromedriver_autoinstaller.install()
|
57 |
|
|
|
58 |
chrome_options = Options()
|
59 |
chrome_options.binary_location = self.chrome_path
|
60 |
chrome_options.add_argument("--headless")
|
61 |
chrome_options.add_argument("--no-sandbox")
|
62 |
chrome_options.add_argument("--disable-dev-shm-usage")
|
63 |
|
|
|
64 |
service = Service()
|
65 |
driver = webdriver.Chrome(service=service, options=chrome_options)
|
66 |
|
67 |
file_path = os.path.join(output_path, f"{patent}.pdf")
|
|
|
68 |
|
69 |
try:
|
70 |
+
print(f"Navigating to Google Patents for patent: {patent}...")
|
71 |
driver.get(self.url)
|
72 |
|
|
|
73 |
search_input_xpath = "//input[@aria-label='Search patents']"
|
74 |
+
WebDriverWait(driver, 20).until(lambda d: d.find_element("xpath", search_input_xpath))
|
75 |
+
search_input = driver.find_element("xpath", search_input_xpath)
|
76 |
search_input.send_keys(patent)
|
77 |
+
search_input.send_keys("\n")
|
|
|
|
|
|
|
|
|
78 |
|
79 |
+
time.sleep(waiting_time)
|
80 |
+
soup = BeautifulSoup(driver.page_source, "html.parser")
|
|
|
|
|
81 |
|
82 |
+
# Search for the PDF link
|
|
|
|
|
83 |
pdf_link = self.get_pdf_link(soup)
|
|
|
84 |
if not pdf_link:
|
85 |
+
raise FileNotFoundError(f"Could not find a valid PDF link for patent: {patent}.")
|
|
|
|
|
86 |
|
87 |
# Download and validate the PDF
|
88 |
self.download_and_validate_pdf(pdf_link, file_path)
|
89 |
|
90 |
except Exception as e:
|
91 |
print(f"Error: {e}")
|
92 |
+
raise FileNotFoundError(f"Failed to process patent: {patent}")
|
93 |
finally:
|
94 |
driver.quit()
|
95 |
|
|
|
98 |
Download multiple patent PDFs.
|
99 |
"""
|
100 |
if isinstance(patents, str):
|
101 |
+
if patents.endswith('.csv'):
|
102 |
+
patents = pd.read_csv(patents)['patent_number'].tolist()
|
103 |
+
elif patents.endswith('.txt'):
|
104 |
with open(patents, 'r') as file:
|
105 |
patents = file.read().splitlines()
|
106 |
else:
|
|
|
113 |
@staticmethod
def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
    """Extract a patent-PDF URL from a parsed Google Patents page.

    Looks first for anchors pointing directly at the patentimages
    storage bucket, then falls back to an anchor labelled
    "Download PDF".  Returns the URL string, or None when neither
    pattern matches.
    """
    # Primary pattern: any anchor whose href targets the PDF store.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if 'patentimages.storage.googleapis.com' in href:
            print(f"Direct PDF link found: {href}")
            return href

    # Fallback pattern: an explicit "Download PDF" button or text link.
    button = soup.find('a', string=re.compile("Download PDF", re.IGNORECASE))
    if button and 'href' in button.attrs:
        print(f"PDF link found via Download button: {button['href']}")
        return button['href']

    print("No valid PDF link found in the page HTML.")
    return None
|
132 |
|
133 |
def download_and_validate_pdf(self, pdf_link: str, file_path: str):
    """Stream the PDF at *pdf_link* into *file_path*, then validate it.

    Parameters
    ----------
    pdf_link : str
        Direct URL of the patent PDF.
    file_path : str
        Destination path for the downloaded file.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    requests.Timeout
        If the server stops responding.
    """
    print(f"Downloading PDF from {pdf_link}...")
    # A timeout prevents a dead link from hanging the whole run forever.
    response = requests.get(pdf_link, stream=True, timeout=60)
    response.raise_for_status()

    # Stream in chunks so large PDFs are never held fully in memory.
    with open(file_path, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)

    print(f"PDF downloaded to: {file_path}")
    self.validate_pdf(file_path)
|
|
|
|
|
|
|
147 |
|
148 |
@staticmethod
|
149 |
def validate_pdf(file_path):
|
150 |
"""
|
151 |
+
Validate if the file is a readable PDF with at least one page.
|
152 |
"""
|
153 |
try:
|
154 |
with open(file_path, "rb") as f:
|