DrishtiSharma committed
Commit c336fbe · verified · Parent: 9aee3b0

Update patent_downloader.py

Files changed (1):
  1. patent_downloader.py +46 -85
patent_downloader.py CHANGED
@@ -3,63 +3,43 @@ import os
 import requests
 import re
 import time
-import shutil
-import subprocess
 import pandas as pd
 from urllib.parse import urljoin
 import tempfile
-from PyPDF2 import PdfReader
+from PyPDF2 import PdfReader  # PDF validation
 from selenium import webdriver
-from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
 import chromedriver_autoinstaller
-from bs4 import BeautifulSoup
-


 class PatentDownloader:
     url = "https://patents.google.com"

-    def __init__(self, verbose: bool = True):
+    def __init__(self, verbose: bool = False):
         """
-        Initialize the Patent Downloader.
-        Parameters:
-        ----------
-        verbose : bool
-            Print additional debug information.
+        Initialize the Patent Downloader with headless Chrome support.
         """
         self.verbose = verbose
         self.chrome_path = self.install_chrome()

     def install_chrome(self) -> str:
         """
-        Download and install Google Chrome dynamically if not already installed.
+        Install Google Chrome dynamically.
         """
         chrome_path = "/usr/bin/google-chrome"
-        if not shutil.which("google-chrome"):
-            print("Downloading and installing Google Chrome...")
-            subprocess.run(
-                "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -O chrome.deb",
-                shell=True, check=True,
-            )
-            subprocess.run(
-                "apt-get update && apt-get install -y ./chrome.deb",
-                shell=True, check=True,
-            )
+        if not os.path.exists(chrome_path):
+            print("Installing Google Chrome...")
+            os.system("wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -O chrome.deb")
+            os.system("apt-get update && apt-get install -y ./chrome.deb")
             os.remove("chrome.deb")
-
-        if not shutil.which("google-chrome"):
-            raise ValueError("Google Chrome installation failed!")
         return chrome_path

-    def download(self, patent: Union[str, List[str]], output_path: str = None,
-                 waiting_time: int = 10) -> None:
+    def download(self, patent: Union[str, List[str]], output_path: str = None, waiting_time: int = 10) -> None:
         """
-        Download one or multiple patent PDFs.
+        Main entry to download one or multiple patents.
         """
         if not output_path:
             output_path = tempfile.gettempdir()
@@ -76,56 +56,41 @@ class PatentDownloader:
         """
         chromedriver_autoinstaller.install()

-        # Set up Chrome options
         chrome_options = Options()
         chrome_options.binary_location = self.chrome_path
         chrome_options.add_argument("--headless")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-dev-shm-usage")

-        # Initialize WebDriver
         service = Service()
         driver = webdriver.Chrome(service=service, options=chrome_options)

         file_path = os.path.join(output_path, f"{patent}.pdf")
-        pdf_link = None

         try:
-            print(f"Navigating to Google Patents...")
+            print(f"Navigating to Google Patents for patent: {patent}...")
             driver.get(self.url)

-            print("Entering patent number...")
             search_input_xpath = "//input[@aria-label='Search patents']"
-            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, search_input_xpath)))
-            search_input = driver.find_element(By.XPATH, search_input_xpath)
+            WebDriverWait(driver, 20).until(lambda d: d.find_element("xpath", search_input_xpath))
+            search_input = driver.find_element("xpath", search_input_xpath)
             search_input.send_keys(patent)
-            search_input.send_keys(Keys.RETURN)
-
-            print("Waiting for the 'Download PDF' button...")
-            time.sleep(waiting_time)  # Allow full load
-            page_source = driver.page_source
-
-            # Log the page source for debugging
-            with open("page_source.html", "w", encoding="utf-8") as f:
-                f.write(page_source)
-            print("Page source saved as 'page_source.html' for debugging.")
-
-            soup = BeautifulSoup(page_source, "html.parser")
-
-            # First, look for the 'Download PDF' link explicitly
+            search_input.send_keys("\n")
+
+            time.sleep(waiting_time)
+            soup = BeautifulSoup(driver.page_source, "html.parser")
+
+            # Search for the PDF link
             pdf_link = self.get_pdf_link(soup)
-
             if not pdf_link:
-                raise FileNotFoundError("Could not find a valid PDF link.")
-
-            print(f"PDF link extracted: {pdf_link}")
+                raise FileNotFoundError(f"Could not find a valid PDF link for patent: {patent}.")

             # Download and validate the PDF
             self.download_and_validate_pdf(pdf_link, file_path)

         except Exception as e:
             print(f"Error: {e}")
-            raise FileNotFoundError("Could not find a valid PDF link.")
+            raise FileNotFoundError(f"Failed to process patent: {patent}")
         finally:
             driver.quit()

@@ -134,9 +99,9 @@ class PatentDownloader:
         Download multiple patent PDFs.
         """
         if isinstance(patents, str):
-            if patents.lower().endswith('csv'):
-                patents = pd.read_csv(patents)['patent_number'].to_list()
-            elif patents.lower().endswith('txt'):
+            if patents.endswith('.csv'):
+                patents = pd.read_csv(patents)['patent_number'].tolist()
+            elif patents.endswith('.txt'):
                 with open(patents, 'r') as file:
                     patents = file.read().splitlines()
             else:
@@ -149,45 +114,42 @@ class PatentDownloader:
     @staticmethod
     def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
         """
-        Extract the PDF link from the parsed HTML.
+        Extract the PDF link using known patterns from the page HTML.
         """
-        # Search explicitly for Download PDF button
-        download_button = soup.find("a", string=re.compile("Download PDF", re.IGNORECASE))
-        if download_button and download_button.get("href"):
-            return download_button["href"]
-
-        # Fallback: Find any links containing 'patentimages.storage.googleapis.com'
-        pdf_links = [link['href'] for link in soup.find_all('a', href=True)
-                     if 'patentimages.storage.googleapis.com' in link['href']]
+        # Search for direct links to patentimages
+        pdf_links = [a['href'] for a in soup.find_all('a', href=True) if 'patentimages.storage.googleapis.com' in a['href']]
         if pdf_links:
-            print(f"Fallback: Found PDF Link: {pdf_links[0]}")
+            print(f"Direct PDF link found: {pdf_links[0]}")
             return pdf_links[0]

-        print("No valid PDF link found in HTML.")
+        # Fallback: Search for any "Download PDF" buttons or text links
+        pdf_button = soup.find('a', string=re.compile("Download PDF", re.IGNORECASE))
+        if pdf_button and 'href' in pdf_button.attrs:
+            print(f"PDF link found via Download button: {pdf_button['href']}")
+            return pdf_button['href']
+
+        print("No valid PDF link found in the page HTML.")
         return None

     def download_and_validate_pdf(self, pdf_link: str, file_path: str):
         """
-        Download the PDF and validate its integrity.
+        Download and validate the PDF file.
         """
-        print("Downloading PDF...")
-        try:
-            response = requests.get(pdf_link, stream=True)
-            response.raise_for_status()
-            with open(file_path, "wb") as file:
-                for chunk in response.iter_content(chunk_size=8192):
-                    file.write(chunk)
-            print(f"PDF downloaded to: {file_path}")
-
-            # Validate PDF
-            self.validate_pdf(file_path)
-        except Exception as e:
-            raise FileNotFoundError(f"Failed to download or validate PDF: {e}")
+        print(f"Downloading PDF from {pdf_link}...")
+        response = requests.get(pdf_link, stream=True)
+        response.raise_for_status()
+
+        with open(file_path, "wb") as file:
+            for chunk in response.iter_content(chunk_size=8192):
+                file.write(chunk)
+
+        print(f"PDF downloaded to: {file_path}")
+        self.validate_pdf(file_path)

     @staticmethod
     def validate_pdf(file_path):
         """
-        Validate that the file is a readable PDF with at least one page.
+        Validate if the file is a readable PDF with at least one page.
         """
         try:
             with open(file_path, "rb") as f:
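For a quick smoke test of the updated entry point, a minimal usage sketch follows. It assumes the file is importable as a module named patent_downloader and that download routes list inputs to the batch helper shown in the third hunk; the patent numbers and output directory are hypothetical placeholders, not values taken from the commit.

from patent_downloader import PatentDownloader  # assumed module name

downloader = PatentDownloader(verbose=True)

# Single patent: writes <output_path>/<patent>.pdf; output_path defaults to the system temp dir.
downloader.download(patent="US11283213B2", output_path="/tmp/patents")  # hypothetical patent ID

# Batch input: a list of IDs, a .csv with a 'patent_number' column, or a .txt with one ID per line.
downloader.download(patent=["US11283213B2", "US9876543B2"], waiting_time=15)  # hypothetical IDs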
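The last hunk truncates inside validate_pdf, so the rest of its body is not part of this diff. Given the re-added PdfReader import ("# PDF validation") and the docstring's contract (a readable PDF with at least one page), a plausible completion might look like the sketch below; it is an illustration under those assumptions, not the file's actual code.

from PyPDF2 import PdfReader

def validate_pdf(file_path):
    """
    Validate if the file is a readable PDF with at least one page. (Sketch, not the committed body.)
    """
    try:
        with open(file_path, "rb") as f:
            reader = PdfReader(f)       # parsing fails here for corrupt or non-PDF files
            if len(reader.pages) < 1:   # a downloaded patent should have at least one page
                raise ValueError("PDF contains no pages.")
        print(f"Validated PDF: {file_path}")
    except Exception as e:
        # Mirrors the class's convention of surfacing failures as FileNotFoundError.
        raise FileNotFoundError(f"Invalid or unreadable PDF at {file_path}: {e}")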