DrishtiSharma committed
Commit 5e0e6fc · verified · 1 Parent(s): 26ad68f

Update patent_downloader.py

Files changed (1)
  1. patent_downloader.py +39 -12
patent_downloader.py CHANGED
@@ -22,7 +22,7 @@ import chromedriver_autoinstaller
 class PatentDownloader:
     url = "https://patents.google.com"

-    def __init__(self, verbose: bool = False):
+    def __init__(self, verbose: bool = True):
         """
         Initialize the Patent Downloader.
         Parameters:
@@ -36,12 +36,8 @@ class PatentDownloader:
     def install_chrome(self) -> str:
         """
         Download and install Google Chrome dynamically if not already installed.
-        Returns:
-        -------
-        str: Path to the Chrome binary.
         """
         chrome_path = "/usr/bin/google-chrome"
-
         if not shutil.which("google-chrome"):
             print("Downloading and installing Google Chrome...")
             subprocess.run(
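The install_chrome() hunk ends mid-call, so the actual subprocess.run( arguments are not visible in this diff. For orientation only, a minimal sketch of how a dynamic Chrome install is commonly scripted on a Debian-based image; the download URL, package name, and apt usage below are assumptions, not code from this repository:

# Illustrative sketch only -- not the command hidden behind the truncated subprocess.run( above.
import shutil
import subprocess

def ensure_chrome(chrome_path: str = "/usr/bin/google-chrome") -> str:
    """Return a Chrome binary path, installing Chrome first if it is missing (sketch)."""
    if shutil.which("google-chrome"):
        return chrome_path
    deb = "google-chrome-stable_current_amd64.deb"  # assumed package name
    subprocess.run(
        ["wget", "-q", f"https://dl.google.com/linux/direct/{deb}"], check=True
    )
    subprocess.run(["apt-get", "install", "-y", f"./{deb}"], check=True)
    return chrome_path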
@@ -90,6 +86,7 @@ class PatentDownloader:
         driver = webdriver.Chrome(service=service, options=chrome_options)

         file_path = os.path.join(output_path, f"{patent}.pdf")
+        pdf_link = None

         try:
             print(f"Navigating to Google Patents...")
@@ -103,19 +100,29 @@ class PatentDownloader:
             search_input.send_keys(Keys.RETURN)

             print("Waiting for the 'Download PDF' button...")
-            pdf_button_xpath = "//a[contains(text(),'Download PDF')]"
-            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, pdf_button_xpath)))
-            pdf_link_element = driver.find_element(By.XPATH, pdf_button_xpath)
+            time.sleep(waiting_time)  # Allow full load
+            page_source = driver.page_source
+
+            # Log the page source for debugging
+            with open("page_source.html", "w", encoding="utf-8") as f:
+                f.write(page_source)
+            print("Page source saved as 'page_source.html' for debugging.")
+
+            soup = BeautifulSoup(page_source, "html.parser")
+
+            # First, look for the 'Download PDF' link explicitly
+            pdf_link = self.get_pdf_link(soup)
+
+            if not pdf_link:
+                raise FileNotFoundError("Could not find a valid PDF link.")

-            # Extract the PDF link
-            pdf_link = pdf_link_element.get_attribute("href")
-            print(f"PDF link found: {pdf_link}")
+            print(f"PDF link extracted: {pdf_link}")

             # Download and validate the PDF
             self.download_and_validate_pdf(pdf_link, file_path)

         except Exception as e:
-            print(f"An error occurred: {e}")
+            print(f"Error: {e}")
             raise FileNotFoundError("Could not find a valid PDF link.")
         finally:
             driver.quit()
@@ -137,6 +144,26 @@ class PatentDownloader:
         print(f"Downloading patent: {patent}")
         self.get_pdf(patent, output_path, waiting_time)

+    @staticmethod
+    def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
+        """
+        Extract the PDF link from the parsed HTML.
+        """
+        # Search explicitly for Download PDF button
+        download_button = soup.find("a", string=re.compile("Download PDF", re.IGNORECASE))
+        if download_button and download_button.get("href"):
+            return download_button["href"]
+
+        # Fallback: Find any links containing 'patentimages.storage.googleapis.com'
+        pdf_links = [link['href'] for link in soup.find_all('a', href=True)
+                     if 'patentimages.storage.googleapis.com' in link['href']]
+        if pdf_links:
+            print(f"Fallback: Found PDF Link: {pdf_links[0]}")
+            return pdf_links[0]
+
+        print("No valid PDF link found in HTML.")
+        return None
+
     def download_and_validate_pdf(self, pdf_link: str, file_path: str):
         """
         Download the PDF and validate its integrity.
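To see the two lookup strategies that the new get_pdf_link() uses, here is a self-contained sketch that runs the same BeautifulSoup logic against a made-up HTML snippet; the markup and the standalone helper are illustrative, only the matching logic mirrors the committed method:

import re
from typing import Optional

from bs4 import BeautifulSoup

# Hypothetical markup shaped like a Google Patents result page, for demonstration only.
SAMPLE_HTML = """
<html><body>
  <a href="https://patentimages.storage.googleapis.com/xx/yy/zz/US1234567.pdf">Download PDF</a>
  <a href="/patent/US1234567/en">US1234567</a>
</body></html>
"""

def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
    # Same order as the commit: explicit 'Download PDF' anchor first, host-based fallback second.
    download_button = soup.find("a", string=re.compile("Download PDF", re.IGNORECASE))
    if download_button and download_button.get("href"):
        return download_button["href"]
    pdf_links = [link["href"] for link in soup.find_all("a", href=True)
                 if "patentimages.storage.googleapis.com" in link["href"]]
    return pdf_links[0] if pdf_links else None

if __name__ == "__main__":
    soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
    print(get_pdf_link(soup))  # https://patentimages.storage.googleapis.com/xx/yy/zz/US1234567.pdf

The anchor-text match is attempted first; the patentimages.storage.googleapis.com host check only acts as a fallback, which is the same precedence as in the committed method.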