DrishtiSharma committed on
Commit
10dee6a
·
verified ·
1 Parent(s): 00059cc

Update patent_downloader.py

Browse files
Files changed (1) hide show
  1. patent_downloader.py +13 -6
patent_downloader.py CHANGED
@@ -3,8 +3,8 @@ import os
3
  import requests
4
  import re
5
  import time
6
- import shutil
7
  import pandas as pd
 
8
  from urllib.parse import urljoin
9
  from bs4 import BeautifulSoup
10
  from selenium import webdriver
@@ -13,6 +13,7 @@ from selenium.webdriver.chrome.options import Options
13
  import chromedriver_autoinstaller
14
  import subprocess
15
 
 
16
  class PatentDownloader:
17
  url = "https://patents.google.com"
18
 
@@ -94,7 +95,7 @@ class PatentDownloader:
94
  if not pdf_link:
95
  raise FileNotFoundError(f"No valid PDF link found for patent: {patent}")
96
 
97
- # Resolve the full URL and download the file
98
  pdf_url = urljoin("https://patentimages.storage.googleapis.com", pdf_link)
99
  print(f"Found PDF link: {pdf_url}")
100
  file_path = os.path.join(output_path, f"{patent}.pdf")
@@ -128,12 +129,17 @@ class PatentDownloader:
128
  @staticmethod
129
  def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
130
  """
131
- Extract the first PDF link from the page.
132
  """
133
- # Search for any <a> tags containing 'pdf'
134
- pdf_links = [a['href'] for a in soup.find_all('a', href=True) if 'pdf' in a['href'].lower()]
 
 
135
  if pdf_links:
136
- return pdf_links[0] # Return the first valid PDF link
 
 
 
137
  return None
138
 
139
  @staticmethod
@@ -141,6 +147,7 @@ class PatentDownloader:
141
  """
142
  Download the PDF file from the given URL and save it.
143
  """
 
144
  response = requests.get(pdf_url, stream=True)
145
  response.raise_for_status()
146
  with open(file_path, 'wb') as pdf_file:
 
3
  import requests
4
  import re
5
  import time
 
6
  import pandas as pd
7
+ import shutil
8
  from urllib.parse import urljoin
9
  from bs4 import BeautifulSoup
10
  from selenium import webdriver
 
13
  import chromedriver_autoinstaller
14
  import subprocess
15
 
16
+
17
  class PatentDownloader:
18
  url = "https://patents.google.com"
19
 
 
95
  if not pdf_link:
96
  raise FileNotFoundError(f"No valid PDF link found for patent: {patent}")
97
 
98
+ # Build the complete PDF URL
99
  pdf_url = urljoin("https://patentimages.storage.googleapis.com", pdf_link)
100
  print(f"Found PDF link: {pdf_url}")
101
  file_path = os.path.join(output_path, f"{patent}.pdf")
 
129
  @staticmethod
130
  def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
131
  """
132
+ Extract the PDF link from the page. Match links containing 'patentimages.storage.googleapis.com'.
133
  """
134
+ pdf_links = [
135
+ link['href'] for link in soup.find_all('a', href=True)
136
+ if 'patentimages.storage.googleapis.com' in link['href']
137
+ ]
138
  if pdf_links:
139
+ print(f"Found PDF link: {pdf_links[0]}")
140
+ return pdf_links[0]
141
+
142
+ print("No valid PDF link found on the page.")
143
  return None
144
 
145
  @staticmethod
 
147
  """
148
  Download the PDF file from the given URL and save it.
149
  """
150
+ print("Downloading PDF...")
151
  response = requests.get(pdf_url, stream=True)
152
  response.raise_for_status()
153
  with open(file_path, 'wb') as pdf_file: