|
from typing import List, Union, Optional |
|
import os |
|
import requests |
|
import tempfile |
|
from bs4 import BeautifulSoup |
|
|
|
|
|
class PatentDownloader:
    """
    A class to automate downloading patent PDFs from Google Patents.
    """

    # All patent pages live under this URL; the full page URL is
    # f"{base_url}/{patent_number}/en".
    base_url = "https://patents.google.com/patent"

    def __init__(self, verbose: bool = False, timeout: int = 30):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        timeout : int
            Timeout in seconds applied to every HTTP request. Prevents the
            downloader from hanging forever on an unresponsive server.
        """
        self.verbose = verbose
        self.timeout = timeout

    def download(self, patents: Union[str, List[str]], output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to a temporary directory.

        Returns
        -------
        List[str]
            List of paths to the successfully downloaded PDFs. Patents that
            fail to download are reported to stdout and skipped, not raised.
        """
        # Normalize a single patent number to a one-element list so the
        # loop below handles both call styles uniformly.
        if isinstance(patents, str):
            patents = [patents]

        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files = []

        for i, patent in enumerate(patents):
            try:
                if self.verbose:
                    print(f"🔍 Downloading {i+1}/{len(patents)}: {patent}")
                file_path = self._download_single_pdf(patent, output_dir)
                downloaded_files.append(file_path)
                # BUG FIX: this message was a string literal broken across two
                # physical lines (a SyntaxError) with a mojibake'd emoji;
                # restored as a single-line print.
                print(f"✅ Successfully downloaded: {file_path}")
            except Exception as e:
                # Best-effort batch semantics: report the failure and continue
                # with the remaining patents rather than aborting the batch.
                print(f"❌ Failed to download {patent}: {e}")

        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1").
        output_dir : str
            Directory to save the PDF.

        Returns
        -------
        str
            Path to the downloaded PDF file.

        Raises
        ------
        Exception
            If the patent page or the PDF cannot be fetched, or if the page
            contains no PDF link.
        """
        patent_url = f"{self.base_url}/{patent_number}/en"

        if self.verbose:
            print(f"Fetching patent page: {patent_url}")

        # BUG FIX: requests.get without a timeout can block indefinitely;
        # use the configurable per-instance timeout instead.
        response = requests.get(patent_url, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch patent page for {patent_number}. HTTP Status: {response.status_code}")

        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup)

        if not pdf_url:
            raise Exception(f"No PDF link found for patent {patent_number}.")

        if self.verbose:
            print(f"Found PDF link: {pdf_url}")

        pdf_response = requests.get(pdf_url, timeout=self.timeout)
        if pdf_response.status_code != 200:
            raise Exception(f"Failed to download PDF for {patent_number}. HTTP Status: {pdf_response.status_code}")

        file_path = os.path.join(output_dir, f"{patent_number}.pdf")
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf_response.content)

        return file_path

    @staticmethod
    def _extract_pdf_link(soup: BeautifulSoup) -> Optional[str]:
        """
        Extract the PDF link from the page's metadata.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.

        Returns
        -------
        Optional[str]
            The direct PDF link if found, otherwise None.
        """
        # Preferred source: Google Patents exposes the PDF location in a
        # <meta name="citation_pdf_url" content="..."> tag.
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]

        # Fallback: first anchor on the page whose href ends in ".pdf".
        pdf_links = [a['href'] for a in soup.find_all("a", href=True) if a['href'].endswith(".pdf")]
        if pdf_links:
            return pdf_links[0]

        return None
|
|