Update patent_downloader.py
Browse files- patent_downloader.py +46 -85
patent_downloader.py
CHANGED
@@ -3,63 +3,42 @@ import os
|
|
3 |
import requests
|
4 |
import re
|
5 |
import time
|
6 |
-
import shutil
|
7 |
-
import subprocess
|
8 |
import pandas as pd
|
9 |
from urllib.parse import urljoin
|
10 |
import tempfile
|
11 |
-
from PyPDF2 import PdfReader
|
12 |
from selenium import webdriver
|
13 |
-
from selenium.webdriver.common.keys import Keys
|
14 |
from selenium.webdriver.chrome.service import Service
|
15 |
from selenium.webdriver.chrome.options import Options
|
16 |
-
from
|
17 |
-
from selenium.webdriver.support.ui import WebDriverWait
|
18 |
-
from selenium.webdriver.support import expected_conditions as EC
|
19 |
import chromedriver_autoinstaller
|
20 |
-
from bs4 import BeautifulSoup
|
21 |
-
|
22 |
|
23 |
|
24 |
class PatentDownloader:
|
25 |
url = "https://patents.google.com"
|
26 |
|
27 |
-
def __init__(self, verbose: bool =
|
28 |
"""
|
29 |
-
Initialize the Patent Downloader.
|
30 |
-
Parameters:
|
31 |
-
----------
|
32 |
-
verbose : bool
|
33 |
-
Print additional debug information.
|
34 |
"""
|
35 |
self.verbose = verbose
|
36 |
self.chrome_path = self.install_chrome()
|
37 |
|
38 |
def install_chrome(self) -> str:
|
39 |
"""
|
40 |
-
|
41 |
"""
|
42 |
chrome_path = "/usr/bin/google-chrome"
|
43 |
-
if not
|
44 |
-
print("
|
45 |
-
|
46 |
-
|
47 |
-
shell=True, check=True,
|
48 |
-
)
|
49 |
-
subprocess.run(
|
50 |
-
"apt-get update && apt-get install -y ./chrome.deb",
|
51 |
-
shell=True, check=True,
|
52 |
-
)
|
53 |
os.remove("chrome.deb")
|
54 |
-
|
55 |
-
if not shutil.which("google-chrome"):
|
56 |
-
raise ValueError("Google Chrome installation failed!")
|
57 |
return chrome_path
|
58 |
|
59 |
-
def download(self, patent: Union[str, List[str]], output_path: str = None,
|
60 |
-
waiting_time: int = 10) -> None:
|
61 |
"""
|
62 |
-
|
63 |
"""
|
64 |
if not output_path:
|
65 |
output_path = tempfile.gettempdir()
|
@@ -76,56 +55,41 @@ class PatentDownloader:
|
|
76 |
"""
|
77 |
chromedriver_autoinstaller.install()
|
78 |
|
79 |
-
# Set up Chrome options
|
80 |
chrome_options = Options()
|
81 |
chrome_options.binary_location = self.chrome_path
|
82 |
chrome_options.add_argument("--headless")
|
83 |
chrome_options.add_argument("--no-sandbox")
|
84 |
chrome_options.add_argument("--disable-dev-shm-usage")
|
85 |
|
86 |
-
# Initialize WebDriver
|
87 |
service = Service()
|
88 |
driver = webdriver.Chrome(service=service, options=chrome_options)
|
89 |
|
90 |
file_path = os.path.join(output_path, f"{patent}.pdf")
|
91 |
-
pdf_link = None
|
92 |
|
93 |
try:
|
94 |
-
print(f"Navigating to Google Patents...")
|
95 |
driver.get(self.url)
|
96 |
|
97 |
-
print("Entering patent number...")
|
98 |
search_input_xpath = "//input[@aria-label='Search patents']"
|
99 |
-
WebDriverWait(driver, 20).until(
|
100 |
-
search_input = driver.find_element(
|
101 |
search_input.send_keys(patent)
|
102 |
-
search_input.send_keys(
|
103 |
-
|
104 |
-
print("Waiting for the 'Download PDF' button...")
|
105 |
-
time.sleep(waiting_time) # Allow full load
|
106 |
-
page_source = driver.page_source
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
f.write(page_source)
|
111 |
-
print("Page source saved as 'page_source.html' for debugging.")
|
112 |
|
113 |
-
|
114 |
-
|
115 |
-
# First, look for the 'Download PDF' link explicitly
|
116 |
pdf_link = self.get_pdf_link(soup)
|
117 |
-
|
118 |
if not pdf_link:
|
119 |
-
raise FileNotFoundError("Could not find a valid PDF link.")
|
120 |
-
|
121 |
-
print(f"PDF link extracted: {pdf_link}")
|
122 |
|
123 |
# Download and validate the PDF
|
124 |
self.download_and_validate_pdf(pdf_link, file_path)
|
125 |
|
126 |
except Exception as e:
|
127 |
print(f"Error: {e}")
|
128 |
-
raise FileNotFoundError("
|
129 |
finally:
|
130 |
driver.quit()
|
131 |
|
@@ -134,9 +98,9 @@ class PatentDownloader:
|
|
134 |
Download multiple patent PDFs.
|
135 |
"""
|
136 |
if isinstance(patents, str):
|
137 |
-
if patents.
|
138 |
-
patents = pd.read_csv(patents)['patent_number'].
|
139 |
-
elif patents.
|
140 |
with open(patents, 'r') as file:
|
141 |
patents = file.read().splitlines()
|
142 |
else:
|
@@ -149,45 +113,42 @@ class PatentDownloader:
|
|
149 |
@staticmethod
|
150 |
def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
|
151 |
"""
|
152 |
-
Extract the PDF link from the
|
153 |
"""
|
154 |
-
# Search
|
155 |
-
|
156 |
-
if download_button and download_button.get("href"):
|
157 |
-
return download_button["href"]
|
158 |
-
|
159 |
-
# Fallback: Find any links containing 'patentimages.storage.googleapis.com'
|
160 |
-
pdf_links = [link['href'] for link in soup.find_all('a', href=True)
|
161 |
-
if 'patentimages.storage.googleapis.com' in link['href']]
|
162 |
if pdf_links:
|
163 |
-
print(f"
|
164 |
return pdf_links[0]
|
165 |
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
return None
|
168 |
|
169 |
def download_and_validate_pdf(self, pdf_link: str, file_path: str):
|
170 |
"""
|
171 |
-
Download
|
172 |
"""
|
173 |
-
print("Downloading PDF...")
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
self.validate_pdf(file_path)
|
184 |
-
except Exception as e:
|
185 |
-
raise FileNotFoundError(f"Failed to download or validate PDF: {e}")
|
186 |
|
187 |
@staticmethod
|
188 |
def validate_pdf(file_path):
|
189 |
"""
|
190 |
-
Validate
|
191 |
"""
|
192 |
try:
|
193 |
with open(file_path, "rb") as f:
|
|
|
3 |
import requests
|
4 |
import re
|
5 |
import time
|
|
|
|
|
6 |
import pandas as pd
|
7 |
from urllib.parse import urljoin
|
8 |
import tempfile
|
9 |
+
from PyPDF2 import PdfReader # PDF validation
|
10 |
from selenium import webdriver
|
|
|
11 |
from selenium.webdriver.chrome.service import Service
|
12 |
from selenium.webdriver.chrome.options import Options
|
13 |
+
from bs4 import BeautifulSoup
|
|
|
|
|
14 |
import chromedriver_autoinstaller
|
|
|
|
|
15 |
|
16 |
|
17 |
class PatentDownloader:
|
18 |
url = "https://patents.google.com"
|
19 |
|
20 |
+
def __init__(self, verbose: bool = False):
    """Create a downloader backed by a headless Chrome installation.

    Parameters
    ----------
    verbose : bool
        When True, emit extra diagnostic output during downloads.
    """
    # Remember the verbosity flag, then make sure Chrome is present
    # (install_chrome downloads the browser if it is missing).
    self.verbose = verbose
    self.chrome_path = self.install_chrome()
|
26 |
|
27 |
def install_chrome(self) -> str:
    """Ensure a Google Chrome binary exists and return its path.

    If ``/usr/bin/google-chrome`` is missing, download the stable .deb
    package and install it via apt-get (Debian/Ubuntu only; requires
    root privileges).

    Returns
    -------
    str
        Path to the Chrome binary.

    Raises
    ------
    ValueError
        If the download or installation fails.
    """
    chrome_path = "/usr/bin/google-chrome"
    if not os.path.exists(chrome_path):
        print("Installing Google Chrome...")
        # Check each shell step's exit status instead of ignoring it:
        # a failed download would otherwise silently fall through.
        status = os.system(
            "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -O chrome.deb"
        )
        if status != 0:
            raise ValueError("Failed to download the Google Chrome package!")
        status = os.system("apt-get update && apt-get install -y ./chrome.deb")
        if status != 0:
            raise ValueError("Google Chrome installation failed!")
        # Clean up the installer only after a successful install.
        os.remove("chrome.deb")
    # Verify the binary actually exists before reporting success.
    if not os.path.exists(chrome_path):
        raise ValueError("Google Chrome installation failed!")
    return chrome_path
|
38 |
|
39 |
+
def download(self, patent: Union[str, List[str]], output_path: str = None, waiting_time: int = 10) -> None:
|
|
|
40 |
"""
|
41 |
+
Main entry to download one or multiple patents.
|
42 |
"""
|
43 |
if not output_path:
|
44 |
output_path = tempfile.gettempdir()
|
|
|
55 |
"""
|
56 |
chromedriver_autoinstaller.install()
|
57 |
|
|
|
58 |
chrome_options = Options()
|
59 |
chrome_options.binary_location = self.chrome_path
|
60 |
chrome_options.add_argument("--headless")
|
61 |
chrome_options.add_argument("--no-sandbox")
|
62 |
chrome_options.add_argument("--disable-dev-shm-usage")
|
63 |
|
|
|
64 |
service = Service()
|
65 |
driver = webdriver.Chrome(service=service, options=chrome_options)
|
66 |
|
67 |
file_path = os.path.join(output_path, f"{patent}.pdf")
|
|
|
68 |
|
69 |
try:
|
70 |
+
print(f"Navigating to Google Patents for patent: {patent}...")
|
71 |
driver.get(self.url)
|
72 |
|
|
|
73 |
search_input_xpath = "//input[@aria-label='Search patents']"
|
74 |
+
WebDriverWait(driver, 20).until(lambda d: d.find_element("xpath", search_input_xpath))
|
75 |
+
search_input = driver.find_element("xpath", search_input_xpath)
|
76 |
search_input.send_keys(patent)
|
77 |
+
search_input.send_keys("\n")
|
|
|
|
|
|
|
|
|
78 |
|
79 |
+
time.sleep(waiting_time)
|
80 |
+
soup = BeautifulSoup(driver.page_source, "html.parser")
|
|
|
|
|
81 |
|
82 |
+
# Search for the PDF link
|
|
|
|
|
83 |
pdf_link = self.get_pdf_link(soup)
|
|
|
84 |
if not pdf_link:
|
85 |
+
raise FileNotFoundError(f"Could not find a valid PDF link for patent: {patent}.")
|
|
|
|
|
86 |
|
87 |
# Download and validate the PDF
|
88 |
self.download_and_validate_pdf(pdf_link, file_path)
|
89 |
|
90 |
except Exception as e:
|
91 |
print(f"Error: {e}")
|
92 |
+
raise FileNotFoundError(f"Failed to process patent: {patent}")
|
93 |
finally:
|
94 |
driver.quit()
|
95 |
|
|
|
98 |
Download multiple patent PDFs.
|
99 |
"""
|
100 |
if isinstance(patents, str):
|
101 |
+
if patents.endswith('.csv'):
|
102 |
+
patents = pd.read_csv(patents)['patent_number'].tolist()
|
103 |
+
elif patents.endswith('.txt'):
|
104 |
with open(patents, 'r') as file:
|
105 |
patents = file.read().splitlines()
|
106 |
else:
|
|
|
113 |
@staticmethod
def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
    """Extract a patent-PDF URL from a parsed Google Patents page.

    Looks first for anchors pointing directly at the patentimages
    storage bucket, then falls back to an anchor labelled
    "Download PDF".  Returns the URL string, or None when neither
    pattern matches.
    """
    # Primary pattern: any anchor whose href targets the PDF store.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if 'patentimages.storage.googleapis.com' in href:
            print(f"Direct PDF link found: {href}")
            return href

    # Fallback pattern: an explicit "Download PDF" button or text link.
    button = soup.find('a', string=re.compile("Download PDF", re.IGNORECASE))
    if button and 'href' in button.attrs:
        print(f"PDF link found via Download button: {button['href']}")
        return button['href']

    print("No valid PDF link found in the page HTML.")
    return None
|
132 |
|
133 |
def download_and_validate_pdf(self, pdf_link: str, file_path: str):
    """Stream the PDF at *pdf_link* into *file_path*, then validate it.

    Parameters
    ----------
    pdf_link : str
        Direct URL of the patent PDF.
    file_path : str
        Destination path for the downloaded file.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    requests.Timeout
        If the server stops responding.
    """
    print(f"Downloading PDF from {pdf_link}...")
    # A timeout prevents a dead link from hanging the whole run forever.
    response = requests.get(pdf_link, stream=True, timeout=60)
    response.raise_for_status()

    # Stream in chunks so large PDFs are never held fully in memory.
    with open(file_path, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)

    print(f"PDF downloaded to: {file_path}")
    self.validate_pdf(file_path)
|
|
|
|
|
|
|
147 |
|
148 |
@staticmethod
|
149 |
def validate_pdf(file_path):
|
150 |
"""
|
151 |
+
Validate if the file is a readable PDF with at least one page.
|
152 |
"""
|
153 |
try:
|
154 |
with open(file_path, "rb") as f:
|