Spaces:

DrishtiSharma
/

chat-w-google-patents

Running

App Files Files Community

DrishtiSharma committed on Dec 19, 2024

Commit

630b3cc

verified ·

1 Parent(s): 5c4cff8

Create test.py

Browse files

Files changed (1) hide show

test.py +170 -0

test.py ADDED Viewed

	@@ -0,0 +1,170 @@

+from typing import List, Union, Optional
+import os
+import requests
+import re
+import time
+import shutil
+import subprocess
+import pandas as pd
+from selenium import webdriver
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from bs4 import BeautifulSoup
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import chromedriver_autoinstaller
+class PatentDownloader:
+    url = "https://patents.google.com"
+    def __init__(self, verbose: bool = False):
+        """
+        Parameters
+        ----------
+        verbose : bool
+            Print additional debug information.
+        """
+        self.verbose = verbose
+        self.chrome_path = self.install_chrome()
+    def install_chrome(self) -> str:
+        """
+        Download and install Google Chrome dynamically.
+        Returns
+        -------
+        str: Path to the Chrome binary.
+        """
+        chrome_path = "/usr/bin/google-chrome"
+        if not shutil.which("google-chrome"):
+            print("Downloading and installing Google Chrome...")
+            subprocess.run(
+                "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -O chrome.deb",
+                shell=True,
+                check=True,
+            )
+            subprocess.run(
+                "apt-get update && apt-get install -y ./chrome.deb",
+                shell=True,
+                check=True,
+            )
+            os.remove("chrome.deb")
+        if not shutil.which("google-chrome"):
+            raise ValueError("Google Chrome installation failed!")
+        return chrome_path
+    def download(self, patent: Union[str, List[str]], output_path: str = "./",
+                 waiting_time: int = 10, remove_kind_codes: Optional[List[str]] = None) -> None:
+        """
+        Download patent document(s) as PDF.
+        """
+        if isinstance(patent, list) or os.path.isfile(patent):
+            self.get_pdfs(patent, output_path, waiting_time, remove_kind_codes)
+        else:
+            self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
+    def get_pdf(self, patent: str, output_path: str = "./", waiting_time: int = 10,
+                remove_kind_codes: Optional[List[str]] = None) -> None:
+        """
+        Download a single patent PDF.
+        """
+        if remove_kind_codes:
+            for kind_code in remove_kind_codes:
+                patent = re.sub(kind_code + "$", "", patent)
+        # Automatically install ChromeDriver
+        chromedriver_autoinstaller.install()
+        # Set up Chrome options
+        chrome_options = Options()
+        chrome_options.binary_location = self.chrome_path
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        # Initialize Selenium WebDriver
+        service = Service()
+        driver = webdriver.Chrome(service=service, options=chrome_options)
+        pdf_link = None  # Ensure pdf_link is defined
+        try:
+            driver.get(self.url)
+            # Wait for the search input field and interact with it
+            print("Waiting for the search input field...")
+            search_input_xpath = "//input[@aria-label='Search patents']"
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located((By.XPATH, search_input_xpath))
+            )
+            element = driver.find_element(By.XPATH, search_input_xpath)
+            print("Search input field located.")
+            element.send_keys(patent)
+            element.send_keys(Keys.RETURN)
+            # Wait for search results to load
+            print("Waiting for search results to load...")
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+            time.sleep(waiting_time)
+            # Parse HTML and get the PDF link
+            soup = BeautifulSoup(driver.page_source, "html.parser")
+            pdf_link = self.get_pdf_link(soup, patent)
+        except Exception as e:
+            print(f"Error occurred: {e}")
+        finally:
+            driver.quit()
+        # Download the PDF
+        if pdf_link:
+            validate_directory(output_path)
+            pdf_content = requests.get(pdf_link).content
+            with open(os.path.join(output_path, f"{patent}.pdf"), "wb") as file:
+                file.write(pdf_content)
+            print(f">>> Patent {patent} successfully downloaded <<<")
+        else:
+            print(f"Error: PDF link for patent {patent} not found!")
+    def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./",
+                 waiting_time: int = 10, remove_kind_codes: Optional[List[str]] = None) -> None:
+        """
+        Download multiple patent PDFs from a list or file.
+        """
+        if isinstance(patents, str):
+            if patents.lower().endswith('csv'):
+                df_patents = pd.read_csv(patents)
+                patents = df_patents['patent_number'].to_list()
+            elif patents.lower().endswith('txt'):
+                with open(patents, 'r') as txt_file:
+                    patents = txt_file.read().splitlines()
+            else:
+                raise NotImplementedError(f'Unsupported file type: {patents}')
+        for i, patent in enumerate(patents):
+            print(len(patents) - i, "patent(s) remaining.")
+            self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
+    @staticmethod
+    def get_pdf_link(soup: BeautifulSoup, patent: str) -> Optional[str]:
+        """
+        Extract the PDF link from parsed HTML.
+        """
+        pdf_links = [link['href'] for link in soup.find_all('a', href=True) if link['href'].lower().endswith("pdf")]
+        for link in pdf_links:
+            if patent.lower() in link.lower():
+                return link
+        return None
+def validate_directory(directory: str) -> None:
+    """
+    Ensure the output directory exists.
+    """
+    if not os.path.exists(directory):
+        os.makedirs(directory)