Spaces:

DrishtiSharma
/

chat-w-google-patents

Running

App Files Community

DrishtiSharma committed on Dec 19, 2024

Commit

10dee6a

verified ·

1 Parent(s): 00059cc

Update patent_downloader.py

Browse files

Files changed (1) hide show

patent_downloader.py +13 -6

patent_downloader.py CHANGED Viewed

@@ -3,8 +3,8 @@ import os
 import requests
 import re
 import time
-import shutil
 import pandas as pd
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup
 from selenium import webdriver
@@ -13,6 +13,7 @@ from selenium.webdriver.chrome.options import Options
 import chromedriver_autoinstaller
 import subprocess
 class PatentDownloader:
     url = "https://patents.google.com"
@@ -94,7 +95,7 @@ class PatentDownloader:
             if not pdf_link:
                 raise FileNotFoundError(f"No valid PDF link found for patent: {patent}")
-            # Resolve the full URL and download the file
             pdf_url = urljoin("https://patentimages.storage.googleapis.com", pdf_link)
             print(f"Found PDF link: {pdf_url}")
             file_path = os.path.join(output_path, f"{patent}.pdf")
@@ -128,12 +129,17 @@ class PatentDownloader:
     @staticmethod
     def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
         """
-        Extract the first PDF link from the page.
         """
-        # Search for any <a> tags containing 'pdf'
-        pdf_links = [a['href'] for a in soup.find_all('a', href=True) if 'pdf' in a['href'].lower()]
         if pdf_links:
-            return pdf_links[0]  # Return the first valid PDF link
         return None
     @staticmethod
@@ -141,6 +147,7 @@ class PatentDownloader:
         """
         Download the PDF file from the given URL and save it.
         """
         response = requests.get(pdf_url, stream=True)
         response.raise_for_status()
         with open(file_path, 'wb') as pdf_file:

 import requests
 import re
 import time
 import pandas as pd
+import shutil
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup
 from selenium import webdriver
 import chromedriver_autoinstaller
 import subprocess
 class PatentDownloader:
     url = "https://patents.google.com"
             if not pdf_link:
                 raise FileNotFoundError(f"No valid PDF link found for patent: {patent}")
+            # Build the complete PDF URL
             pdf_url = urljoin("https://patentimages.storage.googleapis.com", pdf_link)
             print(f"Found PDF link: {pdf_url}")
             file_path = os.path.join(output_path, f"{patent}.pdf")
     @staticmethod
     def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
         """
+        Extract the PDF link from the page. Match links containing 'patentimages.storage.googleapis.com'.
         """
+        pdf_links = [
+            link['href'] for link in soup.find_all('a', href=True)
+            if 'patentimages.storage.googleapis.com' in link['href']
+        ]
         if pdf_links:
+            print(f"Found PDF link: {pdf_links[0]}")
+            return pdf_links[0]
+        print("No valid PDF link found on the page.")
         return None
     @staticmethod
         """
         Download the PDF file from the given URL and save it.
         """
+        print("Downloading PDF...")
         response = requests.get(pdf_url, stream=True)
         response.raise_for_status()
         with open(file_path, 'wb') as pdf_file: