aliceblue11 committed
Commit dc7031d · verified · 1 Parent(s): 2d8de23

Update app.py

Files changed (1)
  1. app.py +36 -49
app.py CHANGED
@@ -1,71 +1,58 @@
-import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+import gradio as gr
 
-# Function to scrape Naver blog title and content
 def scrape_naver_blog(url):
     try:
-        print("Starting the scraping process...")  # Debugging
-        print(f"Target URL: {url}")  # Debugging
+        # Debug log: confirm the target URL
+        print(f"[DEBUG] Scraping URL: {url}")
 
-        # Send a GET request to the URL
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-        }
+        # Fetch the Naver blog HTML
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
         response = requests.get(url, headers=headers)
-        response.raise_for_status()  # Raise an exception for HTTP errors
-        print("Successfully fetched the page content.")  # Debugging
 
-        # Parse the HTML using BeautifulSoup
-        soup = BeautifulSoup(response.text, 'html.parser')
-        print("HTML parsing completed.")  # Debugging
+        # Check the HTTP status code
+        print(f"[DEBUG] HTTP Response Status Code: {response.status_code}")
+        if response.status_code != 200:
+            return f"Error: Unable to access the page. HTTP Status Code: {response.status_code}"
+
+        # Parse the HTML
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # CSS selectors for the title and content
+        title_selector = "div > div > div > div:nth-of-type(10) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div > div > div > div:nth-of-type(1) > div > div > div:nth-of-type(2)"
+        content_selector = "div > div > div > div:nth-of-type(10) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div > div > div > div:nth-of-type(2) > div:nth-of-type(2) > div > div"
+
+        # Look up the title and content elements
+        title_element = soup.select_one(title_selector)
+        content_element = soup.select_one(content_selector)
 
-        try:
-            # Extract title
-            print("Attempting to extract the title...")  # Debugging
-            title_element = soup.select_one(
-                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div > p > span"
-            )
-            title = title_element.get_text(strip=True) if title_element else "Title not found"
-            print(f"Extracted Title: {title}")  # Debugging
-        except Exception as e:
-            title = f"Error extracting title: {e}"
-            print(f"Title extraction error: {e}")  # Debugging
+        if not title_element or not content_element:
+            return "Error: Unable to locate the title or content with the CSS selectors above."
 
-        try:
-            # Extract content
-            print("Attempting to extract the content...")  # Debugging
-            content_element = soup.select_one(
-                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(3) > div:nth-of-type(4) > div > div > div > p:nth-of-type(1) > span"
-            )
-            content = content_element.get_text(strip=True) if content_element else "Content not found"
-            print(f"Extracted Content: {content}")  # Debugging
-        except Exception as e:
-            content = f"Error extracting content: {e}"
-            print(f"Content extraction error: {e}")  # Debugging
+        # Extract the text
+        title = title_element.get_text(strip=True)
+        content = content_element.get_text(strip=True)
 
-        # Return the results
+        # Return the result
         return f"제목: {title}\n내용: {content}"
 
     except Exception as e:
-        print(f"Error fetching the page: {e}")  # Debugging
-        return f"Error fetching the page: {e}"
+        # Debug log: an exception occurred
+        print(f"[DEBUG] Exception occurred: {str(e)}")
+        return f"An error occurred: {str(e)}"
 
-# Gradio Interface
-def scrape_interface(url):
-    print("Scrape interface triggered.")  # Debugging
-    result = scrape_naver_blog(url)
-    print("Scrape process completed.")  # Debugging
-    return result
+# Gradio interface setup
+def gradio_interface(url):
+    return scrape_naver_blog(url)
 
-interface = gr.Interface(
-    fn=scrape_interface,
+iface = gr.Interface(
+    fn=gradio_interface,
     inputs=gr.Textbox(label="Naver Blog URL"),
     outputs=gr.Textbox(label="Scraped Content"),
     title="Naver Blog Scraper",
-    description="Enter the URL of a Naver blog to scrape its title and content."
+    description="Enter a Naver Blog URL to scrape the title and content.",
 )
 
 if __name__ == "__main__":
-    print("Launching Gradio interface...")  # Debugging
-    interface.launch(debug=True)
+    iface.launch()
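
A note on robustness. Both the old and new versions depend on long positional nth-of-type selector chains, which break as soon as Naver changes its markup. Naver blog posts are also typically served inside an iframe (commonly id="mainFrame"), so a plain requests.get on the post URL often returns only the wrapper page. The sketch below is not part of this commit: the iframe id and the SmartEditor class names (se-title-text, se-main-container) are assumptions that can vary by blog theme and editor version.

# Sketch (not from the commit): follow the post iframe, then use class-based
# selectors instead of positional nth-of-type chains.
# Assumptions: iframe id "mainFrame" and SmartEditor ONE class names
# "se-title-text" / "se-main-container" -- adjust for other themes.
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def scrape_naver_blog_via_iframe(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # The outer page usually just embeds the real post in an iframe.
    frame = soup.select_one("iframe#mainFrame")
    if frame and frame.get("src"):
        response = requests.get(urljoin(url, frame["src"]), headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

    title_element = soup.select_one("div.se-title-text")
    content_element = soup.select_one("div.se-main-container")
    title = title_element.get_text(strip=True) if title_element else "Title not found"
    content = content_element.get_text("\n", strip=True) if content_element else "Content not found"
    return f"제목: {title}\n내용: {content}"

If this still returns "Content not found", the post may use the older editor markup with different container classes, in which case the two class-based selectors need to be adapted.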