aliceblue11 committed
Commit dc7031d · verified · 1 Parent(s): 2d8de23

Update app.py

Files changed (1)
  1. app.py +36 -49
app.py CHANGED
@@ -1,71 +1,58 @@
-import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+import gradio as gr
 
-# Function to scrape Naver blog title and content
 def scrape_naver_blog(url):
     try:
-        print("Starting the scraping process...")  # Debugging
-        print(f"Target URL: {url}")  # Debugging
+        # Debug log: confirm the target URL
+        print(f"[DEBUG] Scraping URL: {url}")
 
-        # Send a GET request to the URL
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-        }
+        # Fetch the Naver blog HTML
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
         response = requests.get(url, headers=headers)
-        response.raise_for_status()  # Raise an exception for HTTP errors
-        print("Successfully fetched the page content.")  # Debugging
 
-        # Parse the HTML using BeautifulSoup
-        soup = BeautifulSoup(response.text, 'html.parser')
-        print("HTML parsing completed.")  # Debugging
+        # Check the HTTP status code
+        print(f"[DEBUG] HTTP Response Status Code: {response.status_code}")
+        if response.status_code != 200:
+            return f"Error: Unable to access the page. HTTP Status Code: {response.status_code}"
+
+        # Parse the HTML
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # CSS selectors for the title and content
+        title_selector = "div > div > div > div:nth-of-type(10) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div > div > div > div:nth-of-type(1) > div > div > div:nth-of-type(2)"
+        content_selector = "div > div > div > div:nth-of-type(10) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div > div > div > div:nth-of-type(2) > div:nth-of-type(2) > div > div"
+
+        # Look up the title and content elements
+        title_element = soup.select_one(title_selector)
+        content_element = soup.select_one(content_selector)
 
-        try:
-            # Extract title
-            print("Attempting to extract the title...")  # Debugging
-            title_element = soup.select_one(
-                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(1) > div > div > div:nth-of-type(2) > div > p > span"
-            )
-            title = title_element.get_text(strip=True) if title_element else "Title not found"
-            print(f"Extracted Title: {title}")  # Debugging
-        except Exception as e:
-            title = f"Error extracting title: {e}"
-            print(f"Title extraction error: {e}")  # Debugging
+        if not title_element or not content_element:
+            return "Error: Unable to locate the title or content with the CSS selectors above."
 
-        try:
-            # Extract content
-            print("Attempting to extract the content...")  # Debugging
-            content_element = soup.select_one(
-                "body > div:nth-of-type(7) > div:nth-of-type(1) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > div > div:nth-of-type(8) > div:nth-of-type(1) > div > table:nth-of-type(2) > tbody > tr > td:nth-of-type(2) > div:nth-of-type(1) > div > div:nth-of-type(3) > div:nth-of-type(4) > div > div > div > p:nth-of-type(1) > span"
-            )
-            content = content_element.get_text(strip=True) if content_element else "Content not found"
-            print(f"Extracted Content: {content}")  # Debugging
-        except Exception as e:
-            content = f"Error extracting content: {e}"
-            print(f"Content extraction error: {e}")  # Debugging
+        # Extract the text
+        title = title_element.get_text(strip=True)
+        content = content_element.get_text(strip=True)
 
-        # Return the results
+        # Return the result
         return f"제목: {title}\n내용: {content}"
 
     except Exception as e:
-        print(f"Error fetching the page: {e}")  # Debugging
-        return f"Error fetching the page: {e}"
+        # Debug log: an exception occurred
+        print(f"[DEBUG] Exception occurred: {str(e)}")
+        return f"An error occurred: {str(e)}"
 
-# Gradio Interface
-def scrape_interface(url):
-    print("Scrape interface triggered.")  # Debugging
-    result = scrape_naver_blog(url)
-    print("Scrape process completed.")  # Debugging
-    return result
+# Gradio interface setup
+def gradio_interface(url):
+    return scrape_naver_blog(url)
 
-interface = gr.Interface(
-    fn=scrape_interface,
+iface = gr.Interface(
+    fn=gradio_interface,
     inputs=gr.Textbox(label="Naver Blog URL"),
     outputs=gr.Textbox(label="Scraped Content"),
     title="Naver Blog Scraper",
-    description="Enter the URL of a Naver blog to scrape its title and content."
+    description="Enter a Naver Blog URL to scrape the title and content.",
 )
 
 if __name__ == "__main__":
-    print("Launching Gradio interface...")  # Debugging
-    interface.launch(debug=True)
+    iface.launch()
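
A note on robustness. Both the old and new versions depend on long positional nth-of-type selector chains, which break as soon as Naver changes its markup. Naver blog posts are also typically served inside an iframe (commonly id="mainFrame"), so a plain requests.get on the post URL often returns only the wrapper page. The sketch below is not part of this commit: the iframe id and the SmartEditor class names (se-title-text, se-main-container) are assumptions that can vary by blog theme and editor version.

# Sketch (not from the commit): follow the post iframe, then use class-based
# selectors instead of positional nth-of-type chains.
# Assumptions: iframe id "mainFrame" and SmartEditor ONE class names
# "se-title-text" / "se-main-container" -- adjust for other themes.
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def scrape_naver_blog_via_iframe(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # The outer page usually just embeds the real post in an iframe.
    frame = soup.select_one("iframe#mainFrame")
    if frame and frame.get("src"):
        response = requests.get(urljoin(url, frame["src"]), headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

    title_element = soup.select_one("div.se-title-text")
    content_element = soup.select_one("div.se-main-container")
    title = title_element.get_text(strip=True) if title_element else "Title not found"
    content = content_element.get_text("\n", strip=True) if content_element else "Content not found"
    return f"제목: {title}\n내용: {content}"

If this still returns "Content not found", the post may use the older editor markup with different container classes, in which case the two class-based selectors need to be adapted.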