Spaces:

kijeoung
/

blogcatch250113test

Sleeping

App Files Files Community

blogcatch250113test / app.py

kijeoung

Update app.py

2e55746 verified about 2 months ago

raw

history blame contribute delete

2.46 kB

	# app.py
	import random
	import time
	from urllib.parse import parse_qs, urlsplit

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup

	def convert_to_mobile_url(url):
	    """Rewrite a Naver blog post URL to its mobile (m.blog.naver.com) form.

	    Two URL shapes are recognised on the hosts ``blog.naver.com`` and
	    ``m.blog.naver.com``:

	    * path form: ``/<user_id>/<post_id>`` (a trailing slash is tolerated;
	      any query string or fragment is carried over unchanged)
	    * legacy query form: ``/PostView.naver?blogId=<user_id>&logNo=<post_id>``

	    Args:
	        url: The URL entered by the user.

	    Returns:
	        The ``https://m.blog.naver.com/<user_id>/<post_id>`` URL when both
	        ids can be determined; otherwise ``url`` exactly as it was passed in.
	    """
	    try:
	        parts = urlsplit(url.strip())
	    except ValueError:
	        # urlsplit rejects some malformed inputs; leave those to the caller.
	        return url

	    # Compare the parsed host instead of searching the whole string, so that
	    # "blog.naver.com" inside another site's path or query is not rewritten.
	    if parts.hostname not in ("blog.naver.com", "m.blog.naver.com"):
	        return url

	    # Legacy query form carries the ids as query parameters.
	    query = parse_qs(parts.query)
	    if query.get("blogId") and query.get("logNo"):
	        return f"https://m.blog.naver.com/{query['blogId'][0]}/{query['logNo'][0]}"

	    # Path form. Dropping empty segments means a trailing slash no longer
	    # produces an empty post id.
	    segments = [segment for segment in parts.path.split("/") if segment]
	    if len(segments) < 2:
	        return url

	    user_id, post_id = segments[-2], segments[-1]
	    mobile_url = f"https://m.blog.naver.com/{user_id}/{post_id}"
	    if parts.query:
	        mobile_url += f"?{parts.query}"
	    if parts.fragment:
	        mobile_url += f"#{parts.fragment}"
	    return mobile_url

	def scrape_naver_blog(url):
	    """Fetch a Naver blog post and return its title and body text as one string.

	    The URL is first converted to its mobile form, then fetched after a random
	    5-8 second pause. The title and paragraph text are pulled out of the
	    ``se-module`` divs of the returned HTML.

	    Args:
	        url: Blog post URL as entered by the user.

	    Returns:
	        A string of the form ``"제목:\\n<title>\\n\\n내용:\\n<paragraphs>"``.
	        Placeholder messages stand in for a missing title or missing body text.
	        On failure an error message string is returned instead; this function
	        does not raise for ``Exception`` subclasses.
	    """
	    try:
	        # Convert to the mobile URL.
	        mobile_url = convert_to_mobile_url(url)

	        # Random delay before each request.
	        time.sleep(random.uniform(5, 8))

	        # HTTP request headers.
	        headers = {
	            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
	            "Referer": "https://www.naver.com/"
	        }

	        # Send the request. requests waits forever by default, so a timeout is
	        # set; a Timeout is a RequestException and is reported by the handler
	        # below.
	        response = requests.get(mobile_url, headers=headers, timeout=10)
	        response.raise_for_status()

	        # Parse the HTML.
	        soup = BeautifulSoup(response.text, 'html.parser')

	        # Title.
	        title_div = soup.find('div', class_='se-module se-module-text se-title-text')
	        if title_div:
	            title = title_div.get_text(strip=True)
	        else:
	            title = "제목을 찾을 수 없습니다."

	        # Body: collect the non-empty text of every <p> inside the text modules.
	        content = []
	        for div in soup.find_all('div', class_='se-module se-module-text'):
	            for paragraph in div.find_all('p'):
	                text = paragraph.get_text(strip=True)
	                if text:
	                    content.append(text)

	        # Show the placeholder whenever no text was extracted, including the
	        # case where text modules exist but all of their paragraphs are empty.
	        if not content:
	            content.append("내용을 찾을 수 없습니다.")

	        return f"제목:\n{title}\n\n내용:\n" + "\n".join(content)

	    except requests.exceptions.RequestException as e:
	        return f"HTTP 요청 에러: {e}"
	    except Exception as e:
	        # UI boundary: report any other failure as text instead of raising.
	        return f"스크래핑 에러: {e}"

	# Gradio interface definition
	def main():
	    """Build the Gradio UI around scrape_naver_blog and launch it.

	    The interface has one input textbox for the blog URL and one output
	    textbox for the scraped result.
	    """
	    url_box = gr.Textbox(label="네이버 블로그 URL 입력")
	    result_box = gr.Textbox(label="스크래핑 결과")

	    app = gr.Interface(
	        fn=scrape_naver_blog,
	        inputs=url_box,
	        outputs=result_box,
	        title="네이버 블로그 스크래퍼",
	        description="네이버 블로그에서 제목과 내용을 스크래핑합니다. URL을 입력하세요.",
	    )
	    app.launch()

	# Script entry point: start the Gradio app only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()