# app.py
import gradio as gr
import requests
from bs4 import BeautifulSoup
import random
import time
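# Assumed third-party dependencies (not declared in this file): gradio, requests, beautifulsoup4.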

def convert_to_mobile_url(url):
    if "blog.naver.com" in url:
        url_parts = url.split("/")
        if len(url_parts) > 4:
            user_id, post_id = url_parts[-2], url_parts[-1]
            return f"https://m.blog.naver.com/{user_id}/{post_id}"
    return url

def scrape_naver_blog(url):
    try:
        # Convert to the mobile URL
        mobile_url = convert_to_mobile_url(url)

        # Add a random delay between requests
        time.sleep(random.uniform(5, 8))

        # Set HTTP request headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": "https://www.naver.com/"
        }

        # Send the HTTP request (timeout keeps a stalled request from hanging the UI)
        response = requests.get(mobile_url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Scrape the title
        title_div = soup.find('div', class_='se-module se-module-text se-title-text')
        if title_div:
            title = title_div.get_text(strip=True)
        else:
            title = "Title not found."

        # Scrape the body content
        content_divs = soup.find_all('div', class_='se-module se-module-text')
        content = []
        if content_divs:
            for div in content_divs:
                paragraphs = div.find_all('p')
                for p in paragraphs:
                    text = p.get_text(strip=True)
                    if text:
                        content.append(text)
        else:
            content.append("Content not found.")

        return f"Title:\n{title}\n\nContent:\n" + "\n".join(content)

    except requests.exceptions.RequestException as e:
        return f"HTTP request error: {e}"
    except Exception as e:
        return f"Scraping error: {e}"

# Define the Gradio interface
def main():
    gr.Interface(
        fn=scrape_naver_blog,
        inputs=gr.Textbox(label="Naver blog URL"),
        outputs=gr.Textbox(label="Scraping result"),
        title="Naver Blog Scraper",
        description="Scrapes the title and body text from a Naver blog post. Enter a URL."
    ).launch()

if __name__ == "__main__":
    main()
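
# A minimal sketch for trying the scraper without the Gradio UI; the URL below is a
# hypothetical placeholder, not a real post:
#
#     print(scrape_naver_blog("https://blog.naver.com/<user_id>/<post_id>"))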