'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co./spaces/dwancin/web-scraping)
'''
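# Pipeline overview: validator() checks URLs, scrapper() fetches the page,
# finder() extracts media, downloader() saves files, zipper() bundles them,
# and checker() wires everything to the Gradio UI at the bottom.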

import hashlib
import os
import re
import shutil
import uuid
import zipfile
from urllib.parse import urljoin, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup

# Function to validate URLs (absolute URLs have both a scheme and a host)
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
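# Illustrative behavior: validator("https://example.com") is True, while
# validator("/relative/path") is False because urlparse() finds no netloc.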


# Function to find files on webpage
def finder(url, soup, media_type):
    files = []

    # find image files by matching common extensions in the src attribute
    if media_type == "image":
        extensions = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp', 'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
        for tag in soup.find_all('img'):
            file = tag.get('src')
            # skip <img> tags without a src, which would otherwise crash `in`
            if file and any(ext in file for ext in extensions):
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)

    # find text in common text-bearing HTML tags
    elif media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())

    # find links whose href contains the requested media type
    else:
        for link in soup.find_all('a'):
            file = link.get('href')
            # skip anchors without an href, which would otherwise crash `in`
            if file and media_type in file:
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)

    return files
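# Note: any media_type other than "image" or "text" falls through to the link
# branch, so e.g. finder(url, soup, "pdf") collects hrefs containing "pdf".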


# Function to download the files under collision-safe names
def downloader(urls, folder_name):
    os.makedirs(folder_name, exist_ok=True)
    for url in urls:
        response = requests.get(url, stream=True, timeout=10)
        # take the extension from the URL path so query strings are ignored
        file_extension = os.path.splitext(urlparse(url).path)[1].lstrip('.') or 'bin'
        url_hash = hashlib.md5(url.encode()).hexdigest()
        unique_id = str(uuid.uuid4())[:8]
        file_name = f'{url_hash}-{unique_id}.{file_extension}'
        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)[:255]
        with open(os.path.join(folder_name, file_name), 'wb') as out_file:
            out_file.write(response.content)
        print(f"Downloaded file: {file_name}")
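# Design note: an MD5 hash of the URL plus a short UUID slice keeps filenames
# stable per source yet practically collision-free, even when many files on
# the page share the same basename.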


# Function to create zip file
def zipper(folder_name):
    if os.listdir(folder_name):
        with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
            for file in os.listdir(folder_name):
                # arcname keeps the archive flat instead of nesting the folder
                zipf.write(os.path.join(folder_name, file), arcname=file)
        return f'{folder_name}.zip'
    else:
        return ""
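# zipper() returns an empty string when the folder has nothing in it; callers
# treat that falsy value (or None) as "no content found" for this media type.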


# Function to access the website and gather the requested media
def scrapper(url, images=False, text=False):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # re-raise so checker() can map the status code to a friendly message
        raise
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")
    soup = BeautifulSoup(response.content, 'html.parser')

    # Clear any folder data left over from a previous run
    if images:
        shutil.rmtree('images', ignore_errors=True)
    if text:
        shutil.rmtree('text', ignore_errors=True)

    # Add images to the image folder
    if images:
        image_urls = finder(url, soup, 'image')
        os.makedirs('images', exist_ok=True)
        if image_urls:
            downloader(image_urls, 'images')
        else:
            raise gr.Error("Found no images.")

    # Add text files to the text folder
    if text:
        text_content = finder(url, soup, 'text')
        os.makedirs('text', exist_ok=True)
        if text_content:
            with open('text/content.txt', 'w', encoding='utf-8') as text_file:
                for line in text_content:
                    text_file.write(line + '\n')

    # Output folder(s) as zip files
    images_zip_file, text_zip_file = None, None
    if images and os.path.exists('images') and os.listdir('images'):
        images_zip_file = zipper('images')
    if text and os.path.exists('text') and os.listdir('text'):
        text_zip_file = zipper('text')
    return images_zip_file, text_zip_file


# Function to validate inputs, run the scraper, and surface request errors
def checker(url, media_types):
    if not url:
        raise gr.Error("URL cannot be empty.")
    if not url.startswith("https://"):
        raise gr.Error("The URL must begin with https://")
    if not media_types:
        raise gr.Error("At least one media type must be selected.")
    try:
        image_file, text_file = scrapper(url, "Images" in media_types, "Text" in media_types)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise gr.Error("HTTP 403: Access to the URL is forbidden.")
        else:
            raise gr.Error(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise gr.Error(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")
    files = []
    if "Text" in media_types and not text_file:
        raise gr.Error("Found no text.")
    if "Images" in media_types and not image_file:
        raise gr.Error("Found no images.")
    if image_file:
        files.append(image_file)
    if text_file:
        files.append(text_file)

    print(f"Returning files scraped from {url}: {files}")

    return files
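# Usage sketch (hypothetical URL): checker("https://example.com",
# ["Images", "Text"]) returns a list of zip paths such as
# ["images.zip", "text.zip"], which gr.Files renders as download links.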

# Gradio Interface
with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get all media files from your desired webpages with just a few clicks.''')
    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )

            media_types = gr.CheckboxGroup(
                ["Images", "Text"],
                value=["Images"],
                label="Media types",
            )

            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )

        with gr.Column(scale=2):
            output_files = gr.Files(
                label="Output",
                elem_id="file-list",
                size="lg",
                show_label=False,
            )
    
    submit_button.click(
        checker, 
        inputs=[url_name, media_types], 
        outputs=[output_files],
    )

app.launch()