'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co./spaces/dwancin/web-scraping)
'''
import os, re, requests, uuid, zipfile, hashlib, shutil
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
# Function to validate URLs
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
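# Example (illustrative): validator("https://example.com/logo.png") returns True,
# while a relative path like validator("/logo.png") returns False because it has
# no scheme or host, so finder() knows to resolve it with urljoin().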
# Function to find files on a webpage
def finder(url, soup, media_type):
    files = []
    # Find image files
    if media_type == "image":
        extensions = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp', 'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
        for tag in soup.find_all('img'):
            file = tag.get('src')
            # Skip <img> tags without a src attribute
            if file and any(ext in file for ext in extensions):
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    # Find text
    elif media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong', 'pdf', 'txt', 'doc', 'rtf', 'docx']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
    # Find links
    else:
        for link in soup.find_all('a'):
            file = link.get('href')
            # Skip <a> tags without an href attribute
            if file and media_type in file:
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    return files
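# Example (illustrative): finder(url, soup, 'image') yields absolute URLs taken
# from <img src=...>, finder(url, soup, 'text') yields the visible text of common
# text tags, and any other media_type is matched against <a href=...> links.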
# Function to download the files
def downloader(urls, folder_name):
    os.makedirs(folder_name, exist_ok=True)
    for url in urls:
        response = requests.get(url, stream=True)
        file_extension = url.split(".")[-1].split("&")[0]
        url_hash = hashlib.md5(url.encode()).hexdigest()
        unique_id = str(uuid.uuid4())[:8]
        file_name = f'{url_hash}-{unique_id}.{file_extension}'
        file_name = file_name[:255]
        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)
        with open(f'{folder_name}/{file_name}', 'wb') as out_file:
            out_file.write(response.content)
        print(f"Downloaded file: {file_name}")
# Function to create zip file
def zipper(folder_name):
    if os.listdir(folder_name):
        with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
            for file in os.listdir(folder_name):
                zipf.write(f'{folder_name}/{file}')
        return f'{folder_name}.zip'
    else:
        return ""
# Function to access website
def scrapper(url, images=False, text=False):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Let HTTP status errors propagate so checker() can report the status code
        raise
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Clear all the previous folder data
    if images:
        shutil.rmtree('images', ignore_errors=True)
    if text:
        shutil.rmtree('text', ignore_errors=True)
    # Add images to the image folder
    if images:
        image_urls = finder(url, soup, 'image')
        os.makedirs('images', exist_ok=True)
        if image_urls:
            downloader(image_urls, 'images')
        else:
            raise gr.Error("Found no images.")
    # Add text files to the text folder
    if text:
        text_content = finder(url, soup, 'text')
        os.makedirs('text', exist_ok=True)
        if text_content:
            with open('text/content.txt', 'w') as text_file:
                for line in text_content:
                    text_file.write(line + '\n')
    # Output folder(s) as zip files
    images_zip_file, text_zip_file = None, None
    if images and os.path.exists('images') and os.listdir('images'):
        images_zip_file = zipper('images')
    if text and os.path.exists('text') and os.listdir('text'):
        text_zip_file = zipper('text')
    return images_zip_file, text_zip_file
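# Example (illustrative): scrapper("https://example.com", images=True, text=True)
# returns a tuple such as ('images.zip', 'text.zip'); an entry is None when that
# media type was not requested or nothing was written to its folder.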
# Function to validate the inputs and surface request errors
def checker(url, media_types):
    if not url:
        raise gr.Error("URL cannot be empty.")
    if not url.startswith("https://"):
        raise gr.Error("The URL must begin with https://")
    if not media_types:
        raise gr.Error("At least one media type must be selected.")
    try:
        image_file, text_file = scrapper(url, "Images" in media_types, "Text" in media_types)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise gr.Error("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise gr.Error(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise gr.Error(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")
    files = []
    if "Text" in media_types and not text_file:
        raise gr.Error("Found no text.")
    if "Images" in media_types and not image_file:
        raise gr.Error("Found no images.")
    if image_file:
        files.append(image_file)
    if text_file:
        files.append(text_file)
    print(f"Returning downloaded files from {url}: {files}")
    return files
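# Example (illustrative): checker("https://example.com", ["Images", "Text"])
# returns a list of zip paths such as ['images.zip', 'text.zip'], which the
# gr.Files output component renders as downloadable files.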
# Gradio Interface
with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get all media files from your desired webpages with just a few clicks.''')
    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )
            media_types = gr.CheckboxGroup(
                ["Images", "Text"],
                value="Images",
                label="Media types",
            )
            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=2):
            output_files = gr.Files(
                label="Output",
                elem_id="file-list",
                size="lg",
                show_label=False,
            )
    submit_button.click(
        checker,
        inputs=[url_name, media_types],
        outputs=[output_files],
    )
app.launch()
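# Running this file directly (e.g. `python app.py`) starts the Gradio server,
# which by default serves the interface locally at http://127.0.0.1:7860.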