'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co./spaces/dwancin/web-scraping)
'''
import os, re, requests, uuid, zipfile, hashlib, shutil
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
# Function to validate URLs
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
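# Example (illustrative): validator("https://example.com/logo.png") returns True,
# while a relative path like validator("/logo.png") returns False because it has
# no scheme or host, so finder() knows to resolve it with urljoin().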
# Function to find files on a webpage
def finder(url, soup, media_type):
    files = []
    # Find image files
    if media_type == "image":
        extensions = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp', 'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
        for tag in soup.find_all('img'):
            file = tag.get('src')
            # Skip <img> tags without a src attribute
            if file and any(ext in file for ext in extensions):
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    # Find text
    elif media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong', 'pdf', 'txt', 'doc', 'rtf', 'docx']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
    # Find links
    else:
        for link in soup.find_all('a'):
            file = link.get('href')
            # Skip <a> tags without an href attribute
            if file and media_type in file:
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    return files
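# Example (illustrative): finder(url, soup, 'image') yields absolute URLs taken
# from <img src=...>, finder(url, soup, 'text') yields the visible text of common
# text tags, and any other media_type is matched against <a href=...> links.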
# Function to download the files
def downloader(urls, folder_name):
    os.makedirs(folder_name, exist_ok=True)
    for url in urls:
        response = requests.get(url, stream=True)
        file_extension = url.split(".")[-1].split("&")[0]
        url_hash = hashlib.md5(url.encode()).hexdigest()
        unique_id = str(uuid.uuid4())[:8]
        file_name = f'{url_hash}-{unique_id}.{file_extension}'
        file_name = file_name[:255]
        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)
        with open(f'{folder_name}/{file_name}', 'wb') as out_file:
            out_file.write(response.content)
        print(f"Downloaded file: {file_name}")
# Function to create zip file
def zipper(folder_name):
    if os.listdir(folder_name):
        with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
            for file in os.listdir(folder_name):
                zipf.write(f'{folder_name}/{file}')
        return f'{folder_name}.zip'
    else:
        return ""
# Function to access website
def scrapper(url, images=False, text=False):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Let HTTP status errors propagate so checker() can report the status code
        raise
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Clear all the previous folder data
    if images:
        shutil.rmtree('images', ignore_errors=True)
    if text:
        shutil.rmtree('text', ignore_errors=True)
    # Add images to the image folder
    if images:
        image_urls = finder(url, soup, 'image')
        os.makedirs('images', exist_ok=True)
        if image_urls:
            downloader(image_urls, 'images')
        else:
            raise gr.Error("Found no images.")
    # Add text files to the text folder
    if text:
        text_content = finder(url, soup, 'text')
        os.makedirs('text', exist_ok=True)
        if text_content:
            with open('text/content.txt', 'w') as text_file:
                for line in text_content:
                    text_file.write(line + '\n')
    # Output folder(s) as zip files
    images_zip_file, text_zip_file = None, None
    if images and os.path.exists('images') and os.listdir('images'):
        images_zip_file = zipper('images')
    if text and os.path.exists('text') and os.listdir('text'):
        text_zip_file = zipper('text')
    return images_zip_file, text_zip_file
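# Example (illustrative): scrapper("https://example.com", images=True, text=True)
# returns a tuple such as ('images.zip', 'text.zip'); an entry is None when that
# media type was not requested or nothing was written to its folder.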
# Function to validate the inputs and surface request errors
def checker(url, media_types):
    if not url:
        raise gr.Error("URL cannot be empty.")
    if not url.startswith("https://"):
        raise gr.Error("The URL must begin with https://")
    if not media_types:
        raise gr.Error("At least one media type must be selected.")
    try:
        image_file, text_file = scrapper(url, "Images" in media_types, "Text" in media_types)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise gr.Error("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise gr.Error(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise gr.Error(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")
    files = []
    if "Text" in media_types and not text_file:
        raise gr.Error("Found no text.")
    if "Images" in media_types and not image_file:
        raise gr.Error("Found no images.")
    if image_file:
        files.append(image_file)
    if text_file:
        files.append(text_file)
    print(f"Returning downloaded files from {url}: {files}")
    return files
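# Example (illustrative): checker("https://example.com", ["Images", "Text"])
# returns a list of zip paths such as ['images.zip', 'text.zip'], which the
# gr.Files output component renders as downloadable files.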
# Gradio Interface
with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get all media files from your desired webpages with just a few clicks.''')
    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )
            media_types = gr.CheckboxGroup(
                ["Images", "Text"],
                value="Images",
                label="Media types",
            )
            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=2):
            output_files = gr.Files(
                label="Output",
                elem_id="file-list",
                size="lg",
                show_label=False,
            )
    submit_button.click(
        checker,
        inputs=[url_name, media_types],
        outputs=[output_files],
    )
app.launch()
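# Running this file directly (e.g. `python app.py`) starts the Gradio server,
# which by default serves the interface locally at http://127.0.0.1:7860.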