# Spaces: Sleeping
import html
import io
import mimetypes
import re
import urllib.parse
import zipfile

import requests
import streamlit as st
from bs4 import BeautifulSoup
# Page configuration
# The browser-tab title uses the same spelling as the heading shown on the
# page ("ImageHarvester"), so the app is named consistently everywhere.
st.set_page_config(page_title="ImageHarvester", layout="wide")
# Custom CSS
# The stylesheet lives in a named constant so the injection call below stays
# a one-liner; the rules themselves are unchanged.
_CUSTOM_CSS = """
<style>
/* Main container styling */
.main {
    padding: 2rem;
    border-radius: 1rem;
    background-color: #ffffff;
    max-width: 1200px;
    margin: 0 auto;
}
/* Title and headers */
h1 {
    color: #2563eb;
    text-align: center;
    margin-bottom: 2rem;
    font-weight: 700;
    font-size: 2.5rem;
}
.stSubheader {
    color: #1e40af;
    font-size: 1.5rem;
    font-weight: 600;
    margin: 1.5rem 0;
}
/* Button styling */
.stButton>button {
    width: 100%;
    background-color: #2563eb;
    color: white;
    border: none;
    padding: 0.75rem 1.5rem;
    border-radius: 0.5rem;
    font-weight: 600;
    transition: all 0.2s ease;
}
.stButton>button:hover {
    background-color: #1e40af;
    transform: translateY(-2px);
    box-shadow: 0 4px 6px rgba(37, 99, 235, 0.2);
}
/* Image card styling */
.image-card {
    background: white;
    border-radius: 1rem;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    overflow: hidden;
    margin-bottom: 1.5rem;
    transition: all 0.3s ease;
}
.image-card:hover {
    transform: translateY(-5px);
    box-shadow: 0 8px 12px rgba(0, 0, 0, 0.15);
}
.image-container {
    position: relative;
    padding-top: 75%;
}
.image-container img {
    position: absolute;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    object-fit: cover;
}
.image-info {
    padding: 1rem;
    font-size: 0.9rem;
    color: #4b5563;
    border-top: 1px solid #e5e7eb;
}
/* Selected image state */
.selected {
    border: 3px solid #2563eb;
    box-shadow: 0 8px 16px rgba(37, 99, 235, 0.2);
}
/* Input fields */
.url-input, .number-input {
    border: 2px solid #e5e7eb;
    border-radius: 0.5rem;
    padding: 0.75rem;
    margin-bottom: 1rem;
    width: 100%;
    transition: border-color 0.2s ease;
}
.url-input:focus, .number-input:focus {
    border-color: #2563eb;
    outline: none;
}
/* Alert messages */
.stSuccess {
    background-color: #ecfdf5;
    color: #065f46;
    border-radius: 0.5rem;
    padding: 1rem;
    border-left: 4px solid #059669;
}
.stWarning {
    background-color: #fffbeb;
    color: #92400e;
    border-radius: 0.5rem;
    padding: 1rem;
    border-left: 4px solid #d97706;
}
.stError {
    background-color: #fef2f2;
    color: #991b1b;
    border-radius: 0.5rem;
    padding: 1rem;
    border-left: 4px solid #dc2626;
}
/* Checkbox styling */
.stCheckbox {
    padding: 0.5rem;
}
/* Responsive design */
@media only screen and (max-width: 768px) {
    .main {
        padding: 1rem;
    }
    h1 {
        font-size: 2rem;
    }
    .image-card {
        margin-bottom: 1rem;
    }
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
st.title("ImageHarvester")

# Seed the URL list with a single empty entry the first time the script runs
# for this session, so one input row is always available to start with.
if "urls" not in st.session_state:
    st.session_state["urls"] = [""]
def add_url():
    """Append one empty URL entry to the list kept in session state."""
    url_list = st.session_state.urls
    url_list.append('')
def remove_url(index):
    """Delete the URL entry at position *index* from session state.

    Raises:
        IndexError: if *index* is outside the current URL list.
    """
    url_list = st.session_state.urls
    url_list.pop(index)
def is_valid_url(url):
    """Return True when *url* is an absolute http(s)/ftp(s) URL.

    The host part may be a dotted domain name, ``localhost``, an IPv4
    address or an IPv6 literal, optionally followed by a port and a
    path/query. Matching is case-insensitive.
    """
    scheme = r'^(?:http|ftp)s?://'
    host_alternatives = (
        # dotted domain name
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
        # localhost
        r'localhost|'
        # IPv4 address
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        # IPv6 literal
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
    )
    port = r'(?::\d+)?'
    path = r'(?:/?|[/?]\S+)$'

    pattern = re.compile(scheme + host_alternatives + port + path, re.IGNORECASE)
    return bool(pattern.match(url))
def get_file_extension(content_type):
    """Map a MIME type to a file extension, defaulting to ``'.jpg'``.

    The fallback is used whenever ``mimetypes`` has no extension for
    *content_type* (including an empty string).
    """
    return mimetypes.guess_extension(content_type) or '.jpg'
def fetch_images(url, max_images):
    """Collect up to *max_images* image URLs from the page at *url*.

    Args:
        url: Address of the page to scan; must pass ``is_valid_url``.
        max_images: Maximum number of image URLs to return.

    Returns:
        A list of absolute http(s) image URLs taken from the ``src``
        attribute of the page's ``<img>`` tags, in document order. An empty
        list is returned (after showing a Streamlit message) when the URL is
        invalid, the request fails, or the page has no ``<img>`` tags.
    """
    if not is_valid_url(url):
        st.warning(f"Invalid URL: {url}")
        return []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': url
    }
    try:
        # Same timeout as download_images, so an unresponsive host cannot
        # hang the app indefinitely.
        response = session.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        st.error(f"An error occurred for {url}: {str(e)}")
        return []

    st.info(f"Status code for {url}: {response.status_code}")
    if response.status_code != 200:
        st.warning(f"Unexpected status code for {url}: {response.status_code}. Attempting to proceed anyway.")

    soup = BeautifulSoup(response.content, 'html.parser')
    img_tags = soup.find_all('img')
    if not img_tags:
        st.warning(f"No images found on {url}.")
        return []

    images = []
    for img in img_tags:
        # Count collected images, not inspected tags, so tags without a
        # usable src do not eat into the caller's limit.
        if len(images) >= max_images:
            break
        img_url = img.get('src')
        if not img_url:
            continue
        if not img_url.startswith(('http://', 'https://')):
            img_url = urllib.parse.urljoin(url, img_url)
        # Skip data: URIs and other non-HTTP sources: they cannot be fetched
        # with requests and would abort the whole ZIP download later.
        if img_url.startswith(('http://', 'https://')):
            images.append(img_url)
    return images
def download_images(selected_images):
    """Download every URL in *selected_images* and pack them into a ZIP.

    Args:
        selected_images: Iterable of absolute image URLs.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that holds the ZIP archive
        (entries are named ``image_<n><ext>``), or ``None`` if any request
        fails; in that case an error message is shown via Streamlit.
    """
    base_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    zip_buffer = io.BytesIO()
    try:
        with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
            for i, img_url in enumerate(selected_images):
                # Derive the Referer from the image's own origin. This
                # function receives no page URL, so it must not depend on
                # whatever module-level variable happens to be named `url`.
                parts = urllib.parse.urlsplit(img_url)
                headers = dict(base_headers)
                headers['Referer'] = f"{parts.scheme}://{parts.netloc}/"

                img_response = session.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Drop any "; charset=..." parameter before the MIME lookup.
                content_type = img_response.headers.get('content-type', '').split(';')[0].strip()
                file_extension = get_file_extension(content_type)
                file_name = f'image_{i+1}{file_extension}'
                zip_file.writestr(file_name, img_response.content)
    except requests.exceptions.RequestException as e:
        st.error(f"An error occurred while downloading images: {str(e)}")
        return None

    # Rewind so the caller reads the archive from the beginning.
    zip_buffer.seek(0)
    return zip_buffer
# Initialize the requests session
# Shared by fetch_images and download_images for every HTTP request they make.
session = requests.Session()
# Input fields for URLs
st.subheader("Enter Website URLs")
for i, url in enumerate(st.session_state.urls):
    col1, col2 = st.columns([10, 1])
    with col1:
        # Write the widget's current text back so the list always mirrors
        # what the user typed.
        st.session_state.urls[i] = st.text_input(
            f"URL {i+1}",
            value=url,
            key=f"url_{i}",
            help="Enter the URL of the website from which you want to download images.",
            placeholder="https://example.com",
        )
    with col2:
        if st.button("Remove", key=f"remove_{i}"):
            remove_url(i)
            st.rerun()

if st.button("Add URL"):
    add_url()
    # Rerun right away, mirroring the Remove branch: the input rows above
    # were already drawn before the new entry was appended, so without this
    # the extra field would only appear after the next interaction.
    st.rerun()

max_images_per_url = st.number_input("Max images per URL:", min_value=1, value=10, step=1)
# Fetch image URLs from every entered page and store them in session state.
if st.button("Fetch Images", key="fetch"):
    all_images = []
    for raw_url in st.session_state.urls:
        # Tolerate stray whitespace from copy/paste; the validator would
        # otherwise reject an otherwise correct URL.
        url = raw_url.strip()
        if not url:
            # Untouched placeholder row: nothing to fetch, nothing to warn about.
            continue
        if not is_valid_url(url):
            st.warning(f"Invalid URL: {url}")
            continue
        with st.spinner(f"Fetching images from {url}..."):
            images = fetch_images(url, max_images_per_url)
        all_images.extend(images)

    if all_images:
        st.session_state.images = all_images
        # One selection flag per image, all initially unselected.
        st.session_state.selected_images = [False] * len(all_images)
        st.success(f"Found {len(all_images)} images in total. Select the images you want to download.")
    else:
        st.warning("No images found or could not fetch images from any of the provided URLs.")
# Image grid with per-image selection, bulk-selection buttons and ZIP download.
if 'images' in st.session_state:
    st.subheader("Fetched Images")

    # Buttons for Select All and Clear Selection (third column is a spacer).
    select_col, clear_col, _ = st.columns([1, 1, 1])
    with select_col:
        if st.button("Select All"):
            st.session_state.selected_images = [True] * len(st.session_state.images)
    with clear_col:
        if st.button("Clear"):
            st.session_state.selected_images = [False] * len(st.session_state.images)

    # Lay the images out round-robin across a fixed number of columns.
    num_cols = 4
    columns = st.columns(num_cols)
    selected_images = []
    for i, img_url in enumerate(st.session_state.images):
        with columns[i % num_cols]:
            is_selected = st.checkbox(
                "Select Image",
                key=f"check_{i}",
                value=st.session_state.selected_images[i],
            )
            st.session_state.selected_images[i] = is_selected
            img_class = "selected" if is_selected else ""

            # img_url comes from a scraped third-party page; escape it before
            # placing it in an HTML attribute so a crafted src cannot break
            # out of the attribute and inject markup.
            safe_src = html.escape(img_url, quote=True)
            card_html = (
                f'<div class="image-card {img_class}">'
                f'<div class="image-container">'
                f'<img src="{safe_src}" alt="image_{i+1}">'
                f'</div>'
                f'<div class="image-info">image_{i+1}</div>'
                f'</div>'
            )
            st.markdown(card_html, unsafe_allow_html=True)
        if is_selected:
            selected_images.append(img_url)

    if selected_images:
        if st.button("Download Selected Images"):
            with st.spinner("Preparing download..."):
                zip_buffer = download_images(selected_images)
            # download_images returns None when any request failed.
            if zip_buffer is not None:
                st.download_button(
                    label="Download ZIP",
                    data=zip_buffer,
                    file_name="selected_images.zip",
                    mime="application/zip"
                )
            else:
                st.error("Failed to prepare the download. Please try again.")
    else:
        st.info("Select one or more images to download.")