"""Gradio demo that shows random samples from the PhotoshopRequest dataset."""

import io
import json
import os
import time
from datetime import datetime, timedelta, timezone

import gradio as gr
from datasets import load_dataset
from PIL import Image

access_token = os.environ.get("HUGGINGFACE_TOKEN")

# Global state shared by the loader, the sampler and the UI callbacks.
dataset = None
dataset_info = None
dataset_size = "Unknown"
last_refresh_time = None
REFRESH_INTERVAL = timedelta(hours=24)

# Number of samples served from one shuffled stream before reshuffling;
# the same value is passed to ``shuffle`` as its buffer size.
BUFFER_SIZE = 1
sample_iterator = None
sample_count = 0


def load_and_prepare_dataset():
    """Open the streaming dataset and record its size and load time.

    Updates the module-level ``dataset``, ``dataset_info``, ``dataset_size``
    and ``last_refresh_time``. ``dataset_size`` falls back to ``"Unknown"``
    when the dataset info carries no ``train`` split entry.
    """
    global dataset, dataset_info, dataset_size, last_refresh_time

    dataset = load_dataset(
        "taesiri/PhotoshopRequest-DailyDump",
        split="train",
        streaming=True,
        token=access_token,
    )

    # Get dataset info. ``splits`` is guarded against being None
    # (assumed possible for streaming datasets — confirm) so that
    # ``.get`` cannot raise AttributeError.
    dataset_info = dataset.info
    train_split = (dataset_info.splits or {}).get("train")
    dataset_size = train_split.num_examples if train_split else "Unknown"

    # Timezone-aware UTC, because the UI labels this timestamp as "UTC".
    last_refresh_time = datetime.now(timezone.utc)


def check_and_refresh_dataset():
    """Reload the dataset if it was never loaded or is older than REFRESH_INTERVAL.

    Returns:
        True if the dataset was reloaded by this call, False otherwise.
    """
    current_time = datetime.now(timezone.utc)
    if (
        last_refresh_time is None
        or (current_time - last_refresh_time) >= REFRESH_INTERVAL
    ):
        load_and_prepare_dataset()
        return True
    return False


def reshuffle_dataset():
    """Start a fresh, time-seeded shuffled iterator over the current dataset."""
    global sample_iterator, sample_count

    seed = int(time.time())  # Convert time to an integer
    shuffled_dataset = dataset.shuffle(seed=seed, buffer_size=BUFFER_SIZE)
    sample_iterator = iter(shuffled_dataset)
    sample_count = 0


def get_next_sample():
    """Fetch the next sample and format it for the UI.

    Returns:
        A ``(markdown_text, source_image, edited_image)`` tuple, where
        ``markdown_text`` holds the post title, its selftext (empty when it
        cannot be extracted from ``json_data``) and a link to the Reddit post.
    """
    global sample_count

    refreshed = check_and_refresh_dataset()
    # Reshuffle after a refresh too, so the iterator never keeps pointing
    # at the dataset object that was just replaced.
    if refreshed or sample_count >= BUFFER_SIZE:
        reshuffle_dataset()

    try:
        sample = next(sample_iterator)
    except StopIteration:
        # The current stream is exhausted: restart it once and retry.
        reshuffle_dataset()
        sample = next(sample_iterator)
    sample_count += 1
    print(sample)

    post_id = sample["post_id"]
    title = sample["title"]
    reddit_url = f"https://www.reddit.com/r/PhotoshopRequest/comments/{post_id}"

    selftext = ""
    try:
        selftext = json.loads(sample["json_data"])["post"]["selftext"]
    except (KeyError, TypeError, ValueError):
        # Missing keys, non-string/None payloads, or invalid JSON
        # (json.JSONDecodeError is a ValueError): fall back to no selftext.
        print("No selftext found")

    markdown_text = (
        f"# {title}\n\n{selftext}\n\n"
        f"[View post on r/PhotoshopRequest]({reddit_url})"
    )

    return (
        markdown_text,
        sample["source_image"],
        sample["edited_image"],
    )


# Initial dataset load (performed once) and initial shuffle.
load_and_prepare_dataset()
reshuffle_dataset()


with gr.Blocks() as demo:
    gr.Markdown("# PhotoshopRequest Dataset Sampler")
    gr.Markdown(
        """
    This is a preview of the PhotoshopRequest dataset. Each sample represents a Photoshop editing request post.
    Click the 'Sample New Item' button to retrieve a random sample from the dataset.
    """
    )

    post_info = gr.Markdown()

    with gr.Row():
        source_image = gr.Image(label="Source Image")
        edited_image = gr.Image(label="Edited Image")

    sample_button = gr.Button("Sample New Item")

    info_md = gr.Markdown()

    def update_info():
        """Return the footer text with the dataset size and last refresh time."""
        if last_refresh_time:
            refreshed_at = last_refresh_time.strftime('%Y-%m-%d %H:%M:%S UTC')
        else:
            refreshed_at = 'Unknown'
        return (
            "\n\n"
            f"Dataset Size: {dataset_size} items\n"
            f"Last Refreshed: {refreshed_at}\n"
        )

    # Show the new sample first, then refresh the footer information.
    sample_button.click(
        get_next_sample, outputs=[post_info, source_image, edited_image]
    ).then(update_info, outputs=[info_md])


if __name__ == "__main__":
    demo.launch()