from concurrent.futures import ThreadPoolExecutor
from functools import partial
import io
import os
import urllib.request

import PIL.Image
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
from tqdm.auto import tqdm

# Enable the faster Rust-based transfer backend for Hub uploads/downloads
# (requires the hf_transfer package to be installed).
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

dataset = load_dataset("biglam/berlin_state_library_ocr")

file_names = dataset["train"]["file name"]
ppns = dataset["train"]["ppn"]
assert len(file_names) == len(ppns)


def create_url(filename, ppn):
    # The IIIF page identifier is the file name without its extension.
    page = filename.split(".")[0]
    return f"https://content.staatsbibliothek-berlin.de/dc/PPN{ppn}-{page}/full/full/0/default.jpg"


urls = [create_url(f, p) for f, p in tqdm(zip(file_names, ppns), total=len(file_names))]
print(len(urls))  # sanity check: one URL per row

dataset = dataset["train"].add_column("url", urls)
dataset = dataset.select(range(100_000))

USER_AGENT = get_datasets_user_agent()


def fetch_single_image(image_url, timeout=None, retries=0):
    # Try up to `retries + 1` times; return None if every attempt fails.
    image = None
    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(
                image_url,
                data=None,
                headers={"user-agent": USER_AGENT},
            )
            with urllib.request.urlopen(request, timeout=timeout) as req:
                image = PIL.Image.open(io.BytesIO(req.read()))
            break
        except Exception:
            image = None
    return image


def fetch_images(batch, num_threads, timeout=None, retries=0):
    # Download all URLs in a batch concurrently via a thread pool.
    fetch_single_image_with_args = partial(fetch_single_image, timeout=timeout, retries=retries)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        batch["image"] = list(executor.map(fetch_single_image_with_args, batch["url"]))
    return batch


num_threads = 20
dset = dataset.map(
    fetch_images,
    batched=True,
    batch_size=100,
    fn_kwargs={"num_threads": num_threads},
)

dset.push_to_hub("davanstrien/berlin_state_library_ocr_with_images")
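
# Because fetch_single_image returns None for any page whose download
# failed, the resulting dataset can contain image-less rows. A minimal
# follow-up sketch (an assumption, not part of the original pipeline)
# drops those rows before downstream use; it assumes `dset` from above.
dset_with_images = dset.filter(
    lambda batch: [img is not None for img in batch["image"]],
    batched=True,
)
print(f"kept {len(dset_with_images)} of {len(dset)} rows with images")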