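# NOTE: the capture of this script began with the page-status text
# "Spaces: Runtime error" (presumably from the hosting page, not the script);
# it is kept here as a comment so the file remains valid Python.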
import io
import os
import urllib
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from functools import partial

import PIL.Image
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
from tqdm.auto import tqdm
# Request the `hf_transfer` backend for Hub transfers.
# NOTE(review): this flag is set after `datasets` has already been imported;
# huggingface_hub appears to read it at import time, so it may need to be set
# before those imports to take effect — confirm against the library docs.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

dataset = load_dataset("biglam/berlin_state_library_ocr")

# Pull out the two columns needed to build one image URL per row.
file_names = dataset["train"]["file name"]
ppns = dataset["train"]["ppn"]
assert len(file_names) == len(ppns)
def create_url(filename, PPN):
    """Return the image URL for one page of a Berlin State Library item.

    Args:
        filename: Page file name; everything before the first "." is used as
            the page identifier.
        PPN: Item identifier, inserted after the literal "PPN" prefix.

    Returns:
        The URL of the `full/full/0/default.jpg` rendition of that page on
        content.staatsbibliothek-berlin.de.
    """
    # Same result as split(".")[0], without building the intermediate list.
    page = filename.partition(".")[0]
    return f"https://content.staatsbibliothek-berlin.de/dc/PPN{PPN}-{page}/full/full/0/default.jpg"
# Build one image URL per row; `total` lets tqdm show real progress, since
# zip() has no length of its own.
urls = [
    create_url(f, p)
    for f, p in tqdm(zip(file_names, ppns), total=len(file_names))
]

dataset = dataset["train"].add_column("url", urls)

# Keep only the first 100,000 rows; clamp to the dataset length so a shorter
# dataset does not make select() index out of range.
dataset = dataset.select(range(min(100_000, len(dataset))))
# User-agent string sent with every image request (see fetch_single_image).
USER_AGENT = get_datasets_user_agent()
def fetch_single_image(image_url, timeout=None, retries=0):
    """Download one image and open it with PIL, best-effort.

    Args:
        image_url: URL of the image to download.
        timeout: Timeout passed to `urllib.request.urlopen`; None leaves the
            default in place.
        retries: Number of extra attempts after the first one fails.

    Returns:
        The opened `PIL.Image` on success, or None if every attempt failed
        (or if `retries` is negative, in which case no attempt is made).
    """
    # Initialise up front so a negative `retries` (zero loop iterations)
    # returns None instead of raising UnboundLocalError.
    image = None
    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(
                image_url,
                data=None,
                headers={"user-agent": USER_AGENT},
            )
            with urllib.request.urlopen(request, timeout=timeout) as req:
                image = PIL.Image.open(io.BytesIO(req.read()))
            break
        except Exception:
            # Deliberately broad: any network or decoding failure just means
            # "no image for this row"; the next attempt (if any) may succeed.
            image = None
    return image
def fetch_images(batch, num_threads, timeout=None, retries=0):
    """Download the images for a batch of rows using a thread pool.

    Args:
        batch: Mapping with a "url" entry holding the image URLs of the batch.
        num_threads: Maximum number of worker threads for the pool.
        timeout: Forwarded to `fetch_single_image` for each download.
        retries: Forwarded to `fetch_single_image` for each download.

    Returns:
        The same `batch` object, with an "image" entry added that holds one
        result per URL, in the same order as `batch["url"]`.
    """
    fetch_single_image_with_args = partial(
        fetch_single_image, timeout=timeout, retries=retries
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # executor.map preserves input order, so images line up with URLs.
        batch["image"] = list(
            executor.map(fetch_single_image_with_args, batch["url"])
        )
    return batch
num_threads = 20

# Download images in batches of 100 rows, each batch using up to
# `num_threads` worker threads.
dset = dataset.map(
    fetch_images,
    batched=True,
    batch_size=100,
    fn_kwargs={"num_threads": num_threads},
)

# Publish the dataset, now including the "image" column, to the Hub.
dset.push_to_hub('davanstrien/berlin_state_library_ocr_with_images')