# Uploaded by davanstrien (HF staff).
# Commit message: "Upload folder using huggingface_hub"
# Commit: 39f4de2
import io
import os
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from functools import partial

# The hf_transfer flag must be in the environment BEFORE `datasets` is
# imported: `datasets` imports `huggingface_hub`, which reads this variable
# when it is first imported. Setting it afterwards has no effect.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import PIL.Image
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
from tqdm.auto import tqdm

# Load the OCR dataset and pull out the two columns needed to build the
# per-page image URLs.
dataset = load_dataset("biglam/berlin_state_library_ocr")
file_names = dataset['train']['file name']
ppns = dataset['train']['ppn']
# Sanity check: the two columns are zipped together below.
assert len(file_names) == len(ppns)
def create_url(filename, PPN):
    """Return the content-server JPEG URL for one page of a document.

    Args:
        filename: Page file name; everything before the first ``.`` is used
            as the page identifier.
        PPN: Document identifier, inserted after the ``PPN`` prefix in the URL.

    Returns:
        The URL string for the page's ``default.jpg``.
    """
    # partition() yields the text before the first dot (or the whole name
    # when there is no dot).
    page, _, _ = filename.partition(".")
    return f"https://content.staatsbibliothek-berlin.de/dc/PPN{PPN}-{page}/full/full/0/default.jpg"
# Build one image URL per row. zip() has no length, so pass `total`
# explicitly to let tqdm show real progress instead of a bare counter.
urls = [
    create_url(file_name, ppn)
    for file_name, ppn in tqdm(zip(file_names, ppns), total=len(file_names))
]
dataset = dataset['train'].add_column("url", urls)
# Keep at most the first 100,000 rows; min() avoids an out-of-range error
# when the dataset is smaller than that.
dataset = dataset.select(range(min(100_000, len(dataset))))
# User-agent string sent with every image request.
USER_AGENT = get_datasets_user_agent()
def fetch_single_image(image_url, timeout=None, retries=0):
    """Download one image and open it with PIL, returning ``None`` on failure.

    Args:
        image_url: URL of the image to download.
        timeout: Timeout value passed straight through to ``urlopen``.
        retries: Number of additional attempts made after a failed one.

    Returns:
        The opened PIL image, or ``None`` if every attempt raised or if
        ``retries`` is negative (in which case no attempt is made).
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly here.
    import urllib.request

    # Initialise up front so the function still returns None when the loop
    # body never runs (retries < 0) instead of raising UnboundLocalError.
    image = None
    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(
                image_url,
                data=None,
                headers={"user-agent": USER_AGENT},
            )
            with urllib.request.urlopen(request, timeout=timeout) as response:
                image = PIL.Image.open(io.BytesIO(response.read()))
            break
        except Exception:
            # Deliberate best-effort: a failed download or an unreadable
            # image yields None so the rest of the batch keeps going.
            image = None
    return image
def fetch_images(batch, num_threads, timeout=None, retries=0):
    """Download every URL in ``batch["url"]`` concurrently.

    Args:
        batch: Mapping with a ``"url"`` entry holding the URLs to fetch; it
            is modified in place.
        num_threads: Maximum number of worker threads in the pool.
        timeout: Forwarded to ``fetch_single_image`` for each URL.
        retries: Forwarded to ``fetch_single_image`` for each URL.

    Returns:
        The same ``batch`` object with an ``"image"`` entry added, holding
        one result per URL in the same order as ``batch["url"]``.
    """

    def _download(url):
        return fetch_single_image(url, timeout=timeout, retries=retries)

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        # map() yields results in input order, keeping images aligned
        # with their URLs.
        images = list(pool.map(_download, batch["url"]))
    batch["image"] = images
    return batch
# Fetch the images in batches of 100 rows; each batch is downloaded with a
# pool of `num_threads` worker threads.
num_threads = 20
dset = dataset.map(
    fetch_images,
    batched=True,
    batch_size=100,
    fn_kwargs={"num_threads": num_threads},
)
# Publish the dataset, now including the image column, to the Hub.
dset.push_to_hub("davanstrien/berlin_state_library_ocr_with_images")