import io | |
import json | |
import httpx | |
from huggingface_hub import HfFileSystem, hf_hub_url | |
from huggingface_hub.utils import build_hf_headers | |
# Shared async HTTP client for the loaders in this module; follow_redirects=True
# makes it follow HTTP redirects instead of returning the 3xx response.
client = httpx.AsyncClient(follow_redirects=True)
# Hugging Face Hub filesystem handle used by glob().
fs = HfFileSystem()
def glob(path):
    """Return whatever the module-level ``fs.glob`` yields for the pattern ``path``."""
    return fs.glob(path)
async def load_json_file(path):
    """Download a JSON file from a Hub dataset repo and return its parsed content.

    Args:
        path: Repo path in the form accepted by ``to_url``.

    Returns:
        The value decoded from the response body by ``Response.json()``.

    Raises:
        httpx.HTTPStatusError: If the final response has a non-success status.
    """
    url = to_url(path)
    # Send the Hub auth headers, as load_jsonlines_file does, so that private
    # or gated repos can be fetched too.
    r = await client.get(url, headers=build_hf_headers())
    # Fail loudly on an error status instead of parsing the error body as data.
    r.raise_for_status()
    return r.json()
async def load_jsonlines_file(path):
    """Download a JSON Lines file from a Hub dataset repo and parse every record.

    Args:
        path: Repo path in the form accepted by ``to_url``.

    Returns:
        A list with one decoded JSON value per non-blank line of the response.

    Raises:
        httpx.HTTPStatusError: If the final response has a non-success status.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    url = to_url(path)
    r = await client.get(url, headers=build_hf_headers())
    # Surface HTTP errors directly rather than as a JSON decode failure on the
    # error body.
    r.raise_for_status()
    f = io.StringIO(r.text)
    # Skip blank / whitespace-only lines (e.g. a trailing empty line), which
    # json.loads would reject.
    return [json.loads(line) for line in f if line.strip()]
def to_url(path):
    """Build the Hub download URL for a dataset file path.

    ``path`` is split on its first three ``/`` separators into four parts: a
    leading segment (ignored), the organisation, the dataset name and the file
    path inside the repo. A ``ValueError`` is raised by the unpacking when the
    path has fewer than four parts.
    """
    _prefix, owner, dataset, repo_file = path.split("/", 3)
    repo_id = "/".join((owner, dataset))
    return hf_hub_url(repo_id=repo_id, filename=repo_file, repo_type="dataset")