"""Giskard Retriever: list Hugging Face models that are bound to datasets.

The script queries the Hugging Face Hub for models of a given type, keeps
only those carrying at least one ``dataset:<name>`` tag, and writes a table
with one row per model to ``models_<model_type>.<ext>`` in the current
working directory.
"""
import argparse

import huggingface_hub

# Tags of the form "dataset:<name>" mark the datasets a model is bound to.
DATASET_TAG_PREFIX = "dataset:"
DEFAULT_MODEL_TYPE = "text-classification"
OUTPUT_FORMATS = ("parquet", "csv", "json")


def _dataset_names(model):
    """Return the dataset names found in ``model.tags``, prefix stripped.

    A missing (``None``) tag list is treated as empty.
    """
    prefix_len = len(DATASET_TAG_PREFIX)
    return [
        tag[prefix_len:]
        for tag in (model.tags or ())
        if tag.startswith(DATASET_TAG_PREFIX)
    ]


def model_has_dataset(model):
    """Return True if *model* has at least one ``dataset:`` tag.

    Args:
        model: Any object exposing a ``tags`` attribute that is an iterable
            of strings or ``None``.

    Returns:
        bool: True when some tag starts with ``"dataset:"``, else False.
    """
    return any(
        tag.startswith(DATASET_TAG_PREFIX) for tag in (model.tags or ())
    )


def _parse_args(argv=None):
    """Parse the command line (``sys.argv`` when *argv* is None)."""
    parser = argparse.ArgumentParser(
        prog="Giskard Retriever",
        description="Retrieves HF models that are bound to datasets.",
    )
    parser.add_argument(
        "--model_type",
        help="Hugging Face model types. default: text-classification",
        required=False,
        default=DEFAULT_MODEL_TYPE,
    )
    parser.add_argument(
        "--output_format",
        help="Format of the information retrieved. Default: parquet. Options: parquet, csv, json.",
        # Reject unsupported formats up front, before listing models from
        # the Hub, instead of finishing silently without writing a file.
        choices=OUTPUT_FORMATS,
        default="parquet",
    )
    return parser.parse_args(argv)


def _write_output(df, model_type, output_format):
    """Write *df* to ``models_<model_type>.<ext>`` in the requested format.

    Raises:
        ValueError: If *output_format* is not one of ``OUTPUT_FORMATS``.
    """
    if output_format == "parquet":
        df.to_parquet(f"models_{model_type}.parquet", index=False)
    elif output_format == "csv":
        df.to_csv(f"models_{model_type}.csv", index=False)
    elif output_format == "json":
        # ``index=False`` is rejected by pandas for the default "columns"
        # orient, so the previous call always raised. "records" emits one
        # JSON object per model and carries no index.
        df.to_json(f"models_{model_type}.json", orient="records")
    else:
        raise ValueError(f"Unsupported output format: {output_format!r}")


def main(argv=None):
    """Retrieve the models and write them to disk.

    Args:
        argv: Optional list of command-line arguments; defaults to
            ``sys.argv[1:]`` when None.
    """
    # Imported here so that importing this module (e.g. to reuse
    # ``model_has_dataset``) does not require pandas.
    import pandas as pd

    args = _parse_args(argv)
    model_type = args.model_type

    # Models are requested sorted by likes, in descending order.
    candidates = huggingface_hub.list_models(
        filter=model_type, sort="likes", direction=-1
    )

    rows = [
        {
            "modelId": model.modelId,
            "modelType": model_type,
            "author": model.author,
            "downloads": model.downloads,
            "likes": model.likes,
            "datasets": _dataset_names(model),
        }
        for model in candidates
        if model_has_dataset(model)
    ]

    _write_output(pd.DataFrame(rows), model_type, args.output_format)


if __name__ == "__main__":
    main()