"""Convert the filtered synthetic caption JSON file into a Feather file.

Reads ``data/ccs_synthetic_filtered_large.json``, loads it into a pandas
DataFrame, adds an ``index`` column and a ``nr_words`` column, and writes
the result to ``data/ccs_synthetic.feather``.
"""
import json

import pandas as pd

# Decode explicitly as UTF-8 (the standard JSON encoding) rather than relying
# on the platform's locale default, which can mis-decode non-ASCII captions.
with open("data/ccs_synthetic_filtered_large.json", encoding="utf-8") as f:
    d = json.load(f)

df = pd.DataFrame(d)

# Existing DataFrame index shifted by one; this is a 1-based row number when
# the frame has the default 0-based integer index.
df["index"] = df.index + 1

# Number of whitespace-separated tokens in each caption.
# NOTE(review): assumes every "caption" value is a string -- a missing or
# non-string caption would raise AttributeError here; confirm against the data.
df["nr_words"] = df["caption"].apply(lambda x: len(x.split()))

df.to_feather("data/ccs_synthetic.feather")