medmac01
Added multilingual_clip module
3bd5293
raw
history blame
269 Bytes
import pandas as pd
import json
# Convert the filtered synthetic caption JSON into a Feather file, adding an
# "index" column and a per-caption word count along the way.

# JSON text is UTF-8 by specification; pass the encoding explicitly so decoding
# does not depend on the platform's locale default.
with open("data/ccs_synthetic_filtered_large.json", encoding="utf-8") as f:
    d = json.load(f)

df = pd.DataFrame(d)

# Explicit "index" column derived from the frame's index, offset by one
# (1-based when the frame has the default 0..n-1 index -- presumably the case
# here; confirm against the JSON layout).
df["index"] = df.index + 1

# Number of whitespace-separated tokens in each caption.
# NOTE(review): assumes every "caption" value is a string; a missing/None
# caption would raise AttributeError here.
df["nr_words"] = df["caption"].apply(lambda x: len(x.split()))

df.to_feather("data/ccs_synthetic.feather")