File size: 1,350 Bytes
3bd5293 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import os
import pandas as pd
import numpy as np
# Chunk size: how many observations to translate per language.
n = 150000

# Language table. It supplies the language_code, language_name, multi_target,
# target_code and opus_mt_url columns that are selected further down.
df_blip = pd.read_csv("data/fine_tune_languages.csv", index_col=None)

# Caption data; only the columns needed downstream are kept.
source_columns = ["caption", "caption_sv", "url", "index"]
df = pd.read_feather("data/ccs_synthetic_sv.feather")[source_columns]

# Repeat every language row n times in a row, so each language covers a
# contiguous block of n rows, then attach those columns next to the captions.
# NOTE(review): the two frames are joined side by side on their index, which
# presumably expects the caption data to hold n rows per language -- confirm.
repeated_rows = df_blip.to_numpy().repeat(n, axis=0)
df2 = pd.DataFrame(repeated_rows, columns=df_blip.columns)
df = pd.concat([df, df2], axis=1)

# Placeholder column; it is only filled for the English rows below.
df["caption_multi"] = None

renamed_columns = {
    "language_code": "multi_language_code",
    "language_name": "multi_language_name",
}
output_columns = [
    "caption",
    "caption_sv",
    "caption_multi",
    "url",
    "multi_language_code",
    "multi_language_name",
    "multi_target",
    "target_code",
    "opus_mt_url",
    "index",
]
df = df.rename(columns=renamed_columns)[output_columns]

# Nullable integer dtype for the multi_target column.
df["multi_target"] = df["multi_target"].astype("Int64")

# For rows whose language code is "en", the existing caption is copied over
# into caption_multi.
is_english = df["multi_language_code"] == "en"
df.loc[is_english, "caption_multi"] = df.loc[is_english, "caption"]

# Cut the frame into consecutive pieces of n rows, each with a fresh index.
chunk_starts = range(0, len(df), n)
df_list = [df[start : start + n].reset_index(drop=True) for start in chunk_starts]

# Write one feather file per piece, named after its zero-padded position and
# the language code found in the piece's first row.
os.makedirs("data_multi", exist_ok=True)
for i, chunk in enumerate(df_list):
    code = chunk["multi_language_code"].iloc[0]
    part_num = f"{i:03d}"
    chunk.to_feather(f"data_multi/{part_num}_ccs_synthetic_{code}.feather")

# The full, unsplit frame is saved as well.
df.to_feather("data/ccs_synthetic_multi.feather")
|