import argparse
import os
import pickle
from copy import deepcopy

import pandas as pd
from tqdm import tqdm

from Preprocessing.multilinguality.SimilaritySolver import SimilaritySolver
from Utility.storage_config import MODELS_DIR
from Utility.utils import load_json_from_path

ISO_LOOKUP_PATH = "iso_lookup.json"
ISO_TO_FULLNAME_PATH = "iso_to_fullname.json"
LANG_PAIRS_MAP_PATH = "lang_1_to_lang_2_to_map_dist.json"
LANG_PAIRS_TREE_PATH = "lang_1_to_lang_2_to_tree_dist.json"
LANG_PAIRS_ASP_PATH = "asp_dict.pkl"
LANG_PAIRS_LEARNED_DIST_PATH = "lang_1_to_lang_2_to_learned_dist.json"
LANG_PAIRS_ORACLE_PATH = "lang_1_to_lang_2_to_oracle_dist.json"
SUPVERVISED_LANGUAGES_PATH = "supervised_languages.json"
DATASET_SAVE_DIR = "distance_datasets/"


class LangDistDatasetCreator():
    def __init__(self, model_path, cache_root="."):
        self.model_path = model_path
        self.cache_root = cache_root
        self.lang_pairs_map = None
        self.largest_value_map_dist = None
        self.lang_pairs_tree = None
        self.lang_pairs_asp = None
        self.lang_pairs_learned_dist = None
        self.lang_pairs_oracle = None
        self.supervised_langs = load_json_from_path(os.path.join(cache_root, SUPVERVISED_LANGUAGES_PATH))
        self.iso_lookup = load_json_from_path(os.path.join(cache_root, ISO_LOOKUP_PATH))
        self.iso_to_fullname = load_json_from_path(os.path.join(cache_root, ISO_TO_FULLNAME_PATH))

    def load_required_distance_lookups(self, distance_type, excluded_distances=[]):
        # init required distance lookups
        print(f"Loading required distance lookups for distance_type '{distance_type}'.")
        try:
            if distance_type == "combined":
                if "map" not in excluded_distances and not self.lang_pairs_map:
                    self.lang_pairs_map = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_MAP_PATH))
                    self.largest_value_map_dist = 0.0
                    for _, values in self.lang_pairs_map.items():
                        for _, value in values.items():
                            self.largest_value_map_dist = max(self.largest_value_map_dist, value)
                if "tree" not in excluded_distances and not self.lang_pairs_tree:
                    self.lang_pairs_tree = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_TREE_PATH))
                if "asp" not in excluded_distances and not self.lang_pairs_asp:
                    with open(os.path.join(self.cache_root, LANG_PAIRS_ASP_PATH), "rb") as f:
                        self.lang_pairs_asp = pickle.load(f)
            elif distance_type == "map" and not self.lang_pairs_map:
                self.lang_pairs_map = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_MAP_PATH))
                self.largest_value_map_dist = 0.0
                for _, values in self.lang_pairs_map.items():
                    for _, value in values.items():
                        self.largest_value_map_dist = max(self.largest_value_map_dist, value)
            elif distance_type == "tree" and not self.lang_pairs_tree:
                self.lang_pairs_tree = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_TREE_PATH))
            elif distance_type == "asp" and not self.lang_pairs_asp:
                with open(os.path.join(self.cache_root, LANG_PAIRS_ASP_PATH), "rb") as f:
                    self.lang_pairs_asp = pickle.load(f)
            elif distance_type == "learned" and not self.lang_pairs_learned_dist:
                self.lang_pairs_learned_dist = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_LEARNED_DIST_PATH))
            elif distance_type == "oracle" and not self.lang_pairs_oracle:
                self.lang_pairs_oracle = load_json_from_path(os.path.join(self.cache_root, LANG_PAIRS_ORACLE_PATH))
        except FileNotFoundError as e:
            raise FileNotFoundError("Please create all lookup files via create_distance_lookups.py") from e

    def create_dataset(self,
                       distance_type: str = "learned",
                       zero_shot: bool = False,
                       n_closest: int = 50,
                       excluded_languages: list = [],
                       excluded_distances: list = [],
                       find_furthest: bool = False,
                       individual_distances: bool = False,
                       write_to_csv=True):
        """Create a dataset of the given feature's distances and save it to a CSV file."""
        distance_types = ["learned", "map", "tree", "asp", "combined", "random", "oracle"]
        if distance_type not in distance_types:
            raise ValueError(f"Invalid distance type '{distance_type}'. Expected one of {distance_types}")
        dataset_dict = dict()
        self.load_required_distance_lookups(distance_type, excluded_distances)
        sim_solver = SimilaritySolver(tree_dist=self.lang_pairs_tree,
                                      map_dist=self.lang_pairs_map,
                                      largest_value_map_dist=self.largest_value_map_dist,
                                      asp_dict=self.lang_pairs_asp,
                                      learned_dist=self.lang_pairs_learned_dist,
                                      oracle_dist=self.lang_pairs_oracle,
                                      iso_to_fullname=self.iso_to_fullname)
        supervised_langs = sorted(self.supervised_langs)
        remove_langs_suffix = ""
        if len(excluded_languages) > 0:
            remove_langs_suffix = "_no-illegal-langs"
            for excl_lang in excluded_languages:
                supervised_langs.remove(excl_lang)
        individual_dist_suffix, excluded_feat_suffix = "", ""
        if distance_type == "combined":
            if individual_distances:
                individual_dist_suffix = "_indiv-dists"
            if len(excluded_distances) > 0:
                excluded_feat_suffix = "_excl-" + "-".join(excluded_distances)
        furthest_suffix = "_furthest" if find_furthest else ""
        zero_shot_suffix = ""
        if zero_shot:
            iso_codes_to_ids = deepcopy(self.iso_lookup)[-1]
            zero_shot_suffix = "_zeroshot"
            # leave supervised-pretrained language embeddings untouched
            for sup_lang in supervised_langs:
                iso_codes_to_ids.pop(sup_lang, None)
            lang_codes = list(iso_codes_to_ids)
        else:
            lang_codes = supervised_langs
        failed_langs = []
        if distance_type == "random":
            random_seed = 0
        sorted_by = "closest" if not find_furthest else "furthest"

        for lang in tqdm(lang_codes, desc=f"Retrieving {sorted_by} distances"):
            if distance_type == "combined":
                feature_dict = sim_solver.find_closest_combined_distance(lang,
                                                                         supervised_langs,
                                                                         k=n_closest,
                                                                         individual_distances=individual_distances,
                                                                         excluded_features=excluded_distances,
                                                                         find_furthest=find_furthest)
            elif distance_type == "random":
                random_seed += 1
                dataset_dict[lang] = [lang]  # target language as first column
                feature_dict = sim_solver.find_closest(distance_type,
                                                       lang,
                                                       supervised_langs,
                                                       k=n_closest,
                                                       find_furthest=find_furthest,
                                                       random_seed=random_seed)
            else:
                feature_dict = sim_solver.find_closest(distance_type,
                                                       lang,
                                                       supervised_langs,
                                                       k=n_closest,
                                                       find_furthest=find_furthest)
            # discard incomplete results
            if len(feature_dict) < n_closest:
                failed_langs.append(lang)
                continue
            dataset_dict[lang] = [lang]  # target language as first column
            # create entry for a single close lang (`feature_dict` must be sorted by distance)
            for _, close_lang in enumerate(feature_dict):
                if distance_type == "combined":
                    dist_combined = feature_dict[close_lang]["combined_distance"]
                    close_lang_feature_list = [close_lang, dist_combined]
                    if individual_distances:
                        indiv_dists = feature_dict[close_lang]["individual_distances"]
                        close_lang_feature_list.extend(indiv_dists)
                else:
                    dist = feature_dict[close_lang]
                    close_lang_feature_list = [close_lang, dist]
                # column order: compared close language, {feature}_dist (plus optionally indiv dists)
                dataset_dict[lang].extend(close_lang_feature_list)

        # prepare df columns
        dataset_columns = ["target_lang"]
        for i in range(n_closest):
            dataset_columns.extend([f"closest_lang_{i}", f"{distance_type}_dist_{i}"])
            if distance_type == "combined" and individual_distances:
                if "map" not in excluded_distances:
                    dataset_columns.append(f"map_dist_{i}")
                if "asp" not in excluded_distances:
                    dataset_columns.append(f"asp_dist_{i}")
                if "tree" not in excluded_distances:
                    dataset_columns.append(f"tree_dist_{i}")

        df = pd.DataFrame.from_dict(dataset_dict, orient="index")
        df.columns = dataset_columns
        if write_to_csv:
            out_path = os.path.join(os.path.join(self.cache_root, DATASET_SAVE_DIR),
                                    f"dataset_{distance_type}_top{n_closest}{furthest_suffix}{zero_shot_suffix}{remove_langs_suffix}{excluded_feat_suffix}{individual_dist_suffix}" + ".csv")
            os.makedirs(os.path.join(self.cache_root, DATASET_SAVE_DIR), exist_ok=True)
            df.to_csv(out_path, sep="|", index=False)
        print(f"Successfully retrieved distances for {len(lang_codes) - len(failed_langs)}/{len(lang_codes)} languages.")
        if len(failed_langs) > 0:
            print(f"Failed to retrieve distances for the following {len(failed_langs)} languages:\n{failed_langs}")
        return df


if __name__ == "__main__":
    default_model_path = os.path.join(MODELS_DIR, "ToucanTTS_Meta", "best.pt")  # MODELS_DIR must be an absolute path; a relative path would fail here
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", "-m", type=str, default=default_model_path,
                        help="model path from which to obtain pretrained language embeddings")
    args = parser.parse_args()
    dc = LangDistDatasetCreator(args.model_path)
    excluded_langs = []

    # create datasets for evaluation of approx. lang emb methods on supervised languages
    dataset = dc.create_dataset(distance_type="tree", n_closest=30, zero_shot=False)
    dataset = dc.create_dataset(distance_type="map", n_closest=30, zero_shot=False, excluded_languages=excluded_langs)
    dataset = dc.create_dataset(distance_type="map", n_closest=30, zero_shot=False, find_furthest=True)
    dataset = dc.create_dataset(distance_type="asp", n_closest=30, zero_shot=False)
    dataset = dc.create_dataset(distance_type="random", n_closest=30, zero_shot=False, excluded_languages=excluded_langs)
    dataset = dc.create_dataset(distance_type="combined", n_closest=30, zero_shot=False, individual_distances=True)
    dataset = dc.create_dataset(distance_type="learned", n_closest=30, zero_shot=False)
    dataset = dc.create_dataset(distance_type="oracle", n_closest=30, zero_shot=False)
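
    # Sketch (not part of the evaluation runs above): the same method can also build
    # zero-shot datasets, i.e. rank the supervised languages for every language in the
    # ISO lookup that has no supervised data, assuming the lookup files cover those
    # languages. Uncomment to try, e.g.:
    # dataset = dc.create_dataset(distance_type="learned", n_closest=30, zero_shot=True)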