|
from ast import literal_eval |
|
|
|
def make_lang_list(row):
    """Parse the ``languages`` entry of *row* into a Python object.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) that can be
            indexed with the key ``"languages"``.

    Returns:
        An empty list when the entry is the placeholder string ``"none"``;
        otherwise whatever ``ast.literal_eval`` produces for the entry
        (presumably a list of language tags -- verify against the data).

    Raises:
        ValueError, SyntaxError: Propagated from ``literal_eval`` when the
            entry is not a valid Python literal.
    """
    raw = row["languages"]
    # "none" is a placeholder, not a literal to evaluate.
    return [] if raw == "none" else literal_eval(raw)
|
|
|
def language_count(row):
    """Return how many items the ``languages`` entry of *row* holds.

    Args:
        row: Any mapping-like object indexable with ``"languages"`` whose
            value supports ``len``.

    Returns:
        The length of ``row["languages"]``.
    """
    entries = row["languages"]
    return len(entries)
|
|
|
def _parse_language_cell(raw):
    """Turn a single ``languages`` cell into a Python object.

    The placeholder ``"none"`` becomes an empty list; any other value is
    evaluated with ``literal_eval``. This is the same rule ``make_lang_list``
    applies to a whole row, expressed per cell so it can be mapped over a
    column.
    """
    return [] if raw == "none" else literal_eval(raw)


def process_for_lang(data, modality):
    """Filter *data* by modality and derive per-row language statistics.

    The input frame is never modified; all derived columns are written to a
    copy, so the function can be called repeatedly on the same frame.

    Args:
        data: DataFrame with at least a ``modality`` and a ``languages``
            column. ``languages`` cells are expected to be missing, the
            string ``"False"``, an empty dict, or a string holding a Python
            literal (presumably a list of language tags -- verify against
            the data source).
        modality: ``"NLP"``, ``"Audio"`` or ``"Multimodal"`` to keep only
            rows whose ``modality`` column is ``"nlp"``, ``"audio"`` or
            ``"multimodal"`` respectively. Any other value keeps every row.

    Returns:
        A 4-tuple ``(data, no_lang_count, total_langs, unique_langs)``:

        * ``data`` -- the filtered copy with ``languages`` parsed and the
          added columns ``language_count`` (length of the parsed value) and
          ``multilingual`` (1 if ``"multilingual"`` is among the parsed
          languages, else 0).
        * ``no_lang_count`` -- number of rows whose ``languages`` value was
          missing, ``"False"`` or an empty dict.
        * ``total_langs`` -- number of distinct language entries.
        * ``unique_langs`` -- array of those distinct entries.

    Raises:
        ValueError, SyntaxError: Propagated from ``literal_eval`` when a
            ``languages`` cell is not a valid Python literal.
    """
    if modality == "NLP":
        data = data[data["modality"] == "nlp"]
    elif modality == "Audio":
        data = data[data["modality"] == "audio"]
    elif modality == "Multimodal":
        data = data[data["modality"] == "multimodal"]

    # Work on an explicit copy: the writes below must neither leak into the
    # caller's frame (unfiltered case) nor target a filtered slice.
    data = data.copy()

    # "False" strings and empty dicts are treated the same as missing values.
    data.loc[data["languages"] == "False", 'languages'] = None
    data.loc[data["languages"] == {}, 'languages'] = None

    # Count rows without language information before the placeholder is
    # filled in.
    no_lang_count = data["languages"].isna().sum()

    # Map over the column instead of applying row-wise over the frame:
    # DataFrame.apply(axis=1) on an empty frame hands back a DataFrame, which
    # cannot be assigned to a single column.
    data["languages"] = data["languages"].fillna('none').map(_parse_language_cell)
    data["language_count"] = data["languages"].map(len).astype("int64")

    # One entry per (row, language) pair for rows with at least one language.
    models_with_langs = data[data["language_count"] > 0]
    langs = models_with_langs["languages"].explode()
    langs = langs[langs != {}]
    unique_langs = langs.unique()
    total_langs = len(unique_langs)

    data['multilingual'] = (
        data["languages"]
        .map(lambda tags: int("multilingual" in tags))
        .astype("int64")
    )

    return data, no_lang_count, total_langs, unique_langs
|
|
|
def filter_multilinguality(data, linguality):
    """Select rows of *data* according to a multilinguality option.

    Args:
        data: DataFrame with ``multilingual`` and ``language_count`` columns.
        linguality: ``"Just Multilingual"`` keeps rows flagged
            ``multilingual == 1`` or having more than one language;
            ``"Three or more languages"`` keeps rows with a
            ``language_count`` of at least 3. Any other value applies no
            filter.

    Returns:
        The filtered DataFrame, or *data* itself when no filter applies.
    """
    if linguality == "Just Multilingual":
        # Either an explicit "multilingual" flag or several language tags.
        tagged = data["multilingual"].eq(1)
        several = data["language_count"].gt(1)
        keep = tagged | several
        return data[keep]

    if linguality == "Three or more languages":
        at_least_three = data["language_count"].ge(3)
        return data[at_least_three]

    return data