Spaces:
Sleeping
Sleeping
import requests | |
import pandas as pd | |
import streamlit as st | |
catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d') | |
# TODO - extract from the catalog name | |
BASE_SUMMARY_METRICS = [ | |
"Catalog last update date", | |
"Unique Polish speech datasets producers", | |
"Identified datasets reported in the public domain", | |
"Datasets available to the public (free and paid)", | |
"Fraction of reported datasets available to the public [%]", | |
"Speech data reported in the public domain [hours]", | |
"Speech data available total [hours]", | |
"Speech data available free of charge [hours]", | |
"Speech data available commercially [hours]", | |
"Reported vs available speech data ratio [%]", | |
"Transcribed speech data reported in the public domain [hours]", | |
"Transcribed speech data available total [hours]", | |
"Transcribed speech data available free of charge [hours]", | |
"Transcribed speech data available commercially [hours]", | |
"Reported vs available transcribed speech data ratio [%]", | |
] | |
def download_tsv_from_google_sheet(sheet_url): | |
# Modify the Google Sheet URL to export it as TSV | |
tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=') | |
# Send a GET request to download the TSV file | |
response = requests.get(tsv_url) | |
# Check if the request was successful | |
if response.status_code == 200: | |
# Read the TSV content into a pandas DataFrame | |
from io import StringIO | |
tsv_content = StringIO(response.text) | |
df = pd.read_csv(tsv_content, sep='\t') | |
return df | |
else: | |
print("Failed to download the TSV file.") | |
return None | |
def load_data_catalog(): | |
print("Reading speech data catalog") | |
catalog_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0" | |
df_catalog = download_tsv_from_google_sheet(catalog_url) | |
return(df_catalog) | |
def load_data_taxonomy(): | |
print("Reading speech data survey taxonomy") | |
taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057" | |
df_taxonomy = download_tsv_from_google_sheet(taxonomy_url) | |
return(df_taxonomy) | |
def load_bench_catalog(): | |
print("Reading ASR benchmarks catalog") | |
catalog_url="https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit#gid=0" | |
df_catalog = download_tsv_from_google_sheet(catalog_url) | |
return(df_catalog) | |
def load_bench_taxonomy(): | |
print("Reading ASR benchmarks survey taxonomy") | |
taxonomy_url="https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057" | |
df_taxonomy = download_tsv_from_google_sheet(taxonomy_url) | |
return(df_taxonomy) | |
def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']): | |
""" | |
Function to generate a summary view of datasets by speech type and other relevant metrics. | |
Args: | |
- df_cat (pd.DataFrame): The base dataframe containing dataset information. | |
- col_sum (str or list): The column(s) to sum. | |
- col_count (str or list): The column(s) to count. | |
- col_groupby (str or list): The column(s) to group the datasets by. | |
- col_percent (str): The column to calculate the percentage of total. | |
Returns: | |
- pd.DataFrame: A dataframe summarizing datasets by speech type and other relevant metrics. | |
""" | |
# Convert col_sum, col_count, and col_groupby to lists if they are not already | |
if not isinstance(col_sum, list): | |
col_sum = [col_sum] | |
if not isinstance(col_count, list): | |
col_count = [col_count] | |
if not isinstance(col_groupby, list): | |
col_groupby = [col_groupby] | |
# First, ensure that the data types and potential missing values are handled correctly | |
for col in col_sum: | |
num_values = df_cat[col].apply(lambda x: pd.to_numeric(x, errors='coerce')).fillna(0) | |
df_cat[col] = num_values | |
# Aggregating datasets by provided column type | |
summary = df_cat.groupby(col_groupby).agg({ | |
**{col: 'sum' for col in col_sum}, | |
**{col: 'count' for col in col_count} | |
}).reset_index() | |
col_name_percent = 'Percent of total' | |
if col_percent is not None: | |
# Calculating the percentage | |
total = summary[col_percent].sum(axis=1) | |
summary[col_name_percent] = round(total / total.sum() * 100, 2) | |
# Sorting the summary by the sum of the column | |
summary.sort_values(by=col_sum[0], ascending=False, inplace=True) | |
# Replacing index with the groupby column | |
summary.reset_index(drop=True, inplace=True) | |
summary.set_index(col_groupby, inplace=True) | |
# Rename the column to a more descriptive name | |
if len(col_count) == 0: | |
col_name_count = None | |
elif len(col_count) == 1: | |
col_name_count = 'Count ' + col_count[0] | |
summary.rename(columns={col_count[0]: col_name_count }, inplace=True) | |
summary[col_name_count] = summary[col_name_count].astype(int) | |
else: | |
#TODO - add support for renaming multiple count columns | |
pass | |
# Make the order of columns as follows 'Count Dataset ID', Total transcribed [hours], 'Percent of total' | |
if col_percent is None: | |
if col_name_count not in summary.columns: | |
summary = summary[col_sum] | |
else: | |
summary = summary[[col_name_count] + col_sum] | |
else: | |
if col_name_count not in summary.columns: | |
summary = summary[col_sum + [col_name_percent]] | |
else: | |
summary = summary[[col_name_count] + col_sum + [col_name_percent]] | |
# Sort by the provided column col_sort | |
col_sort = col_groupby if col_sort is None else col_sort | |
summary.sort_values(by=col_sort, ascending=False, inplace=True) | |
# Replace 0 with no-info in columns with sum | |
for col in col_sum: | |
summary[col] = summary[col].replace(0, 'no-info') | |
return summary | |
def datasets_count_and_size_standard(df_cat, col_groupby): | |
return datasets_count_and_size(df_cat, col_groupby, col_sort=col_groupby, col_percent=['Size audio transcribed [hours]'], col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count=['Dataset ID']) | |
def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid): | |
#TODO - add number of speakers and recordings | |
# 'Speaker id info', 'Part of speech annotation', 'Named entity annotation', 'Emotion annotation' | |
meta_data_cols = ['Gender info', 'Age info', 'Accent info', 'Nativity info', 'Time alignement annotation'] | |
meta_coverage_all_sets = {} | |
meta_coverage_free_sets = {} | |
meta_coverage_paid_sets = {} | |
col_name_sum_size = 'Size audio transcribed [hours]' | |
col_name_count = 'Count Dataset ID' | |
col_name_percent = 'Percent of total' | |
#, 'Named entity annotation', 'Emotion annotation'] | |
for meta_data_col in meta_data_cols: | |
df_datasets_per_meta_paid = datasets_count_and_size_standard(df_cat_available_paid, meta_data_col) | |
#print(df_datasets_per_meta_paid) | |
if 'yes' in df_datasets_per_meta_paid.index: | |
meta_coverage_paid_sets[meta_data_col] = df_datasets_per_meta_paid.loc['yes'] | |
else: | |
meta_coverage_paid_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0} | |
df_datasets_per_meta_all = datasets_count_and_size_standard(df_cat, meta_data_col) | |
#print(df_datasets_per_meta_all) | |
# select row where index has value "yes" and column name is "Percent of total" | |
if 'yes' in df_datasets_per_meta_all.index: | |
meta_coverage_all_sets[meta_data_col] = df_datasets_per_meta_all.loc['yes'] | |
else: | |
meta_coverage_all_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0} | |
df_datasets_per_meta_free = datasets_count_and_size_standard(df_cat_available_free, meta_data_col) | |
#print(df_datasets_per_meta_free) | |
# check if index has value "yes", if not assign 0 | |
if 'yes' in df_datasets_per_meta_free.index: | |
meta_coverage_free_sets[meta_data_col] = df_datasets_per_meta_free.loc['yes'] | |
else: | |
meta_coverage_free_sets[meta_data_col] = {col_name_sum_size:0, col_name_count:0, col_name_percent:0} | |
#merge all free and paid dataframes | |
df_meta_free = pd.DataFrame.from_dict(meta_coverage_free_sets, orient='index') | |
df_meta_free[col_name_count] = df_meta_free[col_name_count].astype(int) | |
df_meta_paid = pd.DataFrame.from_dict(meta_coverage_paid_sets, orient='index') | |
df_meta_paid[col_name_count] = df_meta_paid[col_name_count].astype(int) | |
df_meta_free['Type'] = 'Free' | |
df_meta_paid['Type'] = 'Paid' | |
df_meta_all_flat = pd.concat([df_meta_free, df_meta_paid]) | |
#transform to compare free and paid column by column | |
df_meta_all_pivot = df_meta_all_flat.reset_index() | |
df_meta_all_pivot = df_meta_all_pivot.rename(columns={'index':'Metadata'}) | |
df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent]) | |
df_meta_all_pivot[col_name_count]=df_meta_all_pivot[col_name_count].astype(int) | |
return(df_meta_all_flat, df_meta_all_pivot) | |
def catalog_summary_statistics(df_cat): | |
""" | |
Function to generate summary statistics for the speech data catalog. | |
Args: | |
- df_cat (pd.DataFrame): The base dataframe containing dataset information. | |
Returns: | |
- pd.DataFrame: A dataframe summarizing the speech data catalog. | |
""" | |
col_name_transcribed = 'Size audio transcribed [hours]' | |
col_name_audio= 'Size audio total [hours]' | |
# Convert numerical fields to numeric type | |
df_cat[col_name_audio] = pd.to_numeric(df_cat[col_name_audio], errors='coerce') | |
df_cat[col_name_transcribed] = pd.to_numeric(df_cat[col_name_transcribed], errors='coerce') | |
# Filter out non-available datasets | |
df_cat_available = df_cat[df_cat['Available online'] == 'yes'] | |
df_cat_free = df_cat[df_cat['Price - non-commercial usage'] == 'free'] | |
df_cat_commercial = df_cat[df_cat['Price - non-commercial usage'] != 'free'] | |
# Available and free | |
df_cat_available_free = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] == 'free')] | |
# Available and paid | |
df_cat_available_paid = df_cat[(df_cat['Available online'] == 'yes') & (df_cat['Price - non-commercial usage'] != 'free')] | |
# Basic Calculations | |
identified_datasets_count = df_cat.shape[0] | |
accessible_datasets_count = df_cat_available.shape[0] | |
unique_producers_count = df_cat['Publisher'].nunique() | |
accessible_datasets_fraction = round((accessible_datasets_count / identified_datasets_count) * 100, 2) | |
# Total audio available and other dependent calculations | |
audio_reported = round(df_cat[col_name_audio].sum(), 2) | |
audio_accessible = round(df_cat_available[col_name_audio].sum(), 2) | |
audio_accessible_free = round(df_cat_available_free[col_name_audio].sum(), 2) | |
audio_accessible_paid = round(df_cat_available_paid[col_name_audio].sum(), 2) | |
transcribed_audio_reported = round(df_cat[col_name_transcribed].sum(), 2) | |
transcribed_audio_accessible = round(df_cat_available[col_name_transcribed].sum(), 2) | |
transcribed_audio_accessible_free = round(df_cat_available_free[col_name_transcribed].sum(), 2) | |
transcribed_audio_accessible_paid = round(df_cat_available_paid[col_name_transcribed].sum(), 2) | |
# available vs Reported Speech Material Ratio | |
accessible_vs_reported_audio_ratio = round((audio_accessible / audio_reported) * 100, 2) | |
accessible_vs_reported_transcribed_ratio = round((transcribed_audio_accessible / transcribed_audio_reported) * 100, 2) | |
# Finalizing the metrics dictionary | |
metrics_dict = { | |
"Metric": BASE_SUMMARY_METRICS, | |
"Value": [ | |
catalog_last_update_date, | |
unique_producers_count, | |
identified_datasets_count, | |
accessible_datasets_count, | |
accessible_datasets_fraction, | |
audio_reported, | |
audio_accessible, | |
audio_accessible_free, | |
audio_accessible_paid, | |
accessible_vs_reported_audio_ratio, | |
transcribed_audio_reported, | |
transcribed_audio_accessible, | |
transcribed_audio_accessible_free, | |
transcribed_audio_accessible_paid, | |
accessible_vs_reported_transcribed_ratio, | |
] | |
} | |
# Convert the dictionary into a DataFrame | |
metrics_df = pd.DataFrame(metrics_dict) | |
metrics_df.reset_index(drop=True, inplace=True) | |
metrics_df.set_index("Metric", inplace=True) | |
return(metrics_df) |