"""Streamlit front-end for the Polish ASR survey.

Renders one tab per resource (speech-data catalog, survey statistics and
taxonomy, plus the benchmark counterparts) from dataframes supplied by the
loaders in ``utils``.
"""
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns

from app_utils import filter_dataframe, calculate_height_to_display
from contants import (
    INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,
    INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK,
    INFO_MAIN, CITATION_MAIN, HOWTO_TAXONOMY_CAT,
)
from utils import BASE_SUMMARY_METRICS
from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
from utils import left_align, right_align

# Column with the amount of transcribed audio; several tables are ranked by it.
_COL_TRANSCRIBED_HOURS = 'Size audio transcribed [hours]'
# Columns forwarded as ``col_sum`` to every datasets_count_and_size() call.
_SUMMED_COLUMNS = (_COL_TRANSCRIBED_HOURS, 'Audio recordings', 'Speakers')


def _count_and_size(df, col_groupby, col_sort, col_count, col_percent=None):
    """Call ``datasets_count_and_size`` with the ``col_sum`` set shared by all tables.

    Args:
        df: Dataframe forwarded as the first positional argument.
        col_groupby: List of column names forwarded as the grouping columns.
        col_sort: Column name, or list of names, forwarded as ``col_sort``.
        col_count: List of column names forwarded as ``col_count``.
        col_percent: Optional list of column names forwarded as ``col_percent``.

    Returns:
        The object returned by ``datasets_count_and_size`` (the callers
        below treat it as a dataframe).
    """
    return datasets_count_and_size(
        df,
        col_groupby,
        col_sort=col_sort,
        col_percent=col_percent,
        # A fresh list per call, so the shared constant can never be mutated.
        col_sum=list(_SUMMED_COLUMNS),
        col_count=col_count,
    )


def _ranked_by_transcribed_hours(df, col_groupby):
    """Aggregate ``df`` per ``col_groupby`` and order rows by transcribed hours.

    Args:
        df: Dataframe to aggregate.
        col_groupby: List of column names; used both for grouping and as
            the ``col_sort`` argument.

    Returns:
        The aggregated table sorted by transcribed hours, largest first.
    """
    df_stats = _count_and_size(
        df,
        col_groupby,
        col_sort=col_groupby,
        col_count=['Dataset ID'],
        col_percent=[_COL_TRANSCRIBED_HOURS],
    )
    return df_stats.sort_values(by=_COL_TRANSCRIBED_HOURS, ascending=False)


st.set_page_config(layout="wide")

# Load PL ASR data survey data.
# NOTE(review): nothing is cached in this script; if the data is meant to be
# loaded only once, the caching has to live inside the loaders - confirm.
df_data_cat = load_data_catalog()
df_data_tax = load_data_taxonomy()

# Filter out non available datasets
# NOTE(review): this frame is not used anywhere in the code visible here.
df_data_cat_available = df_data_cat[df_data_cat['Available online'] == 'yes']
# Available and free (the price is compared against the string '0')
df_data_cat_available_free = df_data_cat[
    (df_data_cat['Available online'] == 'yes')
    & (df_data_cat['Price - non-commercial usage'] == '0')
]
# Available and paid
# NOTE(review): "!= '0'" also matches rows whose price is missing or is
# formatted differently - confirm that this is intended.
df_data_cat_available_paid = df_data_cat[
    (df_data_cat['Available online'] == 'yes')
    & (df_data_cat['Price - non-commercial usage'] != '0')
]

# Load PL ASR benchmarks survey data
df_bench_cat = load_bench_catalog()
df_bench_tax = load_bench_taxonomy()

about, data_cat, data_survey, data_taxonomy, bench_cat, bench_survey, bench_taxonomy = st.tabs([
    "PL ASR survey",
    "PL ASR Speech Data **Catalog**",
    "PL ASR Speech data **Survey**",
    "ASR Speech Data **Taxonomy**",
    "PL ASR Benchmarks Catalog",
    "PL ASR Benchmarks Survey",
    "ASR Benchmarks Taxonomy",
])

with about:
    st.title("About Polish ASR Survey")
    st.markdown(INFO_MAIN, unsafe_allow_html=True)

    st.header("How to cite this resource?")
    st.markdown(CITATION_MAIN, unsafe_allow_html=True)

with data_cat:
    st.title("Polish ASR Speech Datasets Catalog")
    st.markdown(INFO_CATALOG, unsafe_allow_html=True)

    st.header("How to use?")
    st.markdown(HOWTO_CATALOG, unsafe_allow_html=True)

    # Display catalog contents
    st.header("Browse the catalog content")
    st.dataframe(filter_dataframe(df_data_cat, "datasets"), hide_index=True, use_container_width=True)

    st.header("How to cite this resource?")
    st.markdown(CITATION_CATALOG, unsafe_allow_html=True)

with data_survey:
    # Display summary statistics: three consecutive five-item slices of
    # BASE_SUMMARY_METRICS select the rows shown in each table.
    st.title("Polish ASR Speech Datasets Survey")
    st.header("Polish ASR speech datasets summary statistics")
    df_summary_metrics = catalog_summary_statistics(df_data_cat)

    df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
    st.dataframe(df_basic_stats, use_container_width=False)

    st.header("Speech data available across Polish ASR speech datasets")
    df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
    st.dataframe(df_stats_audio_available, use_container_width=False)

    st.header("Transcribed data available across Polish ASR speech datasets")
    df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
    st.dataframe(df_stats_transcribed_available, use_container_width=False)

    # Display distribution of datasets created per year
    st.header("Polish ASR speech datasets created in 1997-2023")
    col_groupby = ['Creation year']
    df_datasets_per_year = _count_and_size(
        df_data_cat, col_groupby, col_sort=col_groupby, col_count=['Dataset ID'])
    st.dataframe(df_datasets_per_year, use_container_width=False)

    # Datasets per publisher, all datasets
    st.header("Institutions contributing Polish ASR speech datasets")
    col_groupby = ['Publisher']
    df_datasets_per_publisher = _count_and_size(
        df_data_cat, col_groupby, col_sort='Count Dataset ID', col_count=['Dataset ID'])
    st.dataframe(df_datasets_per_publisher, use_container_width=False)

    # Datasets per publisher, available-and-free datasets only
    st.header("Institutions contributing freely available Polish ASR speech datasets")
    col_groupby = ['Publisher']
    df_datasets_per_publisher_free = _count_and_size(
        df_data_cat_available_free, col_groupby, col_sort='Count Dataset ID', col_count=['Dataset ID'])
    st.dataframe(df_datasets_per_publisher_free, use_container_width=False)

    # Datasets per hosting repository
    st.header("Repositories hosting Polish ASR speech datasets")
    col_groupby = ['Repository']
    df_datasets_per_repo = _count_and_size(
        df_data_cat, col_groupby, col_sort='Count Dataset ID', col_count=['Dataset ID'])
    st.dataframe(df_datasets_per_repo, use_container_width=False)

    # Available-and-free datasets listed per license and dataset
    st.header("Public domain Polish ASR speech datasets")
    col_groupby = ['License', "Dataset ID"]
    df_datasets_public = _count_and_size(
        df_data_cat_available_free, col_groupby, col_sort='License', col_count=[])
    st.dataframe(df_datasets_public, use_container_width=False)

    # Available-and-paid datasets listed per license and dataset
    # NOTE(review): "Commercialy" is misspelled in this user-facing header;
    # the string is kept verbatim here.
    st.header("Commercialy available Polish ASR speech datasets")
    col_groupby = ['License', "Dataset ID"]
    df_datasets_paid = _count_and_size(
        df_data_cat_available_paid, col_groupby, col_sort='License', col_count=[])
    st.dataframe(df_datasets_paid, use_container_width=False)

    # Metadata coverage for the full, free and paid subsets; only the pivot
    # form is displayed.
    st.header("Coverage of metadata across Polish ASR speech datasets")
    df_meta_all_flat, df_meta_all_pivot = metadata_coverage(
        df_data_cat, df_data_cat_available_free, df_data_cat_available_paid)
    st.dataframe(df_meta_all_pivot, use_container_width=False)

    # Display distribution of datasets for various speech types,
    # sorted by the size of audio transcribed
    st.header("Datasets per speech type")
    col_groupby = ['Speech type']
    df_datasets_per_speech_type = _ranked_by_transcribed_hours(df_data_cat, col_groupby)
    st.dataframe(df_datasets_per_speech_type, use_container_width=False)

    # Display distribution of datasets per audio device, full catalog
    # NOTE(review): the header says "All available datasets", but the
    # unfiltered catalog is used rather than df_data_cat_available - confirm.
    st.header("Distribution of available speech data per audio device - All available datasets")
    col_groupby = ['Audio device']
    df_datasets_per_device_all = _ranked_by_transcribed_hours(df_data_cat, col_groupby)
    st.dataframe(df_datasets_per_device_all, use_container_width=False)

    # Display distribution of datasets per audio device, free datasets
    st.header("Distribution of available speech data per audio device - Public domain datasets")
    col_groupby = ['Audio device']
    df_datasets_per_device_free = _ranked_by_transcribed_hours(df_data_cat_available_free, col_groupby)
    st.dataframe(df_datasets_per_device_free, use_container_width=False)

    # Display distribution of datasets per audio device, paid datasets
    st.header("Distribution of available speech data per audio device - Commercial datasets")
    col_groupby = ['Audio device']
    df_datasets_per_device_paid = _ranked_by_transcribed_hours(df_data_cat_available_paid, col_groupby)
    st.dataframe(df_datasets_per_device_paid, use_container_width=False)

    # Display distribution of datasets per sampling rate
    st.header("Datasets per sampling rate")
    col_groupby = ['Sampling rate [Hz]']
    df_datasets_per_sr = _ranked_by_transcribed_hours(df_data_cat, col_groupby)
    st.dataframe(df_datasets_per_sr, use_container_width=False)

with data_taxonomy:
    st.title("Polish ASR Speech Data Taxonomy")

    st.header("How to use?")
    st.markdown(HOWTO_TAXONOMY_CAT, unsafe_allow_html=True)

    st.dataframe(df_data_tax, hide_index=True, use_container_width=True)

    st.header("How to cite?")
    st.markdown(CITATION_CATALOG, unsafe_allow_html=True)

with bench_cat:
    st.write("Benchmarks catalog")
    # TODO - load and display benchmarks catalog
    st.title("Polish ASR Benchmarks Catalog")

    # Display catalog contents
    st.dataframe(filter_dataframe(df_bench_cat, "benchmarks"), hide_index=True, use_container_width=True)

    # Display taxonomy contents