Spaces:
Sleeping
Sleeping
File size: 7,084 Bytes
d5cbb7a de78526 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 de78526 e283f70 de78526 e283f70 de78526 e283f70 de78526 e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 d5cbb7a e283f70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import pandas as pd
import streamlit as st
from app_utils import filter_dataframe, calculate_height_to_display
from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, INFO_SURVEY, CITATION_SURVEY
from utils import BASE_SUMMARY_METRICS
from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
import matplotlib.pyplot as plt
import seaborn as sns
# Switch the page to Streamlit's wide layout before anything is rendered.
st.set_page_config(layout="wide")

# --- Polish ASR speech datasets: catalog and taxonomy ---
# NOTE(review): an earlier comment here said the dataframe is cached so it is
# only loaded once, but nothing in this script caches anything; presumably the
# loaders in utils take care of it -- confirm.
df_data_cat = load_data_catalog()
df_data_tax = load_data_taxonomy()

# Row masks over the catalog, built once and combined for the subsets below.
is_online = df_data_cat['Available online'] == 'yes'
is_free = df_data_cat['Price - non-commercial usage'] == 'free'
is_not_free = df_data_cat['Price - non-commercial usage'] != 'free'

# Datasets that are available online (drops everything else).
df_data_cat_available = df_data_cat[is_online]
# Available online and free for non-commercial usage.
df_data_cat_available_free = df_data_cat[is_online & is_free]
# Available online with any non-commercial price other than 'free'.
df_data_cat_available_paid = df_data_cat[is_online & is_not_free]

# --- Polish ASR benchmarks: catalog and taxonomy ---
df_bench_cat = load_bench_catalog()
df_bench_tax = load_bench_taxonomy()
# Create the six top-level tabs.  st.tabs returns one container per label, in
# label order, so the unpacking targets must follow the label order exactly.
# The targets used to read (data_cat, data_taxonomy, data_survey) against the
# labels (catalog, survey, taxonomy), which put everything written under
# `with data_survey:` into the tab labelled "taxonomy".  The targets are now
# listed in the same order as the labels; the labels themselves are unchanged.
(
    data_cat,
    data_survey,
    data_taxonomy,
    bench_cat,
    bench_taxonomy,
    bench_survey,
) = st.tabs([
    "PL ASR speech data **catalog**",
    "PL ASR speech data **survey**",
    "ASR speech data **taxonomy**",
    "PL ASR benchmarks catalog",
    "ASR benchmarks taxonomy",
    "PL ASR benchmarks survey",
])
# Tab: Polish ASR speech datasets catalog.
with data_cat:
    st.title("Polish ASR Speech Datasets Catalog")
    st.markdown(INFO_CATALOG, unsafe_allow_html=True)

    # Usage and citation notes: a header followed by its HTML-enabled text.
    for section_title, section_text in (
        ("How to use?", HOWTO_CATALOG),
        ("How to cite?", CITATION_CATALOG),
    ):
        st.header(section_title)
        st.markdown(section_text, unsafe_allow_html=True)

    # Catalog contents, passed through filter_dataframe before display.
    st.header("Browse the catalog content")
    catalog_view = filter_dataframe(df_data_cat, "datasets")
    st.dataframe(catalog_view, hide_index=True, use_container_width=True)

# TODO: display taxonomy contents
# Tab: Polish ASR speech datasets survey -- a sequence of header + table pairs.
with data_survey:
    st.title("Polish ASR Speech Datasets Survey")

    # Columns summed in every grouped table below.  Each call receives its own
    # list built from this tuple, so no list object is shared between calls.
    size_columns = ('Size audio transcribed [hours]', 'Audio recordings', 'Speakers')

    # --- Summary statistics: three consecutive 5-item slices of the metrics ---
    st.header("Polish ASR speech datasets summary statistics")
    df_summary_metrics = catalog_summary_statistics(df_data_cat)
    df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[:5]]
    st.dataframe(df_basic_stats, use_container_width=False)

    st.header("Speech data available across Polish ASR speech datasets")
    df_stats_audio_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]]
    st.dataframe(df_stats_audio_available, use_container_width=False)

    st.header("Transcribed data available across Polish ASR speech datasets")
    df_stats_transcribed_available = df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]]
    st.dataframe(df_stats_transcribed_available, use_container_width=False)

    # --- Datasets grouped by creation year ---
    st.header("Polish ASR speech datasets created in 1997-2023")
    col_groupby = ['Creation year']
    df_datasets_per_year = datasets_count_and_size(
        df_data_cat,
        col_groupby,
        col_sort=col_groupby,
        col_percent=None,
        col_sum=list(size_columns),
        col_count=['Dataset ID'],
    )
    st.dataframe(df_datasets_per_year, use_container_width=False)

    # --- Datasets grouped by publisher ---
    st.header("Institutions contributing Polish ASR speech dataset")
    col_groupby = ['Publisher']
    df_datasets_per_publisher = datasets_count_and_size(
        df_data_cat,
        col_groupby,
        col_sort='Count Dataset ID',
        col_percent=None,
        col_sum=list(size_columns),
        col_count=['Dataset ID'],
    )
    st.dataframe(df_datasets_per_publisher, use_container_width=False)

    # --- Datasets grouped by hosting repository ---
    st.header("Repositories hosting Polish ASR speech datasets")
    col_groupby = ['Repository']
    df_datasets_per_repo = datasets_count_and_size(
        df_data_cat,
        col_groupby,
        col_sort='Count Dataset ID',
        col_percent=None,
        col_sum=list(size_columns),
        col_count=['Dataset ID'],
    )
    st.dataframe(df_datasets_per_repo, use_container_width=False)

    # --- Free datasets, grouped by license and dataset (no count column) ---
    st.header("Public domain Polish ASR speech datasets")
    col_groupby = ['License', "Dataset ID"]
    df_datasets_public = datasets_count_and_size(
        df_data_cat_available_free,
        col_groupby,
        col_sort='License',
        col_percent=None,
        col_sum=list(size_columns),
        col_count=[],
    )
    st.dataframe(df_datasets_public, use_container_width=False)

    # --- Paid datasets, grouped by license and dataset (no count column) ---
    st.header("Commercialy available Polish ASR speech datasets")
    col_groupby = ['License', "Dataset ID"]
    df_datasets_paid = datasets_count_and_size(
        df_data_cat_available_paid,
        col_groupby,
        col_sort='License',
        col_percent=None,
        col_sum=list(size_columns),
        col_count=[],
    )
    st.dataframe(df_datasets_paid, use_container_width=False)

    # --- Metadata coverage: only the pivot result is displayed ---
    st.header("Coverage of metadata across Polish ASR speech datasets")
    df_meta_all_flat, df_meta_all_pivot = metadata_coverage(
        df_data_cat,
        df_data_cat_available_free,
        df_data_cat_available_paid,
    )
    st.dataframe(df_meta_all_pivot, use_container_width=False)

    # --- Datasets grouped by speech type, with a percentage of hours ---
    st.header("Datasets per speech type")
    col_groupby = ['Speech type']
    df_datasets_per_speech_type = datasets_count_and_size(
        df_data_cat,
        col_groupby,
        col_sort=col_groupby,
        col_percent=['Size audio transcribed [hours]'],
        col_sum=list(size_columns),
        col_count=['Dataset ID'],
    )
    st.dataframe(df_datasets_per_speech_type, use_container_width=False)

    # --- Free datasets grouped by audio device ---
    st.header("Distribution of available speech data per audio device - Public domain datasets")
    col_groupby = ['Audio device']
    df_datasets_per_device = datasets_count_and_size(
        df_data_cat_available_free,
        col_groupby,
        col_sort=col_groupby,
        col_percent=['Size audio transcribed [hours]'],
        col_sum=list(size_columns),
        col_count=['Dataset ID'],
    )
    st.dataframe(df_datasets_per_device, use_container_width=False)

    # --- Paid datasets grouped by audio device ---
    st.header("Distribution of available speech data per audio device - Commercial datasets")
    col_groupby = ['Audio device']
    df_datasets_per_device = datasets_count_and_size(
        df_data_cat_available_paid,
        col_groupby,
        col_sort=col_groupby,
        col_percent=['Size audio transcribed [hours]'],
        col_sum=list(size_columns),
        col_count=['Dataset ID'],
    )
    st.dataframe(df_datasets_per_device, use_container_width=False)
# Tab: Polish ASR benchmarks catalog.
with bench_cat:
    st.write("Benchmarks catalog")
    # TODO - load and display benchmarks catalog
    st.title("Polish ASR Benchmarks Catalog")

    # Catalog contents, passed through filter_dataframe before display.
    benchmarks_view = filter_dataframe(df_bench_cat, "benchmarks")
    st.dataframe(benchmarks_view, hide_index=True, use_container_width=True)

# TODO: display taxonomy contents