File size: 9,979 Bytes
d5cbb7a
 
ad8c37c
 
d5cbb7a
 
4d749f0
d5cbb7a
e283f70
d5cbb7a
4eee292
d5cbb7a
e283f70
d5cbb7a
 
e283f70
d5cbb7a
e283f70
 
d5cbb7a
 
e283f70
d5cbb7a
ad8c37c
d5cbb7a
 
ad8c37c
e283f70
 
 
 
 
 
a958ea3
e283f70
 
968224e
 
 
 
 
 
 
e283f70
 
 
 
 
 
de78526
e283f70
 
de78526
e283f70
 
968224e
 
 
e283f70
 
 
 
 
 
 
 
 
4eee292
e283f70
 
 
 
 
d5cbb7a
e283f70
 
 
d5cbb7a
 
e283f70
 
 
4eee292
d5cbb7a
4eee292
d5cbb7a
ad8c37c
e283f70
 
 
d5cbb7a
ad8c37c
 
 
 
 
 
e283f70
 
 
 
d5cbb7a
e283f70
 
 
 
d5cbb7a
e283f70
 
 
 
d5cbb7a
e283f70
 
 
d5cbb7a
e283f70
 
 
 
ad8c37c
 
e283f70
d5cbb7a
ad8c37c
 
 
 
 
 
 
 
 
e283f70
 
 
ad8c37c
 
 
 
d5cbb7a
e283f70
 
 
ad8c37c
 
 
 
 
 
 
 
 
 
 
 
 
d5cbb7a
4d749f0
 
 
 
 
 
 
 
 
e283f70
 
 
 
d5cbb7a
 
e283f70
 
d5cbb7a
e283f70
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns

from app_utils import filter_dataframe, calculate_height_to_display
from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMARK, CITATION_BENCHMARK, HOWTO_BENCHMARK, INFO_MAIN, CITATION_MAIN, HOWTO_TAXONOMY_CAT
from utils import BASE_SUMMARY_METRICS
from utils import  load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
from utils import left_align, right_align

st.set_page_config(layout="wide")


# --- Polish ASR speech data survey: catalog and taxonomy tables -------------
# NOTE(review): the loaders come from `utils`; whether they cache their result
# is not visible from this script - confirm there before relying on it.
df_data_cat = load_data_catalog()
df_data_tax = load_data_taxonomy()

# Row masks over the catalog, built once and combined below.
is_available_online = df_data_cat['Available online'] == 'yes'
is_free_non_commercial = df_data_cat['Price - non-commercial usage'] == '0'
is_paid_non_commercial = df_data_cat['Price - non-commercial usage'] != '0'

# Datasets marked as available online.
df_data_cat_available = df_data_cat[is_available_online]
# Available online with a non-commercial price equal to the string '0'.
df_data_cat_available_free = df_data_cat[is_available_online & is_free_non_commercial]
# Available online with any other non-commercial price value.
df_data_cat_available_paid = df_data_cat[is_available_online & is_paid_non_commercial]


# --- Polish ASR benchmarks survey: catalog and taxonomy tables --------------
df_bench_cat = load_bench_catalog()
df_bench_tax = load_bench_taxonomy()

# One tab per section of the app; the order of the labels below must match
# the order of the names they are unpacked into.
TAB_LABELS = [
    "PL ASR survey",
    "PL ASR Speech Data **Catalog**",
    "PL ASR Speech data **Survey**",
    "ASR Speech Data **Taxonomy**",
    "PL ASR Benchmarks Catalog",
    "PL ASR Benchmarks Survey",
    "ASR Benchmarks Taxonomy",
]
(
    about,
    data_cat,
    data_survey,
    data_taxonomy,
    bench_cat,
    bench_survey,
    bench_taxonomy,
) = st.tabs(TAB_LABELS)


# "About" tab: general description of the survey followed by its citation.
# Elements are written through the tab container's own methods, which places
# them in the same tab as a `with about:` block would.
about.title("About Polish ASR Survey")
about.markdown(INFO_MAIN, unsafe_allow_html=True)

about.header("How to cite this resource?")
about.markdown(CITATION_MAIN, unsafe_allow_html=True)

with data_cat:
    # Introduction: page title and the catalog description text.
    st.title("Polish ASR Speech Datasets Catalog")
    st.markdown(INFO_CATALOG, unsafe_allow_html=True)

    # Usage instructions.
    st.header("How to use?")
    st.markdown(HOWTO_CATALOG, unsafe_allow_html=True)

    # Catalog table. The frame is passed through filter_dataframe first;
    # presumably that helper adds interactive filters - verify in app_utils.
    st.header("Browse the catalog content")
    df_catalog_view = filter_dataframe(df_data_cat, "datasets")
    st.dataframe(df_catalog_view, hide_index=True, use_container_width=True)

    # Citation details.
    st.header("How to cite this resource?")
    st.markdown(CITATION_CATALOG, unsafe_allow_html=True)


# Columns summed in every grouped breakdown shown on the survey tab.
_SURVEY_SUM_COLUMNS = ['Size audio transcribed [hours]', 'Audio recordings', 'Speakers']


def _show_grouped_breakdown(title, df, col_groupby, *, col_sort=None,
                            col_percent=None, col_count=None, sort_by_size=False):
    """Render a section header followed by a grouped count/size table.

    Args:
        title: Text passed to ``st.header``.
        df: Catalog dataframe handed to ``datasets_count_and_size``.
        col_groupby: List of columns to group by.
        col_sort: Value forwarded as ``col_sort``; when ``None`` the
            ``col_groupby`` list itself is used.
        col_percent: Value forwarded as ``col_percent`` (``None`` or a list
            of column names).
        col_count: Value forwarded as ``col_count``; when ``None`` it
            defaults to ``['Dataset ID']``. Pass ``[]`` to count nothing.
        sort_by_size: When true, re-sort the resulting table by
            'Size audio transcribed [hours]' in descending order before
            displaying it.
    """
    st.header(title)
    df_grouped = datasets_count_and_size(
        df,
        col_groupby,
        col_sort=col_groupby if col_sort is None else col_sort,
        col_percent=col_percent,
        # Fresh list per call so the helper never shares a mutable argument.
        col_sum=list(_SURVEY_SUM_COLUMNS),
        col_count=['Dataset ID'] if col_count is None else col_count,
    )
    if sort_by_size:
        df_grouped = df_grouped.sort_values(by='Size audio transcribed [hours]', ascending=False)
    st.dataframe(df_grouped, use_container_width=False)


with data_survey:
    st.title("Polish ASR Speech Datasets Survey")

    # Summary statistics, shown as three consecutive slices of
    # BASE_SUMMARY_METRICS (five metrics per table).
    st.header("Polish ASR speech datasets summary statistics")
    df_summary_metrics = catalog_summary_statistics(df_data_cat)
    st.dataframe(df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]], use_container_width=False)

    st.header("Speech data available across Polish ASR speech datasets")
    st.dataframe(df_summary_metrics.loc[BASE_SUMMARY_METRICS[5:10]], use_container_width=False)

    st.header("Transcribed data available across Polish ASR speech datasets")
    st.dataframe(df_summary_metrics.loc[BASE_SUMMARY_METRICS[10:15]], use_container_width=False)

    # Datasets created per year.
    # NOTE(review): the year range in the header is hard-coded text, not
    # derived from the data.
    _show_grouped_breakdown(
        "Polish ASR speech datasets created in 1997-2023",
        df_data_cat,
        ['Creation year'],
    )

    # Datasets per publisher: whole catalog, then the free-and-available subset.
    _show_grouped_breakdown(
        "Institutions contributing Polish ASR speech datasets",
        df_data_cat,
        ['Publisher'],
        col_sort='Count Dataset ID',
    )
    _show_grouped_breakdown(
        "Institutions contributing freely available Polish ASR speech datasets",
        df_data_cat_available_free,
        ['Publisher'],
        col_sort='Count Dataset ID',
    )

    # Datasets per hosting repository.
    _show_grouped_breakdown(
        "Repositories hosting Polish ASR speech datasets",
        df_data_cat,
        ['Repository'],
        col_sort='Count Dataset ID',
    )

    # Per-license listings for the free and the paid subsets; grouped down to
    # individual datasets, so no count column is requested.
    _show_grouped_breakdown(
        "Public domain Polish ASR speech datasets",
        df_data_cat_available_free,
        ['License', "Dataset ID"],
        col_sort='License',
        col_count=[],
    )
    _show_grouped_breakdown(
        "Commercially available Polish ASR speech datasets",
        df_data_cat_available_paid,
        ['License', "Dataset ID"],
        col_sort='License',
        col_count=[],
    )

    # Metadata coverage; only the pivoted result is displayed.
    st.header("Coverage of metadata across Polish ASR speech datasets")
    _, df_meta_all_pivot = metadata_coverage(df_data_cat, df_data_cat_available_free, df_data_cat_available_paid)
    st.dataframe(df_meta_all_pivot, use_container_width=False)

    # Distribution of datasets per speech type, largest transcribed size first.
    _show_grouped_breakdown(
        "Datasets per speech type",
        df_data_cat,
        ['Speech type'],
        col_percent=['Size audio transcribed [hours]'],
        sort_by_size=True,
    )

    # Distribution per audio device for three slices of the catalog.
    # NOTE(review): the first table is labelled "All available datasets" but is
    # built from the full catalog (df_data_cat), not from the
    # 'Available online' subset - confirm which one is intended.
    audio_device_scopes = (
        ("All available datasets", df_data_cat),
        ("Public domain datasets", df_data_cat_available_free),
        ("Commercial datasets", df_data_cat_available_paid),
    )
    for scope_label, df_scope in audio_device_scopes:
        _show_grouped_breakdown(
            f"Distribution of available speech data per audio device - {scope_label}",
            df_scope,
            ['Audio device'],
            col_percent=['Size audio transcribed [hours]'],
            sort_by_size=True,
        )

    # Distribution of datasets per sampling rate, largest transcribed size first.
    _show_grouped_breakdown(
        "Datasets per sampling rate",
        df_data_cat,
        ['Sampling rate [Hz]'],
        col_percent=['Size audio transcribed [hours]'],
        sort_by_size=True,
    )


# Speech data taxonomy tab: usage notes, the taxonomy table, then the citation.
# Elements are written through the tab container's own methods, which places
# them in the same tab as a `with data_taxonomy:` block would.
data_taxonomy.title("Polish ASR Speech Data Taxonomy")

data_taxonomy.header("How to use?")
data_taxonomy.markdown(HOWTO_TAXONOMY_CAT, unsafe_allow_html=True)
data_taxonomy.dataframe(df_data_tax, hide_index=True, use_container_width=True)

data_taxonomy.header("How to cite?")
data_taxonomy.markdown(CITATION_CATALOG, unsafe_allow_html=True)


with bench_cat:
    # NOTE(review): this plain-text line precedes the title and looks like a
    # leftover placeholder - confirm whether it should stay.
    st.write("Benchmarks catalog")
    st.title("Polish ASR Benchmarks Catalog")

    # Benchmarks catalog table, passed through filter_dataframe before display.
    df_bench_view = filter_dataframe(df_bench_cat, "benchmarks")
    st.dataframe(df_bench_view, hide_index=True, use_container_width=True)

    # TODO: display taxonomy contents - nothing in the visible source renders
    # df_bench_tax or fills the bench_survey / bench_taxonomy tabs.