mj-new committed on
Commit
4eee292
·
1 Parent(s): d9c6196

Replaced no-info with None values

Browse files
Files changed (4) hide show
  1. __pycache__/utils.cpython-310.pyc +0 -0
  2. app.py +4 -2
  3. requirements.txt +2 -1
  4. utils.py +36 -7
__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
 
app.py CHANGED
@@ -6,6 +6,7 @@ from contants import INFO_CATALOG, CITATION_CATALOG, HOWTO_CATALOG,INFO_BENCHMAR
6
  from utils import BASE_SUMMARY_METRICS
7
  from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
8
  from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
 
9
 
10
  import matplotlib.pyplot as plt
11
  import seaborn as sns
@@ -66,6 +67,7 @@ with data_survey:
66
  df_summary_metrics = catalog_summary_statistics(df_data_cat)
67
 
68
  df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
 
69
  st.dataframe(df_basic_stats, use_container_width=False)
70
 
71
  st.header("Speech data available across Polish ASR speech datasets")
@@ -80,9 +82,9 @@ with data_survey:
80
  # Display distribution of datasets created per year
81
  st.header("Polish ASR speech datasets created in 1997-2023")
82
  col_groupby = ['Creation year']
83
- df_datasets_per_speech_type = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
84
 
85
- st.dataframe(df_datasets_per_speech_type, use_container_width=False)
86
 
87
  st.header("Institutions contributing Polish ASR speech dataset")
88
  col_groupby = ['Publisher']
 
6
  from utils import BASE_SUMMARY_METRICS
7
  from utils import load_data_catalog, load_data_taxonomy, load_bench_catalog, load_bench_taxonomy
8
  from utils import datasets_count_and_size, datasets_count_and_size_standard, metadata_coverage, catalog_summary_statistics
9
+ from utils import left_align, right_align
10
 
11
  import matplotlib.pyplot as plt
12
  import seaborn as sns
 
67
  df_summary_metrics = catalog_summary_statistics(df_data_cat)
68
 
69
  df_basic_stats = df_summary_metrics.loc[BASE_SUMMARY_METRICS[0:5]]
70
+
71
  st.dataframe(df_basic_stats, use_container_width=False)
72
 
73
  st.header("Speech data available across Polish ASR speech datasets")
 
82
  # Display distribution of datasets created per year
83
  st.header("Polish ASR speech datasets created in 1997-2023")
84
  col_groupby = ['Creation year']
85
+ df_datasets_per_year = datasets_count_and_size(df_data_cat, col_groupby, col_sort=col_groupby, col_percent=None, col_sum=['Size audio transcribed [hours]','Audio recordings', 'Speakers'], col_count = ['Dataset ID'])
86
 
87
+ st.dataframe(df_datasets_per_year, use_container_width=False)
88
 
89
  st.header("Institutions contributing Polish ASR speech dataset")
90
  col_groupby = ['Publisher']
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  seaborn
2
  matplotlib
3
- pandas
 
 
1
  seaborn
2
  matplotlib
3
+ pandas
4
+ librosa
utils.py CHANGED
@@ -1,6 +1,7 @@
1
  import requests
2
  import pandas as pd
3
  import streamlit as st
 
4
 
5
  catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d')
6
  # TODO - extract from the catalog name
@@ -30,13 +31,14 @@ def download_tsv_from_google_sheet(sheet_url):
30
 
31
  # Send a GET request to download the TSV file
32
  response = requests.get(tsv_url)
33
-
 
34
  # Check if the request was successful
35
  if response.status_code == 200:
36
  # Read the TSV content into a pandas DataFrame
37
  from io import StringIO
38
  tsv_content = StringIO(response.text)
39
- df = pd.read_csv(tsv_content, sep='\t')
40
  return df
41
  else:
42
  print("Failed to download the TSV file.")
@@ -71,6 +73,22 @@ def load_bench_taxonomy():
71
  df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
72
  return(df_taxonomy)
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
76
  """
@@ -144,11 +162,13 @@ def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None
144
  # Sort by the provided column col_sort
145
  col_sort = col_groupby if col_sort is None else col_sort
146
  summary.sort_values(by=col_sort, ascending=False, inplace=True)
147
-
148
- # Replace 0 with no-info in columns with sum
149
  for col in col_sum:
150
- summary[col] = summary[col].replace(0, 'no-info')
151
-
 
 
152
  return summary
153
 
154
 
@@ -210,6 +230,9 @@ def metadata_coverage(df_cat, df_cat_available_free, df_cat_available_paid):
210
  df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
211
  df_meta_all_pivot[col_name_count]=df_meta_all_pivot[col_name_count].astype(int)
212
 
 
 
 
213
  return(df_meta_all_flat, df_meta_all_pivot)
214
 
215
 
@@ -289,4 +312,10 @@ def catalog_summary_statistics(df_cat):
289
  metrics_df = pd.DataFrame(metrics_dict)
290
  metrics_df.reset_index(drop=True, inplace=True)
291
  metrics_df.set_index("Metric", inplace=True)
292
- return(metrics_df)
 
 
 
 
 
 
 
1
  import requests
2
  import pandas as pd
3
  import streamlit as st
4
+ import numpy as np
5
 
6
  catalog_last_update_date = pd.to_datetime('today').strftime('%Y-%m-%d')
7
  # TODO - extract from the catalog name
 
31
 
32
  # Send a GET request to download the TSV file
33
  response = requests.get(tsv_url)
34
+ response.encoding = 'utf-8'
35
+
36
  # Check if the request was successful
37
  if response.status_code == 200:
38
  # Read the TSV content into a pandas DataFrame
39
  from io import StringIO
40
  tsv_content = StringIO(response.text)
41
+ df = pd.read_csv(tsv_content, sep='\t', encoding='utf-8')
42
  return df
43
  else:
44
  print("Failed to download the TSV file.")
 
73
  df_taxonomy = download_tsv_from_google_sheet(taxonomy_url)
74
  return(df_taxonomy)
75
 
76
+ def style_floats(val):
77
+ """
78
+ Converts float to int if the fractional part is zero, formats floats with two decimal places,
79
+ and leaves strings unchanged.
80
+ """
81
+ # Check if value is a float and if it can be converted to an int without loss
82
+ if isinstance(val, float):
83
+ if val % 1 == 0:
84
+ return f"{int(val)}" # Convert float with no fractional part to int
85
+ else:
86
+ return f"{val:.2f}" # Format floats with two decimal places
87
+ elif isinstance(val, int):
88
+ return f"{val}" # Handle pure integers separately (though likely unnecessary)
89
+ else:
90
+ return val # Return strings unchanged
91
+
92
 
93
  def datasets_count_and_size(df_cat, col_groupby, col_sort=None, col_percent=None, col_sum=['Size audio transcribed [hours]'], col_count=['Dataset ID']):
94
  """
 
162
  # Sort by the provided column col_sort
163
  col_sort = col_groupby if col_sort is None else col_sort
164
  summary.sort_values(by=col_sort, ascending=False, inplace=True)
165
+
166
+ print(col_sum)
167
  for col in col_sum:
168
+ print(col)
169
+ #summary[col] = summary[col].apply(lambda x: str(int(x)) if float(x).is_integer() else str(x))
170
+ summary[col] = summary[col].replace(0, np.nan)
171
+
172
  return summary
173
 
174
 
 
230
  df_meta_all_pivot = df_meta_all_pivot.pivot(index='Metadata', columns='Type', values=[col_name_count, col_name_sum_size, col_name_percent])
231
  df_meta_all_pivot[col_name_count]=df_meta_all_pivot[col_name_count].astype(int)
232
 
233
+ #df_meta_all_pivot_styled = df_meta_all_pivot.style.map(style_floats)
234
+ #df_meta_all_flat_styled = df_meta_all_flat.style.map(style_floats)
235
+
236
  return(df_meta_all_flat, df_meta_all_pivot)
237
 
238
 
 
312
  metrics_df = pd.DataFrame(metrics_dict)
313
  metrics_df.reset_index(drop=True, inplace=True)
314
  metrics_df.set_index("Metric", inplace=True)
315
+ return(metrics_df)
316
+
317
+ def right_align(s, props='text-align: right;'):
318
+ return props
319
+
320
+ def left_align(s, props='text-align: left;'):
321
+ return props