Spaces:

Labbeti
/

conette

Running

App Files Files Community

Labbeti committed on Jan 15

Commit

5f47c66

•

1 Parent(s): ae94a43

Mod: Update UI to store microphone input in the microphone_conette_record.wav file, raise an error when the audio is too short or too long, update the main description, and show other candidates in outputs.

Browse files

Files changed (2) hide show

.gitignore +1 -1
app.py +63 -26

.gitignore CHANGED Viewed

	@@ -1 +1 @@
1	- ~~record~~.wav


1	+ microphone_conette_record.wav

app.py CHANGED Viewed

@@ -2,12 +2,14 @@
 # -*- coding: utf-8 -*-
 from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
-from typing import Any, Optional
 import streamlit as st
 from st_audiorec import st_audiorec
 from streamlit.runtime.uploaded_file_manager import UploadedFile
 from conette import CoNeTTEModel, conette
 from conette.utils.collections import dict_list_to_list_dict
@@ -17,9 +19,11 @@ ALLOW_REP_MODES = ("stopwords", "all", "none")
 MAX_BEAM_SIZE = 20
 MAX_PRED_SIZE = 30
 MAX_BATCH_SIZE = 32
-RECORD_AUDIO_FNAME = "record.wav"
 DEFAULT_THRESHOLD = 0.3
 THRESHOLD_PRECISION = 100
 @st.cache_resource
@@ -49,20 +53,34 @@ def get_results(
     model: CoNeTTEModel,
     audio_files: dict[str, bytes],
     generate_kwds: dict[str, Any],
-) -> dict[str, dict[str, Any]]:
     # Get audio to be processed
-    audio_to_predict: dict[str, bytes] = {}
     for audio_fname, audio in audio_files.items():
         result_hash = get_result_hash(audio_fname, generate_kwds)
         if result_hash not in st.session_state or audio_fname == RECORD_AUDIO_FNAME:
-            audio_to_predict[result_hash] = audio
     # Save audio to be processed
     tmp_files: dict[str, _TemporaryFileWrapper] = {}
-    for result_hash, audio in audio_to_predict.items():
-        tmp_file = NamedTemporaryFile()
         tmp_file.write(audio)
-        tmp_files[result_hash] = tmp_file
     # Generate predictions and store them in session state
     for start in range(0, len(tmp_files), MAX_BATCH_SIZE):
@@ -74,8 +92,6 @@ def get_results(
             tmp_paths_j,
             **generate_kwds,
         )
-        for tmp_file in tmp_files_j:
-            tmp_file.close()
         outputs_lst = dict_list_to_list_dict(outputs_j)  # type: ignore
         for result_hash, output_i in zip(result_hashes_j, outputs_lst):
             st.session_state[result_hash] = output_i
@@ -90,46 +106,67 @@ def get_results(
     return outputs
-def show_results(outputs: dict[str, dict[str, Any]]) -> None:
     st.divider()
     for audio_fname, output in outputs.items():
-        cand = output["cands"]
-        lprobs = output["lprobs"]
-        tags = output.get("tags")
         cand = format_candidate(cand)
-        tags = format_tags(tags)
         prob = lprobs.exp().tolist()
         if audio_fname == RECORD_AUDIO_FNAME:
             header = "##### Result for microphone input:"
         else:
             header = f'##### Result for "{audio_fname}"'
-        content = f"""
-        {header}
-        - **Description:** "{cand}"
-        - **Mean confidence:** {prob*100:.0f}%
-        - **Tags:** {tags}"""
-        st.markdown(content)
         st.divider()
 def main() -> None:
-    st.header("Describe audio content with CoNeTTE")
     model = load_conette(model_kwds=dict(device="cpu"))
-    # st.warning(
-    #     "Recommanded audio: lasting from **1 to 30s**, sampled at **32 kHz** minimum."
-    # )
     record_data = st_audiorec()
     audio_files: Optional[list[UploadedFile]] = st.file_uploader(
         "**Or upload audio files here:**",
         type=["wav", "flac", "mp3", "ogg", "avi"],
         accept_multiple_files=True,
     )
     with st.expander("Model hyperparameters"):

 # -*- coding: utf-8 -*-
 from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
+from typing import Any, Optional, Union
 import streamlit as st
+import torchaudio
 from st_audiorec import st_audiorec
 from streamlit.runtime.uploaded_file_manager import UploadedFile
+from torch import Tensor
 from conette import CoNeTTEModel, conette
 from conette.utils.collections import dict_list_to_list_dict
 MAX_BEAM_SIZE = 20
 MAX_PRED_SIZE = 30
 MAX_BATCH_SIZE = 32
+RECORD_AUDIO_FNAME = "microphone_conette_record.wav"
 DEFAULT_THRESHOLD = 0.3
 THRESHOLD_PRECISION = 100
+MIN_AUDIO_DURATION_SEC = 0.3
+MAX_AUDIO_DURATION_SEC = 60
 @st.cache_resource
     model: CoNeTTEModel,
     audio_files: dict[str, bytes],
     generate_kwds: dict[str, Any],
+) -> dict[str, Union[dict[str, Any], str]]:
     # Get audio to be processed
+    audio_to_predict: dict[str, tuple[str, bytes]] = {}
     for audio_fname, audio in audio_files.items():
         result_hash = get_result_hash(audio_fname, generate_kwds)
         if result_hash not in st.session_state or audio_fname == RECORD_AUDIO_FNAME:
+            audio_to_predict[result_hash] = (audio_fname, audio)
     # Save audio to be processed
     tmp_files: dict[str, _TemporaryFileWrapper] = {}
+    for result_hash, (audio_fname, audio) in audio_to_predict.items():
+        tmp_file = NamedTemporaryFile(delete=False)
         tmp_file.write(audio)
+        tmp_file.close()
+        metadata = torchaudio.info(tmp_file.name)  # type: ignore
+        duration = metadata.num_frames / metadata.sample_rate
+        if MIN_AUDIO_DURATION_SEC > duration:
+            error_msg = f"Audio file is too short. (found {duration:.2f}s but the model expect audio in range [{MIN_AUDIO_DURATION_SEC}, {MAX_AUDIO_DURATION_SEC}])"
+            st.session_state[result_hash] = error_msg
+        elif duration > MAX_AUDIO_DURATION_SEC:
+            error_msg = f"Audio file is too long. (found {duration:.2f}s but the model expect audio in range [{MIN_AUDIO_DURATION_SEC}, {MAX_AUDIO_DURATION_SEC}])"
+            st.session_state[result_hash] = error_msg
+        else:
+            tmp_files[result_hash] = tmp_file
     # Generate predictions and store them in session state
     for start in range(0, len(tmp_files), MAX_BATCH_SIZE):
             tmp_paths_j,
             **generate_kwds,
         )
         outputs_lst = dict_list_to_list_dict(outputs_j)  # type: ignore
         for result_hash, output_i in zip(result_hashes_j, outputs_lst):
             st.session_state[result_hash] = output_i
     return outputs
+def show_results(outputs: dict[str, Union[dict[str, Any], str]]) -> None:
     st.divider()
     for audio_fname, output in outputs.items():
+        if isinstance(output, str):
+            st.error(output)
+            st.divider()
+            continue
+        cand: str = output["cands"]
+        lprobs: Tensor = output["lprobs"]
+        tags_lst = output.get("tags")
+        mult_cands: list[str] = output["mult_cands"]
+        mult_lprobs: Tensor = output["mult_lprobs"]
         cand = format_candidate(cand)
         prob = lprobs.exp().tolist()
+        tags = format_tags(tags_lst)
+        mult_cands = [format_candidate(cand_i) for cand_i in mult_cands]
+        mult_probs = mult_lprobs.exp()
+        indexes = mult_probs.argsort(descending=True)[1:]
+        mult_probs = mult_probs[indexes].tolist()
+        mult_cands = [mult_cands[idx] for idx in indexes]
         if audio_fname == RECORD_AUDIO_FNAME:
             header = "##### Result for microphone input:"
         else:
             header = f'##### Result for "{audio_fname}"'
+        content = [
+            header,
+            f'- **Description:** "{cand}" ({prob*100:.1f}%)',
+            f"- **Tags:** {tags}",
+        ]
+        if len(mult_cands) > 0:
+            msg = f"- **Other descriptions:**"
+            content.append(msg)
+        for cand_i, prob_i in zip(mult_cands, mult_probs):
+            msg = f'  - "{cand_i}" ({prob_i*100:.1f}%)'
+            content.append(msg)
+        st.success("\n".join(content))
         st.divider()
 def main() -> None:
     model = load_conette(model_kwds=dict(device="cpu"))
+    st.header("Describe audio content with CoNeTTE")
+    st.markdown(
+        "This interface allows you to generate a short description of the sound events of any recording. You can try it from your microphone or upload a file below."
+    )
     record_data = st_audiorec()
     audio_files: Optional[list[UploadedFile]] = st.file_uploader(
         "**Or upload audio files here:**",
         type=["wav", "flac", "mp3", "ogg", "avi"],
         accept_multiple_files=True,
+        help="Recommanded audio: lasting from **1 to 30s**, sampled at **32 kHz** minimum.",
     )
     with st.expander("Model hyperparameters"):