Update app.py
Browse files
app.py
CHANGED
@@ -62,6 +62,10 @@ def encode_audio(audio_file_path):
|
|
62 |
with open(temp_file_path, 'wb') as temp_file:
|
63 |
# Write sample rate
|
64 |
temp_file.write(sample_rate.to_bytes(4, byteorder='little'))
|
|
|
|
|
|
|
|
|
65 |
# Compress and write the tokens data
|
66 |
compressed_data = lz4.frame.compress(tokens_numpy.tobytes())
|
67 |
temp_file.write(compressed_data)
|
@@ -91,9 +95,11 @@ def decode_audio(encoded_file_path):
|
|
91 |
# Load encoded data and sample rate
|
92 |
with open(encoded_file_path, 'rb') as temp_file:
|
93 |
sample_rate = int.from_bytes(temp_file.read(4), byteorder='little')
|
|
|
|
|
94 |
compressed_data = temp_file.read()
|
95 |
tokens_numpy_bytes = lz4.frame.decompress(compressed_data)
|
96 |
-
tokens_numpy = np.frombuffer(tokens_numpy_bytes, dtype=np.int64).reshape(…)  # [note: original reshape argument truncated in extraction — the replacement line below uses .reshape(shape)]
|
97 |
|
98 |
# Move the tensor to the same device as the model
|
99 |
tokens = torch.from_numpy(tokens_numpy).to(device=semanticodec.device)
|
@@ -126,9 +132,11 @@ async def stream_decode_audio(encoded_file_path) -> Generator[tuple, None, None]
|
|
126 |
# Load encoded data and sample rate from the .owie file
|
127 |
with open(encoded_file_path, 'rb') as temp_file:
|
128 |
sample_rate = int.from_bytes(temp_file.read(4), byteorder='little')
|
|
|
|
|
129 |
compressed_data = temp_file.read()
|
130 |
tokens_numpy_bytes = lz4.frame.decompress(compressed_data)
|
131 |
-
tokens_numpy = np.frombuffer(tokens_numpy_bytes, dtype=np.int64).reshape(…)  # [note: original reshape argument truncated in extraction — the replacement line below uses .reshape(shape)]
|
132 |
|
133 |
# Move the tensor to the same device as the model
|
134 |
tokens = torch.from_numpy(tokens_numpy).to(device=semanticodec.device)
|
@@ -137,13 +145,13 @@ async def stream_decode_audio(encoded_file_path) -> Generator[tuple, None, None]
|
|
137 |
print(f"Model device: {semanticodec.device}")
|
138 |
|
139 |
# Decode the audio in chunks
|
140 |
-
chunk_size = sample_rate
|
141 |
with torch.no_grad():
|
142 |
-
for i in range(0, tokens.shape[0], chunk_size):  # [reconstructed: line truncated in extraction; dim-0 bound implied by the old chunking `tokens[i:i+chunk_size]`]
|
143 |
if cancel_stream:
|
144 |
break # Exit the loop if cancellation is requested
|
145 |
|
146 |
-
tokens_chunk = tokens[i:i+chunk_size]
|
147 |
audio_chunk = semanticodec.decode(tokens_chunk)
|
148 |
# Convert to numpy array and transpose
|
149 |
audio_data = audio_chunk.squeeze(0).cpu().numpy().T
|
|
|
62 |
with open(temp_file_path, 'wb') as temp_file:
|
63 |
# Write sample rate
|
64 |
temp_file.write(sample_rate.to_bytes(4, byteorder='little'))
|
65 |
+
# Write shape information
|
66 |
+
temp_file.write(len(tokens_numpy.shape).to_bytes(4, byteorder='little'))
|
67 |
+
for dim in tokens_numpy.shape:
|
68 |
+
temp_file.write(dim.to_bytes(4, byteorder='little'))
|
69 |
# Compress and write the tokens data
|
70 |
compressed_data = lz4.frame.compress(tokens_numpy.tobytes())
|
71 |
temp_file.write(compressed_data)
|
|
|
95 |
# Load encoded data and sample rate
|
96 |
with open(encoded_file_path, 'rb') as temp_file:
|
97 |
sample_rate = int.from_bytes(temp_file.read(4), byteorder='little')
|
98 |
+
ndim = int.from_bytes(temp_file.read(4), byteorder='little')
|
99 |
+
shape = tuple(int.from_bytes(temp_file.read(4), byteorder='little') for _ in range(ndim))
|
100 |
compressed_data = temp_file.read()
|
101 |
tokens_numpy_bytes = lz4.frame.decompress(compressed_data)
|
102 |
+
tokens_numpy = np.frombuffer(tokens_numpy_bytes, dtype=np.int64).reshape(shape)
|
103 |
|
104 |
# Move the tensor to the same device as the model
|
105 |
tokens = torch.from_numpy(tokens_numpy).to(device=semanticodec.device)
|
|
|
132 |
# Load encoded data and sample rate from the .owie file
|
133 |
with open(encoded_file_path, 'rb') as temp_file:
|
134 |
sample_rate = int.from_bytes(temp_file.read(4), byteorder='little')
|
135 |
+
ndim = int.from_bytes(temp_file.read(4), byteorder='little')
|
136 |
+
shape = tuple(int.from_bytes(temp_file.read(4), byteorder='little') for _ in range(ndim))
|
137 |
compressed_data = temp_file.read()
|
138 |
tokens_numpy_bytes = lz4.frame.decompress(compressed_data)
|
139 |
+
tokens_numpy = np.frombuffer(tokens_numpy_bytes, dtype=np.int64).reshape(shape)
|
140 |
|
141 |
# Move the tensor to the same device as the model
|
142 |
tokens = torch.from_numpy(tokens_numpy).to(device=semanticodec.device)
|
|
|
145 |
print(f"Model device: {semanticodec.device}")
|
146 |
|
147 |
# Decode the audio in chunks
|
148 |
+
chunk_size = sample_rate // 2 # Adjust chunk size to account for the new shape
|
149 |
with torch.no_grad():
|
150 |
+
for i in range(0, tokens.shape[1], chunk_size):
|
151 |
if cancel_stream:
|
152 |
break # Exit the loop if cancellation is requested
|
153 |
|
154 |
+
tokens_chunk = tokens[:, i:i+chunk_size, :]
|
155 |
audio_chunk = semanticodec.decode(tokens_chunk)
|
156 |
# Convert to numpy array and transpose
|
157 |
audio_data = audio_chunk.squeeze(0).cpu().numpy().T
|