Spaces:

amphion
/

maskgct

Running on Zero

App Files Files Community

maskgct / models /codec /speechtokenizer /modules /quantization /ac.py

Hecheng0625

Upload 409 files

c968fc3 verified 20 days ago

raw

history blame contribute delete

13.3 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.
	# This source file is copied from https://github.com/facebookresearch/encodec

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.

	"""Arithmetic coder."""

	import io
	import math
	import random
	import typing as tp
	import torch

	from ..binary import BitPacker, BitUnpacker


	def build_stable_quantized_cdf(
	pdf: torch.Tensor,
	total_range_bits: int,
	roundoff: float = 1e-8,
	min_range: int = 2,
	check: bool = True,
	) -> torch.Tensor:
	"""Turn the given PDF into a quantized CDF that splits
	[0, 2 ** self.total_range_bits - 1] into chunks of size roughly proportional
	to the PDF.

	Args:
	pdf (torch.Tensor): probability distribution, shape should be `[N]`.
	total_range_bits (int): see `ArithmeticCoder`, the typical range we expect
	during the coding process is `[0, 2 ** total_range_bits - 1]`.
	roundoff (float): will round the pdf up to that level to remove difference coming
	from e.g. evaluating the Language Model on different architectures.
	min_range (int): minimum range width. Should always be at least 2 for numerical
	stability. Use this to avoid pathological behavior is a value
	that is expected to be rare actually happens in real life.
	check (bool): if True, checks that nothing bad happened, can be deactivated for speed.
	"""
	pdf = pdf.detach()
	if roundoff:
	pdf = (pdf / roundoff).floor() * roundoff
	# interpolate with uniform distribution to achieve desired minimum probability.
	total_range = 2**total_range_bits
	cardinality = len(pdf)
	alpha = min_range * cardinality / total_range
	assert alpha <= 1, "you must reduce min_range"
	ranges = (((1 - alpha) * total_range) * pdf).floor().long()
	ranges += min_range
	quantized_cdf = torch.cumsum(ranges, dim=-1)
	if min_range < 2:
	raise ValueError("min_range must be at least 2.")
	if check:
	assert quantized_cdf[-1] <= 2**total_range_bits, quantized_cdf[-1]
	if (
	(quantized_cdf[1:] - quantized_cdf[:-1]) < min_range
	).any() or quantized_cdf[0] < min_range:
	raise ValueError("You must increase your total_range_bits.")
	return quantized_cdf


	class ArithmeticCoder:
	"""ArithmeticCoder,
	Let us take a distribution `p` over `N` symbols, and assume we have a stream
	of random variables `s_t` sampled from `p`. Let us assume that we have a budget
	of `B` bits that we can afford to write on device. There are `2**B` possible numbers,
	corresponding to the range `[0, 2 ** B - 1]`. We can map each of those number to a single
	sequence `(s_t)` by doing the following:

	1) Initialize the current range to` [0 ** 2 B - 1]`.
	2) For each time step t, split the current range into contiguous chunks,
	one for each possible outcome, with size roughly proportional to `p`.
	For instance, if `p = [0.75, 0.25]`, and the range is `[0, 3]`, the chunks
	would be `{[0, 2], [3, 3]}`.
	3) Select the chunk corresponding to `s_t`, and replace the current range with this.
	4) When done encoding all the values, just select any value remaining in the range.

	You will notice that this procedure can fail: for instance if at any point in time
	the range is smaller than `N`, then we can no longer assign a non-empty chunk to each
	possible outcome. Intuitively, the more likely a value is, the less the range width
	will reduce, and the longer we can go on encoding values. This makes sense: for any efficient
	coding scheme, likely outcomes would take less bits, and more of them can be coded
	with a fixed budget.

	In practice, we do not know `B` ahead of time, but we have a way to inject new bits
	when the current range decreases below a given limit (given by `total_range_bits`), without
	having to redo all the computations. If we encode mostly likely values, we will seldom
	need to inject new bits, but a single rare value can deplete our stock of entropy!

	In this explanation, we assumed that the distribution `p` was constant. In fact, the present
	code works for any sequence `(p_t)` possibly different for each timestep.
	We also assume that `s_t ~ p_t`, but that doesn't need to be true, although the smaller
	the KL between the true distribution and `p_t`, the most efficient the coding will be.

	Args:
	fo (IO[bytes]): file-like object to which the bytes will be written to.
	total_range_bits (int): the range `M` described above is `2 ** total_range_bits.
	Any time the current range width fall under this limit, new bits will
	be injected to rescale the initial range.
	"""

	def __init__(self, fo: tp.IO[bytes], total_range_bits: int = 24):
	assert total_range_bits <= 30
	self.total_range_bits = total_range_bits
	self.packer = BitPacker(bits=1, fo=fo) # we push single bits at a time.
	self.low: int = 0
	self.high: int = 0
	self.max_bit: int = -1
	self._dbg: tp.List[tp.Any] = []
	self._dbg2: tp.List[tp.Any] = []

	@property
	def delta(self) -> int:
	"""Return the current range width."""
	return self.high - self.low + 1

	def _flush_common_prefix(self):
	# If self.low and self.high start with the sames bits,
	# those won't change anymore as we always just increase the range
	# by powers of 2, and we can flush them out to the bit stream.
	assert self.high >= self.low, (self.low, self.high)
	assert self.high < 2 ** (self.max_bit + 1)
	while self.max_bit >= 0:
	b1 = self.low >> self.max_bit
	b2 = self.high >> self.max_bit
	if b1 == b2:
	self.low -= b1 << self.max_bit
	self.high -= b1 << self.max_bit
	assert self.high >= self.low, (self.high, self.low, self.max_bit)
	assert self.low >= 0
	self.max_bit -= 1
	self.packer.push(b1)
	else:
	break

	def push(self, symbol: int, quantized_cdf: torch.Tensor):
	"""Push the given symbol on the stream, flushing out bits
	if possible.

	Args:
	symbol (int): symbol to encode with the AC.
	quantized_cdf (torch.Tensor): use `build_stable_quantized_cdf`
	to build this from your pdf estimate.
	"""
	while self.delta < 2**self.total_range_bits:
	self.low *= 2
	self.high = self.high * 2 + 1
	self.max_bit += 1

	range_low = 0 if symbol == 0 else quantized_cdf[symbol - 1].item()
	range_high = quantized_cdf[symbol].item() - 1
	effective_low = int(
	math.ceil(range_low * (self.delta / (2**self.total_range_bits)))
	)
	effective_high = int(
	math.floor(range_high * (self.delta / (2**self.total_range_bits)))
	)
	assert self.low <= self.high
	self.high = self.low + effective_high
	self.low = self.low + effective_low
	assert self.low <= self.high, (
	effective_low,
	effective_high,
	range_low,
	range_high,
	)
	self._dbg.append((self.low, self.high))
	self._dbg2.append((self.low, self.high))
	outs = self._flush_common_prefix()
	assert self.low <= self.high
	assert self.max_bit >= -1
	assert self.max_bit <= 61, self.max_bit
	return outs

	def flush(self):
	"""Flush the remaining information to the stream."""
	while self.max_bit >= 0:
	b1 = (self.low >> self.max_bit) & 1
	self.packer.push(b1)
	self.max_bit -= 1
	self.packer.flush()


	class ArithmeticDecoder:
	"""ArithmeticDecoder, see `ArithmeticCoder` for a detailed explanation.

	Note that this must be called with exactly the same parameters and sequence
	of quantized cdf as the arithmetic encoder or the wrong values will be decoded.

	If the AC encoder current range is [L, H], with `L` and `H` having the some common
	prefix (i.e. the same most significant bits), then this prefix will be flushed to the stream.
	For instances, having read 3 bits `b1 b2 b3`, we know that `[L, H]` is contained inside
	`[b1 b2 b3 0 ... 0 b1 b3 b3 1 ... 1]`. Now this specific sub-range can only be obtained
	for a specific sequence of symbols and a binary-search allows us to decode those symbols.
	At some point, the prefix `b1 b2 b3` will no longer be sufficient to decode new symbols,
	and we will need to read new bits from the stream and repeat the process.

	"""

	def __init__(self, fo: tp.IO[bytes], total_range_bits: int = 24):
	self.total_range_bits = total_range_bits
	self.low: int = 0
	self.high: int = 0
	self.current: int = 0
	self.max_bit: int = -1
	self.unpacker = BitUnpacker(bits=1, fo=fo) # we pull single bits at a time.
	# Following is for debugging
	self._dbg: tp.List[tp.Any] = []
	self._dbg2: tp.List[tp.Any] = []
	self._last: tp.Any = None

	@property
	def delta(self) -> int:
	return self.high - self.low + 1

	def _flush_common_prefix(self):
	# Given the current range [L, H], if both have a common prefix,
	# we know we can remove it from our representation to avoid handling large numbers.
	while self.max_bit >= 0:
	b1 = self.low >> self.max_bit
	b2 = self.high >> self.max_bit
	if b1 == b2:
	self.low -= b1 << self.max_bit
	self.high -= b1 << self.max_bit
	self.current -= b1 << self.max_bit
	assert self.high >= self.low
	assert self.low >= 0
	self.max_bit -= 1
	else:
	break

	def pull(self, quantized_cdf: torch.Tensor) -> tp.Optional[int]:
	"""Pull a symbol, reading as many bits from the stream as required.
	This returns `None` when the stream has been exhausted.

	Args:
	quantized_cdf (torch.Tensor): use `build_stable_quantized_cdf`
	to build this from your pdf estimate. This must be exatly
	the same cdf as the one used at encoding time.
	"""
	while self.delta < 2**self.total_range_bits:
	bit = self.unpacker.pull()
	if bit is None:
	return None
	self.low *= 2
	self.high = self.high * 2 + 1
	self.current = self.current * 2 + bit
	self.max_bit += 1

	def bin_search(low_idx: int, high_idx: int):
	# Binary search is not just for coding interviews :)
	if high_idx < low_idx:
	raise RuntimeError("Binary search failed")
	mid = (low_idx + high_idx) // 2
	range_low = quantized_cdf[mid - 1].item() if mid > 0 else 0
	range_high = quantized_cdf[mid].item() - 1
	effective_low = int(
	math.ceil(range_low * (self.delta / (2**self.total_range_bits)))
	)
	effective_high = int(
	math.floor(range_high * (self.delta / (2**self.total_range_bits)))
	)
	low = effective_low + self.low
	high = effective_high + self.low
	if self.current >= low:
	if self.current <= high:
	return (mid, low, high, self.current)
	else:
	return bin_search(mid + 1, high_idx)
	else:
	return bin_search(low_idx, mid - 1)

	self._last = (self.low, self.high, self.current, self.max_bit)
	sym, self.low, self.high, self.current = bin_search(0, len(quantized_cdf) - 1)
	self._dbg.append((self.low, self.high, self.current))
	self._flush_common_prefix()
	self._dbg2.append((self.low, self.high, self.current))

	return sym


	def test():
	torch.manual_seed(1234)
	random.seed(1234)
	for _ in range(4):
	pdfs = []
	cardinality = random.randrange(4000)
	steps = random.randrange(100, 500)
	fo = io.BytesIO()
	encoder = ArithmeticCoder(fo)
	symbols = []
	for step in range(steps):
	pdf = torch.softmax(torch.randn(cardinality), dim=0)
	pdfs.append(pdf)
	q_cdf = build_stable_quantized_cdf(pdf, encoder.total_range_bits)
	symbol = torch.multinomial(pdf, 1).item()
	symbols.append(symbol)
	encoder.push(symbol, q_cdf)
	encoder.flush()

	fo.seek(0)
	decoder = ArithmeticDecoder(fo)
	for idx, (pdf, symbol) in enumerate(zip(pdfs, symbols)):
	q_cdf = build_stable_quantized_cdf(pdf, encoder.total_range_bits)
	decoded_symbol = decoder.pull(q_cdf)
	assert decoded_symbol == symbol, idx
	assert decoder.pull(torch.zeros(1)) is None


	if __name__ == "__main__":
	test()