#
# GGUF file reading/modification support. For API usage information,
# please see the files in scripts/ for some fairly simple examples.
#
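# A minimal usage sketch (not from the upstream examples; 'model.gguf' and the
# 'general.architecture' key are only illustrative):
#
#   reader = GGUFReader('model.gguf')
#   field  = reader.get_field('general.architecture')
#   if field is not None:
#       print(bytes(field.parts[field.data[0]]).decode('utf-8'))
#   for tensor in reader.tensors:
#       print(tensor.name, tensor.shape.tolist(), tensor.tensor_type.name)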
from __future__ import annotations
import logging
import os
from collections import OrderedDict
from typing import Any, Literal, NamedTuple, TypeVar, Union
import numpy as np
import numpy.typing as npt
from .quants import quant_shape_to_byte_shape
if __name__ == "__main__":
    import sys
    from pathlib import Path

    # Allow running file in package as a script.
    sys.path.insert(0, str(Path(__file__).parent.parent))

from gguf.constants import (
    GGML_QUANT_SIZES,
    GGUF_DEFAULT_ALIGNMENT,
    GGUF_MAGIC,
    GGUF_VERSION,
    GGMLQuantizationType,
    GGUFValueType,
)
logger = logging.getLogger(__name__)
READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]
class ReaderField(NamedTuple):
    # Offset to start of this field.
    offset: int

    # Name of the field (not necessarily from file data).
    name: str

    # Data parts. Some types have multiple components, such as strings
    # that consist of a length followed by the string data.
    parts: list[npt.NDArray[Any]] = []

    # Indexes into parts that hold the actual data. For example, an array
    # of strings will be populated with indexes to each string's data.
    data: list[int] = [-1]

    types: list[GGUFValueType] = []
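    # For example, a STRING key/value pair is stored as five parts:
    # [key_len, key_data, value_type, value_len, value_data], with
    # data == [4] pointing at value_data (see _build_fields below).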
class ReaderTensor(NamedTuple):
    name: str
    tensor_type: GGMLQuantizationType
    shape: npt.NDArray[np.uint32]
    n_elements: int
    n_bytes: int
    data_offset: int
    data: npt.NDArray[Any]
    field: ReaderField
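    # Note: for quantized tensor types, `data` is a view of the raw bytes, so
    # its size equals n_bytes rather than n_elements; see _build_tensors below.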
class GGUFReader:
    # I - same as host, S - swapped
    byte_order: Literal['I', 'S'] = 'I'
    alignment: int = GGUF_DEFAULT_ALIGNMENT
    data_offset: int

    # Note: Internal helper, API may change.
    gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
        GGUFValueType.UINT8:   np.uint8,
        GGUFValueType.INT8:    np.int8,
        GGUFValueType.UINT16:  np.uint16,
        GGUFValueType.INT16:   np.int16,
        GGUFValueType.UINT32:  np.uint32,
        GGUFValueType.INT32:   np.int32,
        GGUFValueType.FLOAT32: np.float32,
        GGUFValueType.UINT64:  np.uint64,
        GGUFValueType.INT64:   np.int64,
        GGUFValueType.FLOAT64: np.float64,
        GGUFValueType.BOOL:    np.bool_,
    }
    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = 'r'):
        self.data = np.memmap(path, mode = mode)
        offs = 0

        # Check for GGUF magic
        if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
            raise ValueError('GGUF magic invalid')
        offs += 4

        # Check GGUF version
        temp_version = self._get(offs, np.uint32)
        if temp_version[0] & 65535 == 0:
            # If we get 0 here that means it's (probably) a GGUF file created for
            # the opposite byte order of the machine this script is running on.
            self.byte_order = 'S'
            temp_version = temp_version.newbyteorder(self.byte_order)
        version = temp_version[0]
        if version not in READER_SUPPORTED_VERSIONS:
            raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle')
        self.fields: OrderedDict[str, ReaderField] = OrderedDict()
        self.tensors: list[ReaderTensor] = []
        offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))

        # Check tensor count and kv count
        temp_counts = self._get(offs, np.uint64, 2)
        offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
        offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
        tensor_count, kv_count = temp_counts
        offs = self._build_fields(offs, kv_count)

        # Build Tensor Info Fields
        offs, tensors_fields = self._build_tensor_info(offs, tensor_count)

        new_align = self.fields.get('general.alignment')
        if new_align is not None:
            if new_align.types != [GGUFValueType.UINT32]:
                raise ValueError('Bad type for general.alignment field')
            self.alignment = new_align.parts[-1][0]
        padding = offs % self.alignment
        if padding != 0:
            offs += self.alignment - padding
        self.data_offset = offs
        self._build_tensors(offs, tensors_fields)
    _DT = TypeVar('_DT', bound = npt.DTypeLike)

    # Fetch a key/value metadata field by key.
    def get_field(self, key: str) -> Union[ReaderField, None]:
        return self.fields.get(key, None)

    # Fetch a tensor from the list by index.
    def get_tensor(self, idx: int) -> ReaderTensor:
        return self.tensors[idx]
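    # For example (a hedged sketch; 'llama.context_length' is only an
    # illustrative key), a scalar UINT32 field can be decoded as:
    #   field = reader.get_field('llama.context_length')
    #   if field is not None:
    #       value = int(field.parts[field.data[0]][0])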
    def _get(
        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
    ) -> npt.NDArray[Any]:
        count = int(count)
        itemsize = int(np.empty([], dtype = dtype).itemsize)
        end_offs = offset + itemsize * count
        return (
            self.data[offset:end_offs]
            .view(dtype = dtype)[:count]
            .newbyteorder(override_order or self.byte_order)
        )

    def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
        if field.name in self.fields:
            # TODO: add option to generate error on duplicate keys
            # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
            logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
            self.fields[field.name + '_{}'.format(field.offset)] = field
        else:
            self.fields[field.name] = field
        return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts)

    def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]:
        slen = self._get(offset, np.uint64)
        return slen, self._get(offset + 8, np.uint8, slen[0])
    def _get_field_parts(
        self, orig_offs: int, raw_type: int,
    ) -> tuple[int, list[npt.NDArray[Any]], list[int], list[GGUFValueType]]:
        offs = orig_offs
        types: list[GGUFValueType] = []
        gtype = GGUFValueType(raw_type)
        types.append(gtype)
        # Handle strings.
        if gtype == GGUFValueType.STRING:
            sparts: list[npt.NDArray[Any]] = list(self._get_str(offs))
            size = sum(int(part.nbytes) for part in sparts)
            return size, sparts, [1], types
        # Check if it's a simple scalar type.
        nptype = self.gguf_scalar_to_np.get(gtype)
        if nptype is not None:
            val = self._get(offs, nptype)
            return int(val.nbytes), [val], [0], types
        # Handle arrays.
        if gtype == GGUFValueType.ARRAY:
            raw_itype = self._get(offs, np.uint32)
            offs += int(raw_itype.nbytes)
            alen = self._get(offs, np.uint64)
            offs += int(alen.nbytes)
            aparts: list[npt.NDArray[Any]] = [raw_itype, alen]
            data_idxs: list[int] = []
            for idx in range(alen[0]):
                curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0])
                if idx == 0:
                    types += curr_types
                idxs_offs = len(aparts)
                aparts += curr_parts
                data_idxs += (idx + idxs_offs for idx in curr_idxs)
                offs += curr_size
            return offs - orig_offs, aparts, data_idxs, types
        # We can't deal with this one.
        raise ValueError(f'Unknown/unhandled field type {gtype}')
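    # For an ARRAY field the parts returned above are laid out as
    # [item_type, array_len, elem0 parts..., elem1 parts..., ...]; for an
    # array of strings that is [itype, alen, len0, str0, len1, str1, ...]
    # and the data indexes point at the string payloads: [3, 5, ...].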
    def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
        offs = orig_offs

        # Get Tensor Name
        name_len, name_data = self._get_str(offs)
        offs += int(name_len.nbytes + name_data.nbytes)

        # Get Tensor Dimensions Count
        n_dims = self._get(offs, np.uint32)
        offs += int(n_dims.nbytes)

        # Get Tensor Dimension Array
        dims = self._get(offs, np.uint64, n_dims[0])
        offs += int(dims.nbytes)

        # Get Tensor Encoding Scheme Type
        raw_dtype = self._get(offs, np.uint32)
        offs += int(raw_dtype.nbytes)

        # Get Tensor Offset
        offset_tensor = self._get(offs, np.uint64)
        offs += int(offset_tensor.nbytes)

        return ReaderField(
            orig_offs,
            str(bytes(name_data), encoding = 'utf-8'),
            [name_len, name_data, n_dims, dims, raw_dtype, offset_tensor],
            [1, 3, 4, 5],
        )
    def _build_fields(self, offs: int, count: int) -> int:
        for _ in range(count):
            orig_offs = offs
            kv_klen, kv_kdata = self._get_str(offs)
            offs += int(kv_klen.nbytes + kv_kdata.nbytes)
            raw_kv_type = self._get(offs, np.uint32)
            offs += int(raw_kv_type.nbytes)
            parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type]
            idxs_offs = len(parts)
            field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0])
            parts += field_parts
            self._push_field(ReaderField(
                orig_offs,
                str(bytes(kv_kdata), encoding = 'utf-8'),
                parts,
                [idx + idxs_offs for idx in field_idxs],
                field_types,
            ), skip_sum = True)
            offs += field_size
        return offs
    def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
        tensor_fields = []
        for _ in range(count):
            field = self._get_tensor_info_field(offs)
            offs += sum(int(part.nbytes) for part in field.parts)
            tensor_fields.append(field)
        return offs, tensor_fields
    def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
        tensors = []
        tensor_names = set() # keep track of names to prevent duplicated tensors
        for field in fields:
            _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts
            # check if a tensor with the same name is already in the list
            tensor_name = str(bytes(name_data), encoding = 'utf-8')
            if tensor_name in tensor_names:
                raise ValueError(f'Found duplicated tensor with name {tensor_name}')
            tensor_names.add(tensor_name)
            ggml_type = GGMLQuantizationType(raw_dtype[0])
            n_elems = int(np.prod(dims))
            np_dims = tuple(reversed(dims.tolist()))
            block_size, type_size = GGML_QUANT_SIZES[ggml_type]
            n_bytes = n_elems * type_size // block_size
            data_offs = int(start_offs + offset_tensor[0])
            item_type: npt.DTypeLike
            if ggml_type == GGMLQuantizationType.F16:
                item_count = n_elems
                item_type = np.float16
            elif ggml_type == GGMLQuantizationType.F32:
                item_count = n_elems
                item_type = np.float32
            elif ggml_type == GGMLQuantizationType.F64:
                item_count = n_elems
                item_type = np.float64
            elif ggml_type == GGMLQuantizationType.I8:
                item_count = n_elems
                item_type = np.int8
            elif ggml_type == GGMLQuantizationType.I16:
                item_count = n_elems
                item_type = np.int16
            elif ggml_type == GGMLQuantizationType.I32:
                item_count = n_elems
                item_type = np.int32
            elif ggml_type == GGMLQuantizationType.I64:
                item_count = n_elems
                item_type = np.int64
            else:
                # Quantized types are exposed as raw bytes; convert the logical
                # shape to the byte shape of the quantized blocks.
                item_count = n_bytes
                item_type = np.uint8
                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
            tensors.append(ReaderTensor(
                name = tensor_name,
                tensor_type = ggml_type,
                shape = dims,
                n_elements = n_elems,
                n_bytes = n_bytes,
                data_offset = data_offs,
                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                field = field,
            ))
        self.tensors = tensors
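

if __name__ == "__main__":
    # Hedged demo, not part of the upstream file: dump the metadata keys and
    # tensor info of a GGUF file passed on the command line.
    import sys

    if len(sys.argv) > 1:
        reader = GGUFReader(sys.argv[1], 'r')
        for name, field in reader.fields.items():
            print(f'{name}: {[t.name for t in field.types]}')
        for tensor in reader.tensors:
            print(f'{tensor.name}: {tensor.tensor_type.name} {tensor.shape.tolist()}')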