mini-omni-s2s / slam_llm /utils /dataset_utils.py
xcczach's picture
Upload 73 files
35c1cfd verified
raw
history blame
2.03 kB
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
import importlib
from pathlib import Path
import torch
import logging
logger = logging.getLogger(__name__)
def load_module_from_py_file(py_file: str) -> object:
    """
    Load and execute a module from a .py file which is not in the Python path.

    Args:
        py_file: Path to the Python source file to load.

    Returns:
        The executed module object. Its module name is the file's base name
        (including the ``.py`` suffix). This function does not insert the
        module into ``sys.modules``.
    """
    # Import the submodules explicitly: a bare ``import importlib`` does not
    # guarantee that ``importlib.machinery`` / ``importlib.util`` are bound as
    # attributes of the package, which can surface as an AttributeError.
    from importlib.machinery import SourceFileLoader
    from importlib.util import module_from_spec, spec_from_loader

    module_name = Path(py_file).name
    loader = SourceFileLoader(module_name, py_file)
    spec = spec_from_loader(module_name, loader)
    module = module_from_spec(spec)
    # Run the file's top-level code inside the freshly created module object.
    loader.exec_module(module)
    return module
def get_custom_dataset(dataset_config, tokenizer, split: str):
    """
    Build a dataset by calling a factory function defined in a user-supplied .py file.

    ``dataset_config.file`` is either ``"path/to/file.py"`` (the factory name
    then defaults to ``get_custom_dataset``) or ``"path/to/file.py:func_name"``.

    Args:
        dataset_config: Config object; its ``file`` attribute is read here and
            the whole object is forwarded to the factory.
        tokenizer: Forwarded unchanged to the factory.
        split: Forwarded unchanged to the factory.

    Returns:
        Whatever the factory function returns.

    Raises:
        ValueError: If the module path does not end with ``.py``.
        FileNotFoundError: If the module path is not an existing file.
        AttributeError: If the file does not define the requested function.
    """
    file_spec = dataset_config.file
    # Split on the LAST colon only, and only when the spec does not already end
    # in ".py". This keeps paths that themselves contain a colon (e.g. Windows
    # drive letters such as "C:\\data\\ds.py") from being mis-parsed.
    module_path, sep, func_name = file_spec.rpartition(":")
    if not sep or file_spec.endswith(".py"):
        module_path, func_name = file_spec, "get_custom_dataset"

    if not module_path.endswith(".py"):
        raise ValueError(f"Dataset file {module_path} is not a .py file.")

    module_path = Path(module_path)
    if not module_path.is_file():
        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")

    module = load_module_from_py_file(module_path.as_posix())
    # Guard only the attribute lookup: an AttributeError raised *inside* the
    # factory must propagate as-is instead of being reported as a missing name.
    try:
        dataset_factory = getattr(module, func_name)
    except AttributeError:
        logger.info(f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).")
        raise
    return dataset_factory(dataset_config, tokenizer, split)
def get_preprocessed_dataset(
    tokenizer, dataset_config, split: str = "train"
) -> torch.utils.data.Dataset:
    """
    Resolve the configured split name and build the dataset via ``get_custom_dataset``.

    Args:
        tokenizer: Forwarded unchanged to ``get_custom_dataset``.
        dataset_config: Config object providing ``train_split`` and ``test_split``;
            also forwarded to ``get_custom_dataset``.
        split: ``"train"`` selects ``dataset_config.train_split``; any other
            value selects ``dataset_config.test_split``.

    Returns:
        The object returned by ``get_custom_dataset``.
    """
    if split == "train":
        resolved_split = dataset_config.train_split
    else:
        resolved_split = dataset_config.test_split
    return get_custom_dataset(dataset_config, tokenizer, resolved_split)