ejbejaranos
/

BitNet3-8B-Converted

Model card Files Files and versions Community

BitNet3-8B-Converted / tokenizer_utils.py

ejbejaranos's picture

Upload folder using huggingface_hub

4d061f7 verified 3 months ago

history blame contribute delete

1.38 kB

	from typing import Dict, Any
	from transformers import PreTrainedTokenizer

	# Tokenization helper: tokenizes texts and packs the tokens into fixed-length chunks.
	def tokenize(element: Dict[str, Any], tokenizer: "PreTrainedTokenizer", context_length: int) -> Dict[str, Any]:
	    """Tokenize texts and pack the tokens into fixed-length chunks.

	    Each tokenized document is terminated with ``tokenizer.eos_token_id`` and
	    all documents are concatenated into one token stream. The stream is then
	    cut into consecutive, non-overlapping chunks of exactly ``context_length``
	    tokens. A trailing remainder shorter than ``context_length`` is discarded.

	    Args:
	        element: Dictionary whose ``"text"`` entry holds the text to tokenize.
	            Each entry of the tokenizer's ``"input_ids"`` output is treated as
	            one document's token list, so this is presumably a batch (list of
	            strings) -- confirm against the caller.
	        tokenizer: Hugging Face tokenizer used to encode the text; its
	            ``eos_token_id`` is appended after every document.
	        context_length: Number of tokens in each output chunk. Must be positive.

	    Returns:
	        A dictionary whose ``"input_ids"`` entry is the list of full-size chunks
	        (possibly empty when fewer than ``context_length`` tokens are available).

	    Raises:
	        ValueError: If ``context_length`` is not positive.
	    """
	    if context_length <= 0:
	        raise ValueError(f"context_length must be a positive integer, got {context_length!r}")

	    # Truncation is disabled so every document is tokenized in full; the
	    # chunking into context_length pieces happens below on the packed stream.
	    # NOTE(review): with truncation=False the max_length / overflow arguments
	    # presumably have no effect -- confirm against the transformers docs.
	    outputs = tokenizer(
	        element["text"],
	        truncation=False,
	        max_length=context_length,
	        return_overflowing_tokens=True,
	        return_length=True,
	    )

	    # Pack all documents into a single stream, separating them with EOS.
	    eos_id = tokenizer.eos_token_id
	    token_stream = []
	    for doc_ids in outputs["input_ids"]:
	        token_stream.extend(doc_ids)
	        token_stream.append(eos_id)

	    # The last valid start offset is len - context_length; the "+ 1" makes the
	    # range include it, so a stream whose length is an exact multiple of
	    # context_length keeps its final full chunk.
	    last_start = len(token_stream) - context_length
	    input_batch = [
	        token_stream[start:start + context_length]
	        for start in range(0, last_start + 1, context_length)
	    ]

	    return {"input_ids": input_batch}