uie-base / decode_utils.py

Upload 9 files

26443e3 verified 2 months ago

26.9 kB

	import logging
	import math
	import os
	import queue
	import re
	from multiprocessing import Queue
	from typing import (
	List,
	Tuple,
	Union,
	Dict,
	Any,
	Set,
	TYPE_CHECKING,
	Optional,
	Literal,
	)

	import numpy as np
	import torch
	import torch.multiprocessing as mp
	import torch.nn as nn
	from tqdm import tqdm
	from transformers import is_torch_npu_available

	if TYPE_CHECKING:
	from transformers import PreTrainedTokenizer


	os.environ["PYTHONWARNINGS"] = "ignore"
	logger = logging.getLogger("FASTIE")


	def get_id_and_prob(spans, offset_map):
	prompt_length = 0
	for i in range(1, len(offset_map)):
	if offset_map[i] != [0, 0]:
	prompt_length += 1
	else:
	break

	for i in range(1, prompt_length + 1):
	offset_map[i][0] -= (prompt_length + 1)
	offset_map[i][1] -= (prompt_length + 1)

	sentence_id = []
	prob = []
	for start, end in spans:
	prob.append(float(start[1] * end[1]))
	sentence_id.append(
	(offset_map[start[0]][0], offset_map[end[0]][1]))
	return sentence_id, prob


	def get_span(
	start_ids: Union[List[int], List[Tuple[int, float]]],
	end_ids: Union[List[int], List[Tuple[int, float]]],
	with_prob: bool = False
	) -> Set[Tuple[int, int]]:
	"""
	Get span set from position start and end list.
	Args:
	start_ids (List[int]/List[tuple]): The start index list.
	end_ids (List[int]/List[tuple]): The end index list.
	with_prob (bool): If True, each element for start_ids and end_ids is a tuple aslike: (index, probability).
	Returns:
	set: The span set without overlapping, every id can only be used once.
	"""
	if with_prob:
	start_ids = sorted(start_ids, key=lambda x: x[0])
	end_ids = sorted(end_ids, key=lambda x: x[0])
	else:
	start_ids = sorted(start_ids)
	end_ids = sorted(end_ids)

	start_pointer = 0
	end_pointer = 0
	len_start = len(start_ids)
	len_end = len(end_ids)
	couple_dict = {}

	# 将每一个span的首/尾token的id进行配对（就近匹配，默认没有overlap的情况）
	while start_pointer < len_start and end_pointer < len_end:
	if with_prob:
	start_id = start_ids[start_pointer][0]
	end_id = end_ids[end_pointer][0]
	else:
	start_id = start_ids[start_pointer]
	end_id = end_ids[end_pointer]

	if start_id == end_id:
	couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
	start_pointer += 1
	end_pointer += 1
	continue

	if start_id < end_id:
	couple_dict[end_ids[end_pointer]] = start_ids[start_pointer]
	start_pointer += 1
	continue

	if start_id > end_id:
	end_pointer += 1
	continue

	result = [(couple_dict[end], end) for end in couple_dict]
	result = set(result)
	return result


	def get_bool_ids_greater_than(
	probs: List[List[float]], limit: float = 0.5, return_prob: bool = False
	) -> List[List[int]]:
	"""
	Get idx of the last dimension in probability arrays, which is greater than a limitation.
	Args:
	probs (List[List[float]]): The input probability arrays.
	limit (float): The limitation for probability.
	return_prob (bool): Whether to return the probability
	Returns:
	List[List[int]]: The index of the last dimension meet the conditions.
	"""
	probs = np.array(probs)
	dim_len = len(probs.shape)
	if dim_len > 1:
	result = []
	for p in probs:
	result.append(get_bool_ids_greater_than(p, limit, return_prob))
	return result
	else:
	result = []
	for i, p in enumerate(probs):
	if p > limit:
	if return_prob:
	result.append((i, p))
	else:
	result.append(i)
	return result


	def dbc2sbc(s) -> str:
	rs = ""
	for char in s:
	code = ord(char)
	if code == 0x3000:
	code = 0x0020
	else:
	code -= 0xfee0
	if not (0x0021 <= code <= 0x7e):
	rs += char
	continue
	rs += chr(code)
	return rs


	def cut_chinese_sent(para: str) -> List[str]:
	"""
	Cut the Chinese sentences more precisely, reference to
	"https://blog.csdn.net/blmoistawinde/article/details/82379256".
	"""
	para = re.sub(r'([。！？\?])([^”’])', r'\1\n\2', para)
	para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)
	para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)
	para = re.sub(r'([。！？\?][”’])([^，。！？\?])', r'\1\n\2', para)
	para = para.rstrip()
	return para.split("\n")


	class UIEDecoder(nn.Module):

	keys_to_ignore_on_gpu = ["offset_mapping", "texts"]

	@torch.inference_mode()
	def predict(
	self,
	tokenizer: "PreTrainedTokenizer",
	texts: Union[List[str], str],
	schema: Optional[Any] = None,
	batch_size: int = 64,
	max_length: int = 512,
	split_sentence: bool = False,
	position_prob: float = 0.5,
	language: Optional[str] = "zh",
	show_progress_bar: bool = None,
	device: Optional[str] = None,
	) -> List[Any]:
	self.eval()
	self.is_english = False if language.lower() in ["zh", "zh-cn", "chinese"] else True
	if schema is not None:
	self.set_schema(schema)

	if show_progress_bar is None:
	show_progress_bar = (
	logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
	)
	# Cast an individual text to a list with length 1
	if isinstance(texts, str) or not hasattr(texts, "__len__"):
	texts = [texts]

	if device is None:
	device = next(self.parameters()).device

	self.to(device)

	return self._multi_stage_predict(
	tokenizer, texts, batch_size, max_length, split_sentence, position_prob, show_progress_bar
	)

	def set_schema(self, schema):
	if isinstance(schema, (dict, str)):
	schema = [schema]
	self._schema_tree = self._build_tree(schema)

	def _multi_stage_predict(
	self,
	tokenizer: "PreTrainedTokenizer",
	texts: List[str],
	batch_size: int = 64,
	max_length: int = 512,
	split_sentence: bool = False,
	position_prob: float = 0.5,
	show_progress_bar: bool = False,
	) -> List[Any]:
	""" Traversal the schema tree and do multi-stage prediction. """
	results = [{} for _ in range(len(texts))]
	if len(texts) < 1 or self._schema_tree is None:
	return results

	schema_list = self._schema_tree.children[:]
	while len(schema_list) > 0:
	node = schema_list.pop(0)
	examples = []
	input_map = {}
	cnt = 0
	idx = 0
	if not node.prefix:
	for data in texts:
	examples.append({"text": data, "prompt": dbc2sbc(node.name)})
	input_map[cnt] = [idx]
	idx += 1
	cnt += 1
	else:
	for pre, data in zip(node.prefix, texts):
	if len(pre) == 0:
	input_map[cnt] = []
	else:
	for p in pre:
	if self.is_english:
	if re.search(r'\[.*?\]$', node.name):
	prompt_prefix = node.name[:node.name.find("[", 1)].strip()
	cls_options = re.search(r'\[.*?\]$', node.name).group()
	# Sentiment classification of xxx [positive, negative]
	prompt = prompt_prefix + p + " " + cls_options
	else:
	prompt = node.name + p
	else:
	prompt = p + node.name
	examples.append(
	{
	"text": data,
	"prompt": dbc2sbc(prompt)
	}
	)
	input_map[cnt] = [i + idx for i in range(len(pre))]
	idx += len(pre)
	cnt += 1

	result_list = self._single_stage_predict(
	tokenizer, examples, batch_size, max_length, split_sentence, position_prob, show_progress_bar
	) if examples else []
	if not node.parent_relations:
	relations = [[] for _ in range(len(texts))]
	for k, v in input_map.items():
	for idx in v:
	if len(result_list[idx]) == 0:
	continue
	if node.name not in results[k].keys():
	results[k][node.name] = result_list[idx]
	else:
	results[k][node.name].extend(result_list[idx])
	if node.name in results[k].keys():
	relations[k].extend(results[k][node.name])
	else:
	relations = node.parent_relations
	for k, v in input_map.items():
	for i in range(len(v)):
	if len(result_list[v[i]]) == 0:
	continue
	if "relations" not in relations[k][i].keys():
	relations[k][i]["relations"] = {node.name: result_list[v[i]]}
	elif node.name not in relations[k][i]["relations"].keys():
	relations[k][i]["relations"][node.name] = result_list[v[i]]
	else:
	relations[k][i]["relations"][node.name].extend(result_list[v[i]])

	new_relations = [[] for _ in range(len(texts))]
	for i in range(len(relations)):
	for j in range(len(relations[i])):
	if "relations" in relations[i][j].keys() and node.name in relations[i][j]["relations"].keys():
	for k in range(len(relations[i][j]["relations"][node.name])):
	new_relations[i].append(relations[i][j]["relations"][node.name][k])
	relations = new_relations

	prefix = [[] for _ in range(len(texts))]
	for k, v in input_map.items():
	for idx in v:
	for i in range(len(result_list[idx])):
	if self.is_english:
	prefix[k].append(" of " + result_list[idx][i]["text"])
	else:
	prefix[k].append(result_list[idx][i]["text"] + "的")

	for child in node.children:
	child.prefix = prefix
	child.parent_relations = relations
	schema_list.append(child)

	return results

	def _convert_ids_to_results(self, examples, sentence_ids, probs):
	""" Convert ids to raw text in a single stage. """
	results = []
	for example, sentence_id, prob in zip(examples, sentence_ids, probs):
	if len(sentence_id) == 0:
	results.append([])
	continue
	result_list = []
	text = example["text"]
	prompt = example["prompt"]
	for i in range(len(sentence_id)):
	start, end = sentence_id[i]
	if start < 0 and end >= 0:
	continue
	if end < 0:
	start += len(prompt) + 1
	end += len(prompt) + 1
	result = {"text": prompt[start: end], "probability": float(prob[i])}
	else:
	result = {"text": text[start: end], "start": start, "end": end, "probability": float(prob[i])}

	result_list.append(result)
	results.append(result_list)
	return results

	def _auto_splitter(self, input_texts, max_text_len, split_sentence=False):
	"""
	Split the raw texts automatically for model inference.
	Args:
	input_texts (List[str]): input raw texts.
	max_text_len (int): cutting length.
	split_sentence (bool): If True, sentence-level split will be performed.
	return:
	short_input_texts (List[str]): the short input texts for model inference.
	input_mapping (dict): mapping between raw text and short input texts.
	"""
	input_mapping = {}
	short_input_texts = []
	cnt_short = 0
	for cnt_org, text in enumerate(input_texts):
	sens = cut_chinese_sent(text) if split_sentence else [text]
	for sen in sens:
	lens = len(sen)
	if lens <= max_text_len:
	short_input_texts.append(sen)
	if cnt_org in input_mapping:
	input_mapping[cnt_org].append(cnt_short)
	else:
	input_mapping[cnt_org] = [cnt_short]
	cnt_short += 1
	else:
	temp_text_list = [sen[i: i + max_text_len] for i in range(0, lens, max_text_len)]

	short_input_texts.extend(temp_text_list)
	short_idx = cnt_short
	cnt_short += math.ceil(lens / max_text_len)
	temp_text_id = [short_idx + i for i in range(cnt_short - short_idx)]
	if cnt_org in input_mapping:
	input_mapping[cnt_org].extend(temp_text_id)
	else:
	input_mapping[cnt_org] = temp_text_id
	return short_input_texts, input_mapping

	def _single_stage_predict(
	self,
	tokenizer: "PreTrainedTokenizer",
	inputs: List[dict],
	batch_size: int = 64,
	max_length: int = 512,
	split_sentence: bool = False,
	position_prob: float = 0.5,
	show_progress_bar: bool = False,
	) -> List[Any]:
	input_texts = []
	prompts = []
	for i in range(len(inputs)):
	input_texts.append(inputs[i]["text"])
	prompts.append(inputs[i]["prompt"])
	# max predict length should exclude the length of prompt and summary tokens
	max_predict_len = max_length - len(max(prompts)) - 3

	short_input_texts, input_mapping = self._auto_splitter(
	input_texts, max_predict_len, split_sentence=split_sentence
	)

	short_texts_prompts = []
	for k, v in input_mapping.items():
	short_texts_prompts.extend([prompts[k] for _ in range(len(v))])
	short_inputs = [
	{
	"text": short_input_texts[i],
	"prompt": short_texts_prompts[i]
	}
	for i in range(len(short_input_texts))
	]

	encoded_inputs = tokenizer(
	text=short_texts_prompts,
	text_pair=short_input_texts,
	stride=2,
	truncation=True,
	max_length=512,
	padding="max_length",
	add_special_tokens=True,
	return_offsets_mapping=True,
	return_tensors="np",
	)
	offset_maps = encoded_inputs["offset_mapping"]

	start_prob_concat, end_prob_concat = [], []

	batch_iterator = tqdm(range(0, len(short_input_texts), batch_size), desc="Batches", disable=not show_progress_bar)
	for batch_start in batch_iterator:
	batch = {
	key:
	np.array(value[batch_start: batch_start + batch_size], dtype="int64")
	for key, value in encoded_inputs.items() if key not in self.keys_to_ignore_on_gpu
	}

	for k, v in batch.items():
	batch[k] = torch.tensor(v, device=self.device)

	outputs = self(**batch)
	start_prob, end_prob = outputs[0], outputs[1]
	if self.device != torch.device("cpu"):
	start_prob, end_prob = start_prob.cpu(), end_prob.cpu()
	start_prob_concat.append(start_prob.detach().numpy())
	end_prob_concat.append(end_prob.detach().numpy())

	start_prob_concat = np.concatenate(start_prob_concat)
	end_prob_concat = np.concatenate(end_prob_concat)

	start_ids_list = get_bool_ids_greater_than(start_prob_concat, limit=position_prob, return_prob=True)
	end_ids_list = get_bool_ids_greater_than(end_prob_concat, limit=position_prob, return_prob=True)

	input_ids = encoded_inputs["input_ids"].tolist()
	sentence_ids, probs = [], []
	for start_ids, end_ids, ids, offset_map in zip(start_ids_list, end_ids_list, input_ids, offset_maps):
	span_list = get_span(start_ids, end_ids, with_prob=True)
	sentence_id, prob = get_id_and_prob(span_list, offset_map.tolist())
	sentence_ids.append(sentence_id)
	probs.append(prob)

	results = self._convert_ids_to_results(short_inputs, sentence_ids, probs)
	results = self._auto_joiner(results, short_input_texts, input_mapping)
	return results

	def _auto_joiner(self, short_results, short_inputs, input_mapping):
	concat_results = []
	is_cls_task = False
	for short_result in short_results:
	if not short_result:
	continue
	elif 'start' not in short_result[0].keys() and 'end' not in short_result[0].keys():
	is_cls_task = True
	break
	else:
	break
	for k, vs in input_mapping.items():
	single_results = []
	if is_cls_task:
	cls_options = {}
	for v in vs:
	if len(short_results[v]) == 0:
	continue
	if short_results[v][0]['text'] in cls_options:
	cls_options[short_results[v][0]["text"]][0] += 1
	cls_options[short_results[v][0]["text"]][1] += short_results[v][0]["probability"]

	else:
	cls_options[short_results[v][0]["text"]] = [1, short_results[v][0]["probability"]]

	if cls_options:
	cls_res, cls_info = max(cls_options.items(), key=lambda x: x[1])
	concat_results.append(
	[
	{"text": cls_res, "probability": cls_info[1] / cls_info[0]}
	]
	)

	else:
	concat_results.append([])
	else:
	offset = 0
	for v in vs:
	if v == 0:
	single_results = short_results[v]
	offset += len(short_inputs[v])
	else:
	for i in range(len(short_results[v])):
	if "start" not in short_results[v][i] or 'end' not in short_results[v][i]:
	continue
	short_results[v][i]["start"] += offset
	short_results[v][i]["end"] += offset
	offset += len(short_inputs[v])
	single_results.extend(short_results[v])
	concat_results.append(single_results)
	return concat_results

	@classmethod
	def _build_tree(cls, schema, name="root"):
	"""
	Build the schema tree.
	"""
	schema_tree = SchemaTree(name)
	for s in schema:
	if isinstance(s, str):
	schema_tree.add_child(SchemaTree(s))
	elif isinstance(s, dict):
	for k, v in s.items():
	if isinstance(v, str):
	child = [v]
	elif isinstance(v, list):
	child = v
	else:
	raise TypeError(
	f"Invalid schema, value for each key:value pairs should be list or string"
	f"but {type(v)} received")
	schema_tree.add_child(cls._build_tree(child, name=k))
	else:
	raise TypeError(f"Invalid schema, element should be string or dict, but {type(s)} received")

	return schema_tree

	def start_multi_process_pool(self, target_devices: List[str] = None) -> Dict[
	Literal["input", "output", "processes"], Any]:
	"""启动多进程池，用多个独立进程进行预测
	如果要在多个GPU或CPU上进行预测，建议使用此方法，建议每个GPU只启动一个进程

	Args:
	target_devices (List[str], optional): PyTorch target devices, e.g. ["cuda:0", "cuda:1", ...],
	["npu:0", "npu:1", ...], or ["cpu", "cpu", "cpu", "cpu"]. If target_devices is None and CUDA/NPU
	is available, then all available CUDA/NPU devices will be used. If target_devices is None and
	CUDA/NPU is not available, then 4 CPU devices will be used.

	Returns:
	Dict[str, Any]: A dictionary with the target processes, an input queue, and an output queue.
	"""
	if target_devices is None:
	if torch.cuda.is_available():
	target_devices = ["cuda:{}".format(i) for i in range(torch.cuda.device_count())]
	elif is_torch_npu_available():
	target_devices = ["npu:{}".format(i) for i in range(torch.npu.device_count())]
	else:
	logger.info("CUDA/NPU is not available. Starting 4 CPU workers")
	target_devices = ["cpu"] * 4

	logger.info("Start multi-process pool on devices: {}".format(", ".join(map(str, target_devices))))

	self.to("cpu")
	self.share_memory()
	ctx = mp.get_context("spawn")
	input_queue = ctx.Queue()
	output_queue = ctx.Queue()
	processes = []

	for device_id in target_devices:
	p = ctx.Process(
	target=UIEDecoder._predict_multi_process_worker,
	args=(device_id, self, input_queue, output_queue),
	daemon=True,
	)
	p.start()
	processes.append(p)

	return {"input": input_queue, "output": output_queue, "processes": processes}

	@staticmethod
	def stop_multi_process_pool(pool: Dict[Literal["input", "output", "processes"], Any]) -> None:
	"""
	Stops all processes started with start_multi_process_pool.

	Args:
	pool (Dict[str, object]): A dictionary containing the input queue, output queue, and process list.

	Returns:
	None
	"""
	for p in pool["processes"]:
	p.terminate()

	for p in pool["processes"]:
	p.join()
	p.close()

	pool["input"].close()
	pool["output"].close()

	def predict_multi_process(
	self,
	tokenizer: "PreTrainedTokenizer",
	texts: List[str],
	pool: Dict[Literal["input", "output", "processes"], Any],
	batch_size: int = 64,
	max_length: int = 512,
	split_sentence: bool = False,
	language: Optional[str] = "zh",
	position_prob: float = 0.5,
	chunk_size: Optional[int] = None,
	) -> List[Any]:
	if chunk_size is None:
	chunk_size = min(math.ceil(len(texts) / len(pool["processes"]) / 10), 5000)

	logger.debug(f"Chunk data into {math.ceil(len(texts) / chunk_size)} packages of size {chunk_size}")

	input_queue = pool["input"]
	last_chunk_id = 0
	chunk = []

	for text in texts:
	chunk.append(text)
	if len(chunk) >= chunk_size:
	input_queue.put(
	[last_chunk_id, tokenizer, batch_size, chunk, max_length, split_sentence, language, position_prob]
	)
	last_chunk_id += 1
	chunk = []

	if len(chunk) > 0:
	input_queue.put(
	[last_chunk_id, tokenizer, batch_size, chunk, max_length, split_sentence, language, position_prob]
	)
	last_chunk_id += 1

	output_queue = pool["output"]
	results_list = sorted([output_queue.get() for _ in range(last_chunk_id)], key=lambda x: x[0])
	return sum([result[1] for result in results_list], [])

	@staticmethod
	def _predict_multi_process_worker(
	target_device: str, model: "UIEDecoder", input_queue: Queue, results_queue: Queue
	) -> None:
	"""
	Internal working process to predict in multi-process setup
	"""
	while True:
	try:
	chunk_id, tokenizer, batch_size, chunk, max_length, split_sentence, language, position_prob = (
	input_queue.get()
	)
	results = model.predict(
	tokenizer,
	chunk,
	batch_size=batch_size,
	max_length=max_length,
	split_sentence=split_sentence,
	language=language,
	show_progress_bar=False,
	device=target_device,
	)

	results_queue.put([chunk_id, results])
	except queue.Empty:
	break


	class SchemaTree(object):
	"""
	Implementation of SchemaTree
	"""

	def __init__(self, name='root', children=None):
	self.name = name
	self.children = []
	self.prefix = None
	self.parent_relations = None
	if children is not None:
	for child in children:
	self.add_child(child)

	def __repr__(self):
	return self.name

	def add_child(self, node):
	assert isinstance(
	node, SchemaTree
	), "The children of a node should be an instance of SchemaTree."
	self.children.append(node)