Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Sleeping

App Files Files Community

Multipurpose-AI-Agent-Development / recursive_token_chunker.py

devve1

Create recursive_token_chunker.py

a5dc975 verified 26 days ago

raw

history blame

No virus

18.7 kB


	# This script is adapted from the LangChain package, developed by LangChain AI.
	# Original code can be found at: https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/character.py
	# License: MIT License

	import re
	from enum import Enum
	from typing import Any, List, Optional

	from base_chunker import BaseChunker
	from fixed_token_chunker import TextSplitter

	class Language(str, Enum):
	    """Languages and markup formats that can be named when asking for separators.

	    Members also subclass ``str``, so each one compares equal to its
	    lowercase string value (for example ``Language.PYTHON == "python"``)
	    and can be looked up from that value with ``Language("python")``.
	    """

	    # Programming languages
	    CPP = "cpp"
	    GO = "go"
	    JAVA = "java"
	    KOTLIN = "kotlin"
	    JS = "js"
	    TS = "ts"
	    PHP = "php"
	    PROTO = "proto"
	    PYTHON = "python"
	    RST = "rst"
	    RUBY = "ruby"
	    RUST = "rust"
	    SCALA = "scala"
	    SWIFT = "swift"
	    # Markup and document formats
	    MARKDOWN = "markdown"
	    LATEX = "latex"
	    HTML = "html"
	    # More programming languages
	    SOL = "sol"
	    CSHARP = "csharp"
	    COBOL = "cobol"
	    C = "c"
	    LUA = "lua"
	    PERL = "perl"

	def _split_text_with_regex(
	text: str, separator: str, keep_separator: bool
	) -> List[str]:
	# Now that we have the separator, split the text
	if separator:
	if keep_separator:
	# The parentheses in the pattern keep the delimiters in the result.
	_splits = re.split(f"({separator})", text)
	splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
	if len(_splits) % 2 == 0:
	splits += _splits[-1:]
	splits = [_splits[0]] + splits
	else:
	splits = re.split(separator, text)
	else:
	splits = list(text)
	return [s for s in splits if s != ""]

	class RecursiveTokenChunker(TextSplitter):
	    """Split text by recursively trying a prioritized list of separators.

	    The first separator that occurs in the text is used to cut it into
	    pieces. Pieces that are still too long are split again using only the
	    remaining, lower-priority separators; shorter pieces are handed to the
	    inherited ``_merge_splits`` to be combined into chunks.
	    """

	    def __init__(
	        self,
	        chunk_size: int = 4000,
	        chunk_overlap: int = 200,
	        separators: Optional[List[str]] = None,
	        keep_separator: bool = True,
	        is_separator_regex: bool = False,
	        **kwargs: Any,
	    ) -> None:
	        """Create a new chunker.

	        Args:
	            chunk_size: Forwarded to the base class. Pieces whose measured
	                length is not below this value are split further.
	            chunk_overlap: Forwarded to the base class unchanged.
	            separators: Separators in priority order. ``None`` or an empty
	                list selects the default: blank line, newline, ``.``,
	                ``?``, ``!``, space, and finally the empty string (split
	                into single characters).
	            keep_separator: Forwarded to the base class; when true the
	                matched separator stays attached to the following piece.
	            is_separator_regex: When false (the default) every separator
	                is escaped and matched literally; when true separators are
	                used as regular expressions as-is.
	            **kwargs: Any further options for the base class.
	        """
	        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, keep_separator=keep_separator, **kwargs)
	        self._separators = separators or ["\n\n", "\n", ".", "?", "!", " ", ""]
	        self._is_separator_regex = is_separator_regex

	    def _split_text(self, text: str, separators: List[str]) -> List[str]:
	        """Split ``text`` with the first applicable separator and recurse.

	        Args:
	            text: The text to split.
	            separators: Candidate separators, highest priority first.

	        Returns:
	            The resulting chunks in document order.
	        """
	        final_chunks = []
	        # Fall back to the last separator if none of them occurs in the text.
	        separator = separators[-1]
	        new_separators = []
	        for i, _s in enumerate(separators):
	            _separator = _s if self._is_separator_regex else re.escape(_s)
	            if _s == "":
	                # The empty separator always applies (per-character split)
	                # and leaves nothing finer to recurse with.
	                separator = _s
	                break
	            if re.search(_separator, text):
	                separator = _s
	                # Only lower-priority separators are tried on the pieces.
	                new_separators = separators[i + 1 :]
	                break

	        _separator = separator if self._is_separator_regex else re.escape(separator)
	        splits = _split_text_with_regex(text, _separator, self._keep_separator)

	        # Now go merging things, recursively splitting longer texts.
	        _good_splits = []
	        # When separators were kept they are already part of each piece, so
	        # nothing must be re-inserted while merging.
	        _separator = "" if self._keep_separator else separator
	        for s in splits:
	            if self._length_function(s) < self._chunk_size:
	                _good_splits.append(s)
	            else:
	                # Flush the short pieces collected so far before handling
	                # the oversized one, so the output stays in order.
	                if _good_splits:
	                    merged_text = self._merge_splits(_good_splits, _separator)
	                    final_chunks.extend(merged_text)
	                    _good_splits = []
	                if not new_separators:
	                    # Nothing finer to split on: emit the piece as-is even
	                    # though it is not below the chunk size.
	                    final_chunks.append(s)
	                else:
	                    other_info = self._split_text(s, new_separators)
	                    final_chunks.extend(other_info)
	        if _good_splits:
	            merged_text = self._merge_splits(_good_splits, _separator)
	            final_chunks.extend(merged_text)
	        return final_chunks

	    def split_text(self, text: str) -> List[str]:
	        """Split ``text`` into chunks using the configured separators."""
	        return self._split_text(text, self._separators)

	    # NOTE: the alternate constructor below is disabled in this adaptation.
	    # @classmethod
	    # def from_language(
	    # cls, language: Language, **kwargs: Any
	    # ) -> RecursiveTokenChunker:
	    # separators = cls.get_separators_for_language(language)
	    # return cls(separators=separators, is_separator_regex=True, **kwargs)

	    @staticmethod
	    def get_separators_for_language(language: Language) -> List[str]:
	        """Return the separator list for ``language``, coarsest first.

	        Several entries contain regular-expression syntax (for example
	        ``"\\n=+\\n"`` for reStructuredText), so the list is meant to be
	        used with ``is_separator_regex=True``.

	        Args:
	            language: The language or markup format of the text.

	        Returns:
	            Separators ordered from structural boundaries (classes,
	            functions, sections) down to the empty string.

	        Raises:
	            ValueError: If ``language`` has no separator list.
	        """
	        if language == Language.C or language == Language.CPP:
	            return [
	                # Split along class definitions
	                "\nclass ",
	                # Split along function definitions
	                "\nvoid ",
	                "\nint ",
	                "\nfloat ",
	                "\ndouble ",
	                # Split along control flow statements
	                "\nif ",
	                "\nfor ",
	                "\nwhile ",
	                "\nswitch ",
	                "\ncase ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.GO:
	            return [
	                # Split along function definitions
	                "\nfunc ",
	                "\nvar ",
	                "\nconst ",
	                "\ntype ",
	                # Split along control flow statements
	                "\nif ",
	                "\nfor ",
	                "\nswitch ",
	                "\ncase ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.JAVA:
	            return [
	                # Split along class definitions
	                "\nclass ",
	                # Split along method definitions
	                "\npublic ",
	                "\nprotected ",
	                "\nprivate ",
	                "\nstatic ",
	                # Split along control flow statements
	                "\nif ",
	                "\nfor ",
	                "\nwhile ",
	                "\nswitch ",
	                "\ncase ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.KOTLIN:
	            return [
	                # Split along class definitions
	                "\nclass ",
	                # Split along method definitions
	                "\npublic ",
	                "\nprotected ",
	                "\nprivate ",
	                "\ninternal ",
	                "\ncompanion ",
	                "\nfun ",
	                "\nval ",
	                "\nvar ",
	                # Split along control flow statements
	                "\nif ",
	                "\nfor ",
	                "\nwhile ",
	                "\nwhen ",
	                "\ncase ",
	                "\nelse ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.JS:
	            return [
	                # Split along function definitions
	                "\nfunction ",
	                "\nconst ",
	                "\nlet ",
	                "\nvar ",
	                "\nclass ",
	                # Split along control flow statements
	                "\nif ",
	                "\nfor ",
	                "\nwhile ",
	                "\nswitch ",
	                "\ncase ",
	                "\ndefault ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.TS:
	            return [
	                "\nenum ",
	                "\ninterface ",
	                "\nnamespace ",
	                "\ntype ",
	                # Split along class definitions
	                "\nclass ",
	                # Split along function definitions
	                "\nfunction ",
	                "\nconst ",
	                "\nlet ",
	                "\nvar ",
	                # Split along control flow statements
	                "\nif ",
	                "\nfor ",
	                "\nwhile ",
	                "\nswitch ",
	                "\ncase ",
	                "\ndefault ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.PHP:
	            return [
	                # Split along function definitions
	                "\nfunction ",
	                # Split along class definitions
	                "\nclass ",
	                # Split along control flow statements
	                "\nif ",
	                "\nforeach ",
	                "\nwhile ",
	                "\ndo ",
	                "\nswitch ",
	                "\ncase ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.PROTO:
	            return [
	                # Split along message definitions
	                "\nmessage ",
	                # Split along service definitions
	                "\nservice ",
	                # Split along enum definitions
	                "\nenum ",
	                # Split along option definitions
	                "\noption ",
	                # Split along import statements
	                "\nimport ",
	                # Split along syntax declarations
	                "\nsyntax ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.PYTHON:
	            return [
	                # First, try to split along class definitions
	                "\nclass ",
	                "\ndef ",
	                "\n\tdef ",
	                # Now split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.RST:
	            return [
	                # Split along section titles
	                "\n=+\n",
	                "\n-+\n",
	                "\n\\*+\n",
	                # Split along directive markers
	                "\n\n.. *\n\n",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.RUBY:
	            return [
	                # Split along method definitions
	                "\ndef ",
	                "\nclass ",
	                # Split along control flow statements
	                "\nif ",
	                "\nunless ",
	                "\nwhile ",
	                "\nfor ",
	                "\ndo ",
	                "\nbegin ",
	                "\nrescue ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.RUST:
	            return [
	                # Split along function definitions
	                "\nfn ",
	                "\nconst ",
	                "\nlet ",
	                # Split along control flow statements
	                "\nif ",
	                "\nwhile ",
	                "\nfor ",
	                "\nloop ",
	                "\nmatch ",
	                "\nconst ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.SCALA:
	            return [
	                # Split along class definitions
	                "\nclass ",
	                "\nobject ",
	                # Split along method definitions
	                "\ndef ",
	                "\nval ",
	                "\nvar ",
	                # Split along control flow statements
	                "\nif ",
	                "\nfor ",
	                "\nwhile ",
	                "\nmatch ",
	                "\ncase ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.SWIFT:
	            return [
	                # Split along function definitions
	                "\nfunc ",
	                # Split along class definitions
	                "\nclass ",
	                "\nstruct ",
	                "\nenum ",
	                # Split along control flow statements
	                "\nif ",
	                "\nfor ",
	                "\nwhile ",
	                "\ndo ",
	                "\nswitch ",
	                "\ncase ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.MARKDOWN:
	            return [
	                # First, try to split along ATX headings (levels 1 to 6)
	                "\n#{1,6} ",
	                # The underlined (setext) heading syntax, e.g.
	                #   Heading level 2
	                #   ---------------
	                # is not handled here.
	                # End of code block
	                "```\n",
	                # Horizontal lines made of three or more *, - or _
	                "\n\\*\\*\\*+\n",
	                "\n---+\n",
	                "\n___+\n",
	                # Horizontal lines whose characters are separated by spaces
	                # (such as "* * *") are not handled.
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.LATEX:
	            return [
	                # First, try to split along Latex sections
	                "\n\\\\chapter{",
	                "\n\\\\section{",
	                "\n\\\\subsection{",
	                "\n\\\\subsubsection{",
	                # Now split by environments
	                "\n\\\\begin{enumerate}",
	                "\n\\\\begin{itemize}",
	                "\n\\\\begin{description}",
	                "\n\\\\begin{list}",
	                "\n\\\\begin{quote}",
	                "\n\\\\begin{quotation}",
	                "\n\\\\begin{verse}",
	                "\n\\\\begin{verbatim}",
	                # Now split by math environments
	                "\n\\\\begin{align}",
	                "$$",
	                "$",
	                # Now split by the normal type of lines
	                " ",
	                "",
	            ]
	        elif language == Language.HTML:
	            return [
	                # First, try to split along HTML tags
	                "<body",
	                "<div",
	                "<p",
	                "<br",
	                "<li",
	                "<h1",
	                "<h2",
	                "<h3",
	                "<h4",
	                "<h5",
	                "<h6",
	                "<span",
	                "<table",
	                "<tr",
	                "<td",
	                "<th",
	                "<ul",
	                "<ol",
	                "<header",
	                "<footer",
	                "<nav",
	                # Head
	                "<head",
	                "<style",
	                "<script",
	                "<meta",
	                "<title",
	                "",
	            ]
	        elif language == Language.CSHARP:
	            return [
	                "\ninterface ",
	                "\nenum ",
	                "\nimplements ",
	                "\ndelegate ",
	                "\nevent ",
	                # Split along class definitions
	                "\nclass ",
	                "\nabstract ",
	                # Split along method definitions
	                "\npublic ",
	                "\nprotected ",
	                "\nprivate ",
	                "\nstatic ",
	                "\nreturn ",
	                # Split along control flow statements
	                "\nif ",
	                "\ncontinue ",
	                "\nfor ",
	                "\nforeach ",
	                "\nwhile ",
	                "\nswitch ",
	                "\nbreak ",
	                "\ncase ",
	                "\nelse ",
	                # Split by exceptions
	                "\ntry ",
	                "\nthrow ",
	                "\nfinally ",
	                "\ncatch ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.SOL:
	            return [
	                # Split along compiler information definitions
	                "\npragma ",
	                "\nusing ",
	                # Split along contract definitions
	                "\ncontract ",
	                "\ninterface ",
	                "\nlibrary ",
	                # Split along method definitions
	                "\nconstructor ",
	                "\ntype ",
	                "\nfunction ",
	                "\nevent ",
	                "\nmodifier ",
	                "\nerror ",
	                "\nstruct ",
	                "\nenum ",
	                # Split along control flow statements
	                "\nif ",
	                "\nfor ",
	                "\nwhile ",
	                "\ndo while ",
	                "\nassembly ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.COBOL:
	            return [
	                # Split along divisions
	                "\nIDENTIFICATION DIVISION.",
	                "\nENVIRONMENT DIVISION.",
	                "\nDATA DIVISION.",
	                "\nPROCEDURE DIVISION.",
	                # Split along sections within DATA DIVISION
	                "\nWORKING-STORAGE SECTION.",
	                "\nLINKAGE SECTION.",
	                "\nFILE SECTION.",
	                # Split along sections within PROCEDURE DIVISION
	                "\nINPUT-OUTPUT SECTION.",
	                # Split along paragraphs and common statements
	                "\nOPEN ",
	                "\nCLOSE ",
	                "\nREAD ",
	                "\nWRITE ",
	                "\nIF ",
	                "\nELSE ",
	                "\nMOVE ",
	                "\nPERFORM ",
	                "\nUNTIL ",
	                "\nVARYING ",
	                "\nACCEPT ",
	                "\nDISPLAY ",
	                "\nSTOP RUN.",
	                # Split by the normal type of lines
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.LUA:
	            return [
	                # Split along variable and table definitions
	                "\nlocal ",
	                # Split along function definitions
	                "\nfunction ",
	                # Split along control flow statements
	                "\nif ",
	                "\nfor ",
	                "\nwhile ",
	                "\nrepeat ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]
	        elif language == Language.PERL:
	            return [
	                # Split along subroutine definitions
	                "\nsub ",
	                # Split along control flow statements
	                "\nif ",
	                "\nunless ",
	                "\nwhile ",
	                "\nfor ",
	                "\nforeach ",
	                "\nuntil ",
	                # Split by the normal type of lines
	                "\n\n",
	                "\n",
	                " ",
	                "",
	            ]

	        else:
	            raise ValueError(
	                f"Language {language} is not supported! "
	                f"Please choose from {list(Language)}"
	            )