austindavis committed
Commit 28f21ff · verified · 1 Parent(s): bb83eaf

Update agents/uci_tokenizers.py

Files changed (1):
  1. agents/uci_tokenizers.py +133 -89
agents/uci_tokenizers.py CHANGED
@@ -1,40 +1,40 @@
 from typing import List
 
 import chess
-import tiktoken
+
+# import tiktoken
 import tokenizers
 from tokenizers import models, pre_tokenizers, processors
 from torch import Tensor as TT
 from transformers import PreTrainedTokenizerFast
 from transformers.tokenization_utils_fast import BatchEncoding
 
-
-def getTiktokenizer() -> tiktoken.Encoding:
-    """
-    Defines a tiktoken-based BPE encoder for UCI chess moves. This
-    tokenizer effectively tokenizes UCI moves by the square names.
-    One notable variation is that promotions must be in upper-case.
-
-    Vocabulary:
-        Special Tokens (4): "\<|pad|\>", "\<|startoftext|\>", "\<|endoftext|\>", "\<|unknown|\>"
-        Square Tokens (64): a1 through h8
-        Promote Tokens (4): Q, B, R, N
-        UNUSED (8120): Need 8192-4-64-4=8120 unused tokens of the form <|unused####|>
-    """
-    special_tokens = ["<|pad|>", "<|startoftext|>", "<|endoftext|>", "<|unknown|>"]
-    unused_tokens = [f"<|unused{i:04d}|>" for i in range(8120)]
-    chess_vocab = special_tokens + chess.SQUARE_NAMES + list("QBRN") + unused_tokens
-    mergeable_ranks = {k.encode():v for (v,k) in enumerate(chess_vocab)}
-    chess_pat_str = r'[a-h][1-8]|[QBRN]'
-
-    enc = tiktoken.Encoding(
-        name="chess_enc",
-        pat_str=chess_pat_str, # or \d|\s
-        mergeable_ranks=mergeable_ranks,
-        special_tokens={k:v for (v,k) in enumerate(special_tokens)},
-    )
-
-    return enc
+# def getTiktokenizer() -> tiktoken.Encoding:
+#     """
+#     Defines a tiktoken-based BPE encoder for UCI chess moves. This
+#     tokenizer effectively tokenizes UCI moves by the square names.
+#     One notable variation is that promotions must be in upper-case.
+
+#     Vocabulary:
+#         Special Tokens (4): "\<|pad|\>", "\<|startoftext|\>", "\<|endoftext|\>", "\<|unknown|\>"
+#         Square Tokens (64): a1 through h8
+#         Promote Tokens (4): Q, B, R, N
+#         UNUSED (8120): Need 8192-4-64-4=8120 unused tokens of the form <|unused####|>
+#     """
+#     special_tokens = ["<|pad|>", "<|startoftext|>", "<|endoftext|>", "<|unknown|>"]
+#     unused_tokens = [f"<|unused{i:04d}|>" for i in range(8120)]
+#     chess_vocab = special_tokens + chess.SQUARE_NAMES + list("QBRN") + unused_tokens
+#     mergeable_ranks = {k.encode():v for (v,k) in enumerate(chess_vocab)}
+#     chess_pat_str = r'[a-h][1-8]|[QBRN]'
+
+#     enc = tiktoken.Encoding(
+#         name="chess_enc",
+#         pat_str=chess_pat_str, # or \d|\s
+#         mergeable_ranks=mergeable_ranks,
+#         special_tokens={k:v for (v,k) in enumerate(special_tokens)},
+#     )
+
+#     return enc
 
 
 class UciTokenizer(PreTrainedTokenizerFast):
@@ -42,7 +42,6 @@ class UciTokenizer(PreTrainedTokenizerFast):
     _UNK_TOKEN: str
     _EOS_TOKEN: str
    _BOS_TOKEN: str
-
 
     stoi: dict[str, int]
    """String to Integer mapping"""
@@ -59,11 +58,11 @@ class UciTokenizer(PreTrainedTokenizerFast):
         bos_token,
         eos_token,
         name_or_path,
-        **kwargs
+        **kwargs,
     ):
         self.stoi = stoi
         self.itos = itos
-
+
         self._PAD_TOKEN = pad_token
         self._UNK_TOKEN = unk_token
         self._EOS_TOKEN = eos_token
@@ -81,8 +80,8 @@ class UciTokenizer(PreTrainedTokenizerFast):
             pair=None,
             special_tokens=[(bos_token, 1)],
         )
-        slow_tokenizer.post_processor=post_proc
-
+        slow_tokenizer.post_processor = post_proc
+
         super().__init__(
             tokenizer_object=slow_tokenizer,
             unk_token=self._UNK_TOKEN,
@@ -90,7 +89,7 @@ class UciTokenizer(PreTrainedTokenizerFast):
             eos_token=self._EOS_TOKEN,
             pad_token=self._PAD_TOKEN,
             name_or_path=name_or_path,
-            **kwargs
+            **kwargs,
         )
 
         # Override the decode behavior to ensure spaces are correctly handled
@@ -108,47 +107,48 @@ class UciTokenizer(PreTrainedTokenizerFast):
 
             if isinstance(token_ids, TT):
                 token_ids = token_ids.tolist()
-
+
             if isinstance(token_ids, list):
                 tokens_str = [self.itos.get(xi, self._UNK_TOKEN) for xi in token_ids]
                 processed_tokens = self._process_str_tokens(tokens_str)
 
                 return " ".join(processed_tokens)
-
-            raise ValueError(f"Unknown input type to decode() for argument 'token_ids'. Received: {type(token_ids)} ")
+
+            raise ValueError(
+                f"Unknown input type to decode() for argument 'token_ids'. Received: {type(token_ids)} "
+            )
 
         self._decode = _decode
 
     def _init_pretokenizer(self) -> pre_tokenizers.PreTokenizer:
         raise NotImplementedError
 
-    def _process_str_tokens(self, tokens_str: list[str], return_player_ids: bool) -> list[str]:
+    def _process_str_tokens(
+        self, tokens_str: list[str], return_player_ids: bool
+    ) -> list[str]:
         raise NotImplementedError
-
+
     def get_id2square_list() -> list[int]:
         raise NotImplementedError
 
 
 class UciTileTokenizer(UciTokenizer):
-    """ Uci tokenizer converting start/end tiles and promotion types each into individual tokens"""
+    """UCI tokenizer converting start/end tiles and promotion types each into individual tokens"""
 
-    SPECIAL_TOKENS = ["<|pad|>", "<|startoftext|>", "<|endoftext|>", "<|unknown|>"]
+    SPECIAL_TOKENS = (_PAD_TOKEN, _BOS_TOKEN, _EOS_TOKEN, _UNK_TOKEN) = [
+        "<|pad|>",
+        "<|startoftext|>",
+        "<|endoftext|>",
+        "<|unknown|>",
+    ]
 
-    stoi = {
-        tok: idx
-        for tok, idx in list(
-            zip(SPECIAL_TOKENS + chess.SQUARE_NAMES + list("QRBN"), range(72))
-        )
-    }
+    stoi: dict[str, int]
+    itos: dict[int, str]
 
-    itos = {
-        idx: tok
-        for tok, idx in list(
-            zip(SPECIAL_TOKENS + chess.SQUARE_NAMES + list("QRBN"), range(72))
-        )
-    }
+    _split_regex: str
+    _promote_chars: str
 
-    id2square:List[int] = list(range(4,68))
+    id2square: List[int] = list(range(4, 68))
     """
     List mapping token IDs to squares on the chess board. Order is file then rank, i.e.:
     `A1, B1, C1, ..., F8, G8, H8`
@@ -157,29 +157,63 @@ class UciTileTokenizer(UciTokenizer):
     def get_id2square_list(self) -> List[int]:
         return self.id2square
 
-    def __init__(self, **kwargs):
-        # Remove conflicting arguments from kwargs if they exist
+    def __init__(self, *, upper_promotions: bool = True, **kwargs):
+        # Remove conflicting arguments from kwargs if they exist
         kwargs.pop("pad_token", None)
         kwargs.pop("unk_token", None)
         kwargs.pop("bos_token", None)
         kwargs.pop("eos_token", None)
         kwargs.pop("clean_up_tokenization_spaces", None)
         kwargs.pop("name_or_path", None)
+
+        self.upper_promotions = upper_promotions
+
+        if upper_promotions:
+            self._promote_chars = "QRBN"
+            self._split_regex = r"[a-h][1-8]|[QRBN]"
+        else:
+            self._promote_chars = "qrbn"
+            self._split_regex = r"[a-h][1-8]|[qrnb]"
+
+        self.stoi = {
+            tok: idx
+            for tok, idx in list(
+                zip(
+                    self.SPECIAL_TOKENS
+                    + chess.SQUARE_NAMES
+                    + list(self._promote_chars),
+                    range(72),
+                )
+            )
+        }
+
+        self.itos = {
+            idx: tok
+            for tok, idx in list(
+                zip(
+                    self.SPECIAL_TOKENS
+                    + chess.SQUARE_NAMES
+                    + list(self._promote_chars),
+                    range(72),
+                )
+            )
+        }
+
         super().__init__(
             self.stoi,
             self.itos,
-            pad_token="<|pad|>",
-            unk_token="<|unknown|>",
-            bos_token="<|startoftext|>",
-            eos_token="<|endoftext|>",
+            pad_token=self._PAD_TOKEN,
+            unk_token=self._UNK_TOKEN,
+            bos_token=self._BOS_TOKEN,
+            eos_token=self._EOS_TOKEN,
             name_or_path="austindavis/uci_tile_tokenizer",
             clean_up_tokenization_spaces=False,
-            **kwargs
+            **kwargs,
         )
 
     def _init_pretokenizer(self):
         # Pre-tokenizer to split input into UCI moves
-        pattern = tokenizers.Regex(r"\d|[QBRN]")
+        pattern = tokenizers.Regex(self._split_regex)
         pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.Whitespace(),
@@ -214,16 +248,16 @@ class UciTileTokenizer(UciTokenizer):
         return moves
 
     @staticmethod
-    def compute_players(encoding: BatchEncoding, according_to='output'):
+    def compute_players(encoding: BatchEncoding, according_to="output"):
         """
-        Determines which player (white=True, black=False) is associated with each token in the sequence.
+        Determines which player (white=True, black=False) is associated with each token in the sequence.
         This method works based on chess move sequences tokenized using the UciTileTokenizer.
 
         # Parameters:
         ----------
         **`encoding`** : BatchEncoding
             Tokenized input of a chess game, where each token represents a move or special token.
-
+
         **`according_to`** : str (optional, default='output')
             Specifies the perspective for associating players:
             - 'output': Returns the player whose next move is predicted by the sequence (the output move).
@@ -233,12 +267,12 @@ class UciTileTokenizer(UciTokenizer):
         -------
         List[bool]
             A list of boolean values indicating the player for each token:
-            - True for white (player 1),
+            - True for white (player 1),
             - False for black (player 2).
-
+
             The list length corresponds to the number of tokens in the sequence, including special tokens if any.
 
-        # Example Usage:
+        # Example Usage:
         ```
         >>> tok = UciTileTokenizer()
        >>> encoding = tok('e2e4 d7d5 e4d5 e7e6 d5e6 d8g5 e6e7 g5f6 e7f8Q')
@@ -246,7 +280,7 @@ class UciTileTokenizer(UciTokenizer):
         [1, 16, 32, 55, 39, 32, 39, 56, 48, 39, 48, 63, 42, 48, 56, 42, 49, 56, 65, 68]
         >>> tok.compute_players(encoding)
         [True, True, False, False, True, True, False, False, True, True, False, False, True, True, False, False, True, True, True, False]
-        >>> tok.compute_players(encoding, according_to='input')
+        >>> tok.compute_players(encoding, according_to='input')
         [True, True, True, False, False, True, True, False, False, True, True, False, False, True, True, False, False, True, True, True]
         ```
 
@@ -256,29 +290,30 @@ class UciTileTokenizer(UciTokenizer):
         using `according_to='output'`, it cannot reliably predict which player is
         responsible for selecting the final token of the sequence. For instance,
         if a pawn is moved to the back rank (e.g., 'e7e8'), then white must select
-        the promotion class on the next token; however, this algorithm will predict
-        that black is responsible for selecting the next token instead of white.
+        the promotion class on the next token; however, this algorithm will predict
+        that black is responsible for selecting the next token instead of white.
         """
-
-        return [UciTileTokenizer._compute_players_single(encoding[i].ids) for i in range(len(encoding['input_ids']))]
-
-
-
+
+        return [
+            UciTileTokenizer._compute_players_single(encoding[i].ids, according_to)
+            for i in range(len(encoding["input_ids"]))
+        ]
+
     @staticmethod
-    def _compute_players_single(input_ids: list[int], according_to: str='output'):
+    def _compute_players_single(input_ids: list[int], according_to: str = "output"):
         players = [] if according_to == "output" else [True]
         current_player = False
         num_tokens_in_ply = 0
         has_specials = False
-
+
         for i, token_id in enumerate(input_ids):
             if token_id == 1:
                 has_specials = True
                 continue
-
+
             if num_tokens_in_ply == 0:
                 # check if promotion OR unknown token ID
-                if token_id > 67 or token_id == 3:
+                if token_id > 67 or token_id == 3:
                     players.append(current_player)
                     num_tokens_in_ply = 0
                 else:
@@ -304,17 +339,26 @@ class UciTileTokenizer(UciTokenizer):
 
         return players if has_specials else players[1:]
 
+
 if __name__ == "__main__":
     tok = UciTileTokenizer()
-    encoding = tok('e2e4Q b7b8N e2e7 a1',add_special_tokens=True)
-    print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='output')=}")
-    print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='input')=}")
+    encoding = tok("e2e4Q b7b8N e2e7 a1", add_special_tokens=True)
+    print(
+        f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='output')=}"
+    )
+    print(
+        f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='input')=}"
+    )
 
-    encoding = tok('e2e4Q b7b8N e2e7 a1',add_special_tokens=False)
-    print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='output')=}")
-    print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='input')=}")
+    encoding = tok("e2e4Q b7b8N e2e7 a1", add_special_tokens=False)
+    print(
+        f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='output')=}"
+    )
+    print(
+        f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='input')=}"
+    )
 
-    encoding = tok('e2e4 d7d5 e4d5 e7e6 d5e6 d8g5 e6e7 g5f6 e7f8Q')
-    print(encoding['input_ids'])
+    encoding = tok("e2e4 d7d5 e4d5 e7e6 d5e6 d8g5 e6e7 g5f6 e7f8Q")
+    print(encoding["input_ids"])
     print(tok.compute_players(encoding))
-    print(tok.compute_players(encoding, according_to='input'))
+    print(tok.compute_players(encoding, according_to="input"))
 
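The new `__init__` builds the same 72-token vocabulary the class previously stored in static dict literals: IDs 0-3 are the special tokens, 4-67 the squares (hence `id2square = list(range(4, 68))`), and 68-71 the promotion pieces (hence the `token_id > 67` promotion check in `_compute_players_single`). A minimal sketch, not part of the commit, that reconstructs the layout and checks those boundaries (`PROMOTE_CHARS` assumes the `upper_promotions=True` branch):

```
# Standalone sketch: rebuilds the UciTileTokenizer vocabulary to make the
# token-ID ranges used throughout the file explicit.
import chess  # python-chess; SQUARE_NAMES is a1, b1, ..., h8 (file then rank)

SPECIAL_TOKENS = ["<|pad|>", "<|startoftext|>", "<|endoftext|>", "<|unknown|>"]
PROMOTE_CHARS = "QRBN"  # assumption: the upper_promotions=True branch

vocab = SPECIAL_TOKENS + chess.SQUARE_NAMES + list(PROMOTE_CHARS)
stoi = {tok: idx for idx, tok in enumerate(vocab)}

assert len(vocab) == 72
assert stoi["<|startoftext|>"] == 1          # the BOS check in _compute_players_single
assert stoi["<|unknown|>"] == 3              # the token_id == 3 unknown check
assert stoi["a1"] == 4 and stoi["h8"] == 67  # id2square = list(range(4, 68))
assert stoi["Q"] == 68                       # ids above 67 are promotion pieces
```

With `upper_promotions=False` the last four entries become `q`, `r`, `b`, `n`; the rest of the layout is unchanged.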
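A usage sketch of the updated class, also not part of the commit: the import path `agents.uci_tokenizers` is an assumption, and the expected IDs follow from the vocabulary layout above and agree with the docstring example.

```
# Hypothetical usage of the updated tokenizer; module path is assumed.
from agents.uci_tokenizers import UciTileTokenizer

tok = UciTileTokenizer(upper_promotions=True)

# Each UCI move splits into start tile, end tile, and an optional promotion
# piece, so "e2e4 e7e8Q" should yield BOS plus five move tokens.
enc = tok("e2e4 e7e8Q", add_special_tokens=True)
print(enc["input_ids"])  # expected: [1, 16, 32, 56, 64, 68]

# Decoding joins tiles back into moves via _process_str_tokens.
print(tok.decode(enc["input_ids"]))

# Per-token player attribution, as documented in compute_players.
print(tok.compute_players(enc, according_to="output"))
```

Passing `upper_promotions=False` instead switches both the vocabulary and the pre-tokenizer split regex to lowercase promotion pieces, which matches standard UCI notation (e.g. `e7e8q`).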