shivendrra committed
Commit: 484d56b
Parent(s): 82ea75a

added model files

- enigma/EnBERT.py +181 -0
- enigma/TrainEnigma.ipynb +470 -0
- enigma/config_enigma.json +13 -0
- enigma/enigma.cpp +364 -0
- enigma/generate.py +126 -0
- enigma/model.py +388 -0
- enigma/run.py +100 -0
enigma/EnBERT.py
ADDED
@@ -0,0 +1,181 @@
"""
this isn't a BERT-based model, i just liked the name and named it

--> decoder-only model, uses RMS normalization and GELU activation function
--> one masked-attention layer and one unmasked
--> attention layers have relative positional embeddings
"""

import json

import torch
import torch.nn as nn
from torch.nn import functional as F

with open('config.json', 'r', encoding='utf-8') as file:
  params = json.load(file)

# required parameters
block_size = params['block_size']
d_model = params['d_model']
n_head = params['n_heads']
n_layers = params['n_layers']
learning_rate = params['learning_rate']
dropout = params['dropout']
norm_eps = params['norm_eps']

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class RMSNorm(nn.Module):
  def __init__(self, dim: int, eps: float = 1e-6):
    super().__init__()
    self.eps = eps
    self.weight = nn.Parameter(torch.ones(dim))

  def _norm(self, x):
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

  def forward(self, x):
    output = self._norm(x.float()).type_as(x)
    return output * self.weight

class SingleHead(nn.Module):
  def __init__(self,
               head_size: int,
               d_model: int,
               block_size: int,
               dropout: float):
    super().__init__()
    self.key = nn.Linear(d_model, head_size, bias=True)
    self.query = nn.Linear(d_model, head_size, bias=True)
    self.value = nn.Linear(d_model, head_size, bias=True)
    self.dropout = nn.Dropout(dropout)
    self.rel_pos_embd = nn.Parameter(torch.randn(block_size, block_size, head_size))
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x: torch.Tensor, mask: bool = False):
    B, T, C = x.shape
    key = self.key(x)
    query = self.query(x)
    # scaled dot-product attention: multiply by 1/sqrt(head_size)
    # (the original divided by head_size**-0.5, which scales the wrong way)
    scores = torch.matmul(query, key.transpose(-2, -1)) * (key.shape[-1] ** -0.5)

    # add the relative positional scores before masking, so that masked
    # positions stay at -inf (the original masked first, then added)
    rel_pos_scores = torch.einsum('btc,tvc->btv', query, self.rel_pos_embd[:T, :T])
    scores = scores + rel_pos_scores

    if mask:
      scores = scores.masked_fill(self.tril[:T, :T] == 0, float('-inf'))

    att_mat = F.softmax(scores, dim=-1)
    att_mat = self.dropout(att_mat)
    value = self.value(x)
    output = torch.matmul(att_mat, value)
    return output

class MultiHeadAttention(nn.Module):
  def __init__(self,
               d_model: int,
               block_size: int,
               n_head: int,
               dropout: float):
    super().__init__()
    head_size = d_model // n_head
    self.heads = nn.ModuleList([SingleHead(d_model=d_model, dropout=dropout, block_size=block_size, head_size=head_size) for _ in range(n_head)])
    self.projection = nn.Linear(d_model, d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x: torch.Tensor, mask: bool):
    out = torch.cat([h(x, mask) for h in self.heads], dim=-1)
    out = self.dropout(self.projection(out))
    return out

class FeedForward(nn.Module):
  def __init__(self, d_model, dropout):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(d_model, 5 * d_model),
      nn.GELU(),
      nn.Linear(5 * d_model, d_model),
      nn.Dropout(dropout),
    )

  def forward(self, x: torch.Tensor):
    return self.net(x)

class DecoderBlock(nn.Module):
  def __init__(self, d_model: int,
               block_size: int,
               n_head: int,
               norm_eps: float,
               dropout: float):
    super().__init__()
    self.self_att = MultiHeadAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)
    self.ffwd = FeedForward(d_model, dropout)
    self.dropout = nn.Dropout(dropout)
    self.norm = RMSNorm(d_model, eps=norm_eps)

  def forward(self, x: torch.Tensor):
    # pre-norm residual block: masked attention, unmasked attention, feed-forward
    x_out = self.self_att(self.norm(x), mask=True)
    x = x + self.dropout(x_out)

    # the original passed mask=False to self.norm by mistake; it belongs to self_att
    x_out = self.self_att(self.norm(x), mask=False)
    x = x + self.dropout(x_out)

    x_out = self.ffwd(self.norm(x))
    x = x + self.dropout(x_out)

    return x

class Transformer(nn.Module):
  def __init__(self, vocab_size: int):
    super().__init__()
    self.block_size = block_size
    self.token_embeddings = nn.Embedding(vocab_size, d_model)
    self.decoder = nn.Sequential(*[DecoderBlock(n_head=n_head, d_model=d_model, dropout=dropout, norm_eps=norm_eps, block_size=block_size) for _ in range(n_layers)])
    self.norm_final = RMSNorm(d_model, eps=norm_eps)
    self.linear_final = nn.Linear(d_model, vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.apply(self._init_weights)

  def _init_weights(self, module):
    if isinstance(module, nn.Linear):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    x = self.token_embeddings(idx)
    x = self.decoder(x)
    logits = self.linear_final(self.norm_final(x))

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad()
  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    self.eval()
    for _ in range(max_new_tokens):
      # crop the running context to the last block_size tokens
      idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
      logits, _ = self(idx_cond)
      logits = logits[:, -1, :] / temperature

      if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        logits[logits < v[:, [-1]]] = -float('Inf')

      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)

    return idx
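A minimal usage sketch for the Transformer defined above (not part of the commit). It assumes a config.json next to the script with the keys the module reads at import time (block_size, d_model, n_heads, n_layers, learning_rate, dropout, norm_eps), and uses a made-up vocab size of 1200:

import torch
from EnBERT import Transformer  # loads config.json at import time

model = Transformer(vocab_size=1200)
device = next(model.parameters()).device

# forward pass: (B, T) token ids -> logits, plus a loss when targets are given
idx = torch.randint(0, 1200, (2, 64), device=device)
logits, loss = model(idx, targets=idx)

# autoregressive sampling from a single-token prompt
prompt = idx[:1, :1]
out = model.generate(prompt, max_new_tokens=20, temperature=0.8, top_k=5)
print(out.shape)  # torch.Size([1, 21])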
enigma/TrainEnigma.ipynb
ADDED
@@ -0,0 +1,470 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vXIGN6PAuZWg"
      },
      "source": [
        "### Train file for enigma model\n",
        "\n",
        "- Contains K-mer tokenizer, k=4, can be changed though\n",
        "- Train data is available on huggingface repo: [hf/enigma-1.5b](https://huggingface.co/shivendrra/enigma-1.5b)\n",
        "- For now, training decoder-based model only\n",
        "- More about this on github repo: [github/enigma-1.5b](https://github.com/shivendrra/enigma-1.5b)\n",
        "- Saves model after training in '.pth' & '.safetensors' file for later use\n",
        "- Generate function works fine"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "WXpJBLyr30Rx"
      },
      "outputs": [],
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "r7WUm0VL4bN4"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "\n",
        "# importing the data\n",
        "file_path = '/content/drive/MyDrive/consolidated_dna.txt'\n",
        "with open(file_path, 'r', encoding='utf-8') as file:\n",
        "  dna_seq = file.read()\n",
        "\n",
        "print(f\"{(len(dna_seq)/1e6):.2f} million letters\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Cdhybhz9owTK"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "from tqdm import tqdm\n",
        "import json\n",
        "\n",
        "class KMerTokenizer:\n",
        "  def __init__(self, k_mers: int=4):\n",
        "    self.k_mers = k_mers\n",
        "    self.vocab = {}\n",
        "    self.id_to_token = []\n",
        "    self.token_to_id = {}\n",
        "\n",
        "  def tokenize_sequence(self, sequence):\n",
        "    kmers = [sequence[i:i+self.k_mers] for i in tqdm(range(0, len(sequence), self.k_mers), desc=\"tokenizing k-mers\")]\n",
        "    return kmers\n",
        "\n",
        "  def build_vocab(self, sequences):\n",
        "    all_kmers = []\n",
        "    for sequence in sequences:\n",
        "      all_kmers.extend(self.tokenize_sequence(sequence))\n",
        "    token_count = {}\n",
        "    for kmer in all_kmers:\n",
        "      if kmer in token_count:\n",
        "        token_count[kmer] += 1\n",
        "      else:\n",
        "        token_count[kmer] = 1\n",
        "    sorted_tokens = sorted(token_count.items(), key=lambda x: x[1], reverse=True)\n",
        "    for token, _ in sorted_tokens:\n",
        "      self.token_to_id[token] = len(self.token_to_id)\n",
        "      self.id_to_token.append(token)\n",
        "    self.vocab = self.token_to_id\n",
        "\n",
        "  def encode(self, sequence):\n",
        "    encoded_sequence = []\n",
        "    kmers = self.tokenize_sequence(sequence)\n",
        "    for kmer in tqdm(kmers, desc=\"encoding sequences\"):\n",
        "      if kmer in self.token_to_id:\n",
        "        encoded_sequence.append(self.token_to_id[kmer])\n",
        "      else:\n",
        "        encoded_sequence.append(len(self.vocab))\n",
        "    return encoded_sequence\n",
        "\n",
        "  def decode(self, encoded_sequence):\n",
        "    decoded_sequence = [self.id_to_token[token_id] for token_id in encoded_sequence]\n",
        "    return decoded_sequence\n",
        "\n",
        "  def save_model(self, model_path):\n",
        "    vocab_file = f\"{model_path}/base_{self.k_mers}k.json\"\n",
        "    with open(vocab_file, 'w') as f:\n",
        "      json.dump(self.vocab, f)\n",
        "\n",
        "  def load_model(self, path):\n",
        "    assert path.endswith('.json')\n",
        "    with open(path, 'r') as f:\n",
        "      vocab = json.load(f)\n",
        "\n",
        "    self.vocab = vocab\n",
        "    self.token_to_id = self.vocab\n",
        "    # rebuild the id -> token list so decode() works after loading\n",
        "    self.id_to_token = list(self.vocab.keys())\n",
        "    self.vocab_size = len(vocab)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6BCpjdi5rjU4"
      },
      "outputs": [],
      "source": [
        "token = KMerTokenizer()\n",
        "token.build_vocab([dna_seq])\n",
        "print(f\"vocab size: {len(token.vocab)}\")\n",
        "print(token.id_to_token[:10])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6Ou9txgmAdIB"
      },
      "outputs": [],
      "source": [
        "# Train and test splits\n",
        "data = torch.tensor(token.encode(dna_seq), dtype=torch.long)\n",
        "print(f\"{(len(data)/1e6):.1f} million tokens\")\n",
        "n = int(0.9*len(data)) # first 90% will be train, rest val\n",
        "train_data = data[:n]\n",
        "val_data = data[n:]\n",
        "print(f\"train data {(len(train_data)/1e6):.1f} million, val data {(len(val_data)/1e6):.1f} million\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ebFKQQ9NAq4e"
      },
      "outputs": [],
      "source": [
        "# hyperparams\n",
        "batch_size = 10\n",
        "block_size = 256\n",
        "max_iters = 5000\n",
        "eval_interval = 100\n",
        "learning_rate = 3e-5\n",
        "eval_iters = 100\n",
        "d_model = 512\n",
        "n_layers = 12\n",
        "n_head = 16  # must divide d_model evenly (the original value of 18 does not)\n",
        "dropout = 0.25\n",
        "norm_eps = 1e-5"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dZMiYkr37cmU"
      },
      "outputs": [],
      "source": [
        "import torch.nn as nn\n",
        "from torch.nn import functional as F\n",
        "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
        "\n",
        "class RMSNorm(nn.Module):\n",
        "  def __init__(self, dim: int, eps: float = 1e-6):\n",
        "    super().__init__()\n",
        "    self.eps = eps\n",
        "    self.weight = nn.Parameter(torch.ones(dim))\n",
        "\n",
        "  def _norm(self, x):\n",
        "    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)\n",
        "\n",
        "  def forward(self, x):\n",
        "    output = self._norm(x.float()).type_as(x)\n",
        "    return output * self.weight\n",
        "\n",
        "class SingleHead(nn.Module):\n",
        "  def __init__(self,\n",
        "               head_size: int,\n",
        "               d_model: int,\n",
        "               block_size: int,\n",
        "               dropout: float):\n",
        "    super().__init__()\n",
        "    self.key = nn.Linear(d_model, head_size, bias=True)\n",
        "    self.query = nn.Linear(d_model, head_size, bias=True)\n",
        "    self.value = nn.Linear(d_model, head_size, bias=True)\n",
        "    self.dropout = nn.Dropout(dropout)\n",
        "    self.rel_pos_embd = nn.Parameter(torch.randn(block_size, block_size, head_size))\n",
        "    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))\n",
        "\n",
        "  def forward(self, x: torch.Tensor, mask: bool = False):\n",
        "    B, T, C = x.shape\n",
        "    key = self.key(x)\n",
        "    query = self.query(x)\n",
        "    # scale by 1/sqrt(head_size); the original divided by head_size**-0.5\n",
        "    scores = torch.matmul(query, key.transpose(-2, -1)) * (key.shape[-1] ** -0.5)\n",
        "\n",
        "    # add relative positional scores before masking so masked positions stay -inf\n",
        "    rel_pos_scores = torch.einsum('btc,tvc->btv', query, self.rel_pos_embd[:T, :T])\n",
        "    scores = scores + rel_pos_scores\n",
        "\n",
        "    if mask:\n",
        "      scores = scores.masked_fill(self.tril[:T, :T] == 0, float('-inf'))\n",
        "\n",
        "    att_mat = F.softmax(scores, dim=-1)\n",
        "    att_mat = self.dropout(att_mat)\n",
        "    value = self.value(x)\n",
        "    output = torch.matmul(att_mat, value)\n",
        "    return output\n",
        "\n",
        "class MultiHeadAttention(nn.Module):\n",
        "  def __init__(self,\n",
        "               d_model: int,\n",
        "               block_size: int,\n",
        "               n_head: int,\n",
        "               dropout: float):\n",
        "    super().__init__()\n",
        "    head_size = d_model // n_head\n",
        "    self.heads = nn.ModuleList([SingleHead(d_model=d_model, dropout=dropout, block_size=block_size, head_size=head_size) for _ in range(n_head)])\n",
        "    self.projection = nn.Linear(d_model, d_model)\n",
        "    self.dropout = nn.Dropout(dropout)\n",
        "\n",
        "  def forward(self, x: torch.Tensor, mask: bool):\n",
        "    out = torch.cat([h(x, mask) for h in self.heads], dim=-1)\n",
        "    out = self.dropout(self.projection(out))\n",
        "    return out\n",
        "\n",
        "class FeedForward(nn.Module):\n",
        "  def __init__(self, d_model, dropout):\n",
        "    super().__init__()\n",
        "    self.net = nn.Sequential(\n",
        "      nn.Linear(d_model, 5 * d_model),\n",
        "      nn.GELU(),\n",
        "      nn.Linear(5 * d_model, d_model),\n",
        "      nn.Dropout(dropout),\n",
        "    )\n",
        "\n",
        "  def forward(self, x: torch.Tensor):\n",
        "    return self.net(x)\n",
        "\n",
        "class DecoderBlock(nn.Module):\n",
        "  def __init__(self, d_model: int,\n",
        "               block_size: int,\n",
        "               n_head: int,\n",
        "               norm_eps: float,\n",
        "               dropout: float):\n",
        "    super().__init__()\n",
        "    self.self_att = MultiHeadAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)\n",
        "    self.ffwd = FeedForward(d_model, dropout)\n",
        "    self.dropout = nn.Dropout(dropout)\n",
        "    self.norm = RMSNorm(d_model, eps=norm_eps)\n",
        "\n",
        "  def forward(self, x: torch.Tensor):\n",
        "    x_out = self.self_att(self.norm(x), mask=True)\n",
        "    x = x + self.dropout(x_out)\n",
        "\n",
        "    # the original passed mask=False to self.norm by mistake\n",
        "    x_out = self.self_att(self.norm(x), mask=False)\n",
        "    x = x + self.dropout(x_out)\n",
        "\n",
        "    x_out = self.ffwd(self.norm(x))\n",
        "    x = x + self.dropout(x_out)\n",
        "\n",
        "    return x\n",
        "\n",
        "class Transformer(nn.Module):\n",
        "  def __init__(self, vocab_size: int):\n",
        "    super().__init__()\n",
        "    self.block_size = block_size\n",
        "    self.token_embeddings = nn.Embedding(vocab_size, d_model)\n",
        "    self.decoder = nn.Sequential(*[DecoderBlock(n_head=n_head, d_model=d_model, dropout=dropout, norm_eps=norm_eps, block_size=block_size) for _ in range(n_layers)])\n",
        "    self.norm_final = RMSNorm(d_model, eps=norm_eps)\n",
        "    self.linear_final = nn.Linear(d_model, vocab_size)\n",
        "    self.dropout = nn.Dropout(dropout)\n",
        "    self.apply(self._init_weights)\n",
        "\n",
        "  def _init_weights(self, module):\n",
        "    if isinstance(module, nn.Linear):\n",
        "      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n",
        "      if module.bias is not None:\n",
        "        torch.nn.init.zeros_(module.bias)\n",
        "    elif isinstance(module, nn.Embedding):\n",
        "      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n",
        "\n",
        "  def forward(self, idx, targets=None):\n",
        "    B, T = idx.shape\n",
        "    x = self.token_embeddings(idx)\n",
        "    x = self.decoder(x)\n",
        "    logits = self.linear_final(self.norm_final(x))\n",
        "\n",
        "    if targets is None:\n",
        "      loss = None\n",
        "    else:\n",
        "      B, T, C = logits.shape\n",
        "      logits = logits.view(B*T, C)\n",
        "      targets = targets.view(B*T)\n",
        "      loss = F.cross_entropy(logits, targets)\n",
        "\n",
        "    return logits, loss\n",
        "\n",
        "  @torch.no_grad()\n",
        "  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):\n",
        "    self.eval()\n",
        "    for _ in range(max_new_tokens):\n",
        "      idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]\n",
        "      logits, _ = self(idx_cond)\n",
        "      logits = logits[:, -1, :] / temperature\n",
        "\n",
        "      if top_k is not None:\n",
        "        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))\n",
        "        logits[logits < v[:, [-1]]] = -float('Inf')\n",
        "\n",
        "      probs = F.softmax(logits, dim=-1)\n",
        "      idx_next = torch.multinomial(probs, num_samples=1)\n",
        "      idx = torch.cat((idx, idx_next), dim=1)\n",
        "\n",
        "    return idx"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "X9VOBZFr7g3W"
      },
      "outputs": [],
      "source": [
        "import timeit\n",
        "start_time = timeit.default_timer()\n",
        "\n",
        "def get_batch(split):\n",
        "  data = train_data if split == 'train' else val_data\n",
        "  ix = torch.randint(len(data) - block_size, (batch_size,))\n",
        "  x = torch.stack([data[i:i+block_size] for i in ix])\n",
        "  y = torch.stack([data[i+1:i+block_size+1] for i in ix])\n",
        "  x, y = x.to(device), y.to(device)\n",
        "  return x, y\n",
        "\n",
        "@torch.no_grad()\n",
        "def estimate_loss():\n",
        "  out = {}\n",
        "  model.eval()\n",
        "  for split in ['train', 'val']:\n",
        "    losses = torch.zeros(eval_iters)\n",
        "    for k in range(eval_iters):\n",
        "      X, Y = get_batch(split)\n",
        "      logits, loss = model(X, Y)\n",
        "      losses[k] = loss.item()\n",
        "    out[split] = losses.mean()\n",
        "  model.train()\n",
        "  return out\n",
        "\n",
        "vocab_size = len(token.vocab)\n",
        "model = Transformer(vocab_size)\n",
        "# checkpoint_path = '/content/drive/MyDrive/enigma-2.5b.pth'\n",
        "# checkpoint = torch.load(checkpoint_path)\n",
        "# model.load_state_dict(checkpoint)\n",
        "m = model.to(device)\n",
        "\n",
        "# no of parameters\n",
        "n_param = sum(p.numel() for p in m.parameters())/1e6\n",
        "print(f\"{n_param:.1f} million parameters\")\n",
        "\n",
        "# optimizer\n",
        "optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)\n",
        "steps = []\n",
        "train_losses = []\n",
        "val_losses = []\n",
        "\n",
        "for iter in range(max_iters):\n",
        "\n",
        "  if iter % eval_interval == 0 or iter == max_iters - 1:\n",
        "    losses = estimate_loss()\n",
        "    print(f\"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}\")\n",
        "\n",
        "    steps.append(iter)\n",
        "    train_losses.append(losses['train'])\n",
        "    val_losses.append(losses['val'])\n",
        "\n",
        "  xb, yb = get_batch('train')\n",
        "  logits, loss = model(xb, yb)\n",
        "  optimizer.zero_grad(set_to_none=True)\n",
        "  loss.backward()\n",
        "  optimizer.step()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tzJMKoA35uIV"
      },
      "outputs": [],
      "source": [
        "end_time = timeit.default_timer()\n",
        "print(f\"total parameters: {n_param:.1f} million\")\n",
        "print(f\"trained in {((end_time - start_time)/3600):.2f}hrs\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "eB47Yn9aNrrO"
      },
      "outputs": [],
      "source": [
        "model_save_name = 'consolidated_00.pth'\n",
        "path = f\"/content/drive/MyDrive/{model_save_name}\"\n",
        "torch.save(model.state_dict(), path)\n",
        "\n",
        "# saving safe-tensors\n",
        "from safetensors.torch import save_file\n",
        "\n",
        "model_save_name = 'consolidated_00.safetensors'\n",
        "path = f\"/content/drive/MyDrive/{model_save_name}\"\n",
        "save_file(model.state_dict(), path)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "89TNah_89CRB"
      },
      "outputs": [],
      "source": [
        "!nvidia-smi"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "machine_shape": "hm",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
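For reference, a small round-trip sketch of the notebook's KMerTokenizer (illustrative only; run it where the class is in scope): the sequence is chopped into non-overlapping k-mers, ids are assigned by descending frequency, and decode maps ids back to k-mer strings. Note one caveat visible in the code: encode maps any unseen k-mer to len(vocab), an id that decode cannot resolve.

token = KMerTokenizer(k_mers=4)
token.build_vocab(["ATGCATGCCGTA"])  # 4-mers: ATGC x2, CGTA x1 -> ids 0 and 1
ids = token.encode("ATGCCGTA")       # -> [0, 1]
print(''.join(token.decode(ids)))    # -> 'ATGCCGTA'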
enigma/config_enigma.json
ADDED
@@ -0,0 +1,13 @@
{
  "batch_size": 10,
  "block_size": 512,
  "max_iters": 5000,
  "eval_interval": 50,
  "learning_rate": 3e-5,
  "eval_iters": 100,
  "d_model": 384,
  "n_head": 12,
  "n_layer": 12,
  "dropout": 0.2,
  "norm_eps": 1e-5
}
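A hypothetical sanity check (not in the commit) that loads config_enigma.json the way model.py does and verifies that the keys model.py reads are present and mutually consistent:

import json

REQUIRED = ['batch_size', 'block_size', 'n_head', 'd_model', 'n_layer', 'dropout', 'norm_eps']

with open('config_enigma.json', 'r', encoding='utf-8') as f:
  params = json.load(f)

missing = [k for k in REQUIRED if k not in params]
assert not missing, f"missing keys: {missing}"
# each attention head gets d_model // n_head dims, so this must divide evenly
assert params['d_model'] % params['n_head'] == 0, "n_head must divide d_model"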
enigma/enigma.cpp
ADDED
@@ -0,0 +1,364 @@
#include <torch/torch.h>
#include <iostream>
#include <vector>
#include <map>
#include <cmath>
#include <limits>
#include <algorithm>

// Define device
torch::Device device(torch::kCUDA);

// Define constants
const int batch_size = 8;
const int block_size = 32;
const int max_iters = 1000;
const int eval_interval = 50;
const int eval_iters = 5;
const int d_model = 256;
const int n_layer = 16;
const int n_head = 12;
const float dropout = 0.2;
const float norm_eps = 1e-5;
const int vocab_size = 5;
const float learning_rate = 3e-4;  // referenced by the optimizer below but never defined in the original; value assumed

// sample data: random token streams standing in for a real tokenized dataset
// (the original used torch::rand matrices, but the embedding layer needs integer ids)
torch::Tensor train_data = torch::randint(vocab_size, {100000});
torch::Tensor val_data = torch::randint(vocab_size, {50000});

// Data loading function
std::pair<torch::Tensor, torch::Tensor> get_batch(const std::string& split) {
  torch::Tensor data = (split == "train") ? train_data : val_data;
  torch::Tensor ix = torch::randint(data.size(0) - block_size, {batch_size});
  torch::Tensor x = torch::empty({batch_size, block_size}, torch::kLong);
  torch::Tensor y = torch::empty({batch_size, block_size}, torch::kLong);
  for (int i = 0; i < batch_size; ++i) {
    // contiguous slice of block_size tokens, with targets shifted by one
    int64_t s = ix[i].item<int64_t>();
    x[i] = data.index({torch::indexing::Slice(s, s + block_size)});
    y[i] = data.index({torch::indexing::Slice(s + 1, s + block_size + 1)});
  }
  return std::make_pair(x.to(device), y.to(device));
}

// Custom classes and functions
class SWiGLU : public torch::nn::Module {  // defined but not used below
public:
  SWiGLU() {}

  torch::Tensor forward(torch::Tensor x) {
    torch::Tensor sigmoid_output = torch::sigmoid(x);
    torch::Tensor relu_output = torch::relu(x);
    torch::Tensor out = sigmoid_output * relu_output + (1 - sigmoid_output) * x;
    return out;
  }
};

class UnMaskedHeadImpl : public torch::nn::Module {
public:
  UnMaskedHeadImpl(int d_model, int head_size, float dropout)
      : key(register_module("key", torch::nn::Linear(d_model, head_size))),
        query(register_module("query", torch::nn::Linear(d_model, head_size))),
        value(register_module("value", torch::nn::Linear(d_model, head_size))),
        dropout(register_module("dropout", torch::nn::Dropout(dropout))) {}

  torch::Tensor forward(torch::Tensor x) {
    torch::Tensor key_out = key->forward(x);
    torch::Tensor query_out = query->forward(x);

    // scaled dot-product attention: divide by sqrt(head_size)
    // (the original multiplied by sqrt, which scales the wrong way)
    torch::Tensor weights = query_out.matmul(key_out.transpose(-2, -1)) / std::sqrt((double)key_out.size(-1));
    weights = torch::softmax(weights, -1);
    weights = dropout(weights);

    torch::Tensor value_out = value->forward(x);
    torch::Tensor out = weights.matmul(value_out);
    return out;
  }

private:
  torch::nn::Linear key, query, value;
  torch::nn::Dropout dropout;
};

TORCH_MODULE(UnMaskedHead);

class MaskedHeadImpl : public torch::nn::Module {
public:
  MaskedHeadImpl(int head_size, float dropout, int d_model)
      : key(register_module("key", torch::nn::Linear(d_model, head_size))),
        query(register_module("query", torch::nn::Linear(d_model, head_size))),
        value(register_module("value", torch::nn::Linear(d_model, head_size))),
        dropout(register_module("dropout", torch::nn::Dropout(dropout))) {
    // keep the causal mask as a registered buffer so it follows the module's device
    tril = register_buffer("tril", torch::tril(torch::ones({block_size, block_size})));
  }

  torch::Tensor forward(torch::Tensor x) {
    int64_t T = x.size(1);
    torch::Tensor key_out = key->forward(x);
    torch::Tensor query_out = query->forward(x);

    torch::Tensor weights = query_out.matmul(key_out.transpose(-2, -1)) / std::sqrt((double)key_out.size(-1));
    // causal masking (the original used Python slice syntax here, which is not C++)
    weights = weights.masked_fill(
        tril.index({torch::indexing::Slice(0, T), torch::indexing::Slice(0, T)}) == 0,
        -std::numeric_limits<float>::infinity());
    weights = torch::softmax(weights, -1);
    weights = dropout(weights);

    torch::Tensor value_out = value->forward(x);
    torch::Tensor out = weights.matmul(value_out);
    return out;
  }

private:
  torch::nn::Linear key, query, value;
  torch::nn::Dropout dropout;
  torch::Tensor tril;
};

TORCH_MODULE(MaskedHead);

class MultiUnMaskedImpl : public torch::nn::Module {
public:
  MultiUnMaskedImpl(int d_model, int n_head, float dropout)
      : proj(register_module("proj", torch::nn::Linear(n_head * (d_model / n_head), d_model))),
        dropout(register_module("dropout", torch::nn::Dropout(dropout))) {
    for (int i = 0; i < n_head; ++i) {
      heads.push_back(register_module("head" + std::to_string(i), UnMaskedHead(d_model, d_model / n_head, dropout)));
    }
  }

  torch::Tensor forward(torch::Tensor x) {
    std::vector<torch::Tensor> head_outputs;
    for (auto& head : heads) {
      head_outputs.push_back(head->forward(x));
    }
    torch::Tensor out = torch::cat(head_outputs, -1);
    out = dropout(out);
    out = proj(out);
    return out;
  }

private:
  torch::nn::Linear proj;
  torch::nn::Dropout dropout;
  std::vector<UnMaskedHead> heads;
};

TORCH_MODULE(MultiUnMasked);

class MultiMaskedImpl : public torch::nn::Module {
public:
  MultiMaskedImpl(int d_model, int n_head, float dropout)
      : proj(register_module("proj", torch::nn::Linear(n_head * (d_model / n_head), d_model))),
        dropout(register_module("dropout", torch::nn::Dropout(dropout))) {
    for (int i = 0; i < n_head; ++i) {
      // MaskedHeadImpl's parameter order is (head_size, dropout, d_model);
      // the original passed the arguments in the wrong order
      heads.push_back(register_module("head" + std::to_string(i), MaskedHead(d_model / n_head, dropout, d_model)));
    }
  }

  torch::Tensor forward(torch::Tensor x) {
    std::vector<torch::Tensor> head_outputs;
    for (auto& head : heads) {
      head_outputs.push_back(head->forward(x));
    }
    torch::Tensor out = torch::cat(head_outputs, -1);
    out = dropout(out);
    out = proj(out);
    return out;
  }

private:
  torch::nn::Linear proj;
  torch::nn::Dropout dropout;
  std::vector<MaskedHead> heads;
};

TORCH_MODULE(MultiMasked);

class FeedForwardImpl : public torch::nn::Module {
public:
  FeedForwardImpl(int d_model, float dropout)
      : net(register_module("net", torch::nn::Sequential(
            torch::nn::Linear(d_model, 4 * d_model),
            torch::nn::GELU(),
            torch::nn::Linear(4 * d_model, d_model),
            torch::nn::Dropout(dropout)
        ))) {}

  torch::Tensor forward(torch::Tensor x) {
    return net->forward(x);
  }

private:
  torch::nn::Sequential net;
};

TORCH_MODULE(FeedForward);

class BlockImpl : public torch::nn::Module {
public:
  BlockImpl(int d_model, int n_head, float norm_eps, float dropout)
      : sa_masked(register_module("sa_masked", MultiMasked(d_model, n_head, dropout))),
        sa_unmasked(register_module("sa_unmasked", MultiUnMasked(d_model, n_head, dropout))),
        ffwd(register_module("ffwd", FeedForward(d_model, dropout))),
        norm1(register_module("norm1", torch::nn::LayerNorm(torch::nn::LayerNormOptions({d_model}).eps(norm_eps)))),
        norm2(register_module("norm2", torch::nn::LayerNorm(torch::nn::LayerNormOptions({d_model}).eps(norm_eps)))) {}

  torch::Tensor forward(torch::Tensor x) {
    torch::Tensor x2 = x + sa_unmasked->forward(norm1->forward(x));
    x = x2 + ffwd->forward(norm2->forward(x2));

    x2 = x + sa_masked->forward(norm1->forward(x));
    x = x2 + ffwd->forward(norm2->forward(x2));

    return x;
  }

private:
  MultiMasked sa_masked;
  MultiUnMasked sa_unmasked;
  FeedForward ffwd;
  torch::nn::LayerNorm norm1, norm2;
};

TORCH_MODULE(Block);

class EnigmaImpl : public torch::nn::Module {
public:
  EnigmaImpl(int vocab_size, int block_size, int d_model, int n_layer, int n_head, float dropout, float norm_eps)
      : toked_model(register_module("toked_model", torch::nn::Embedding(vocab_size, d_model))),
        pos_encod(register_module("pos_encod", torch::nn::Embedding(block_size, d_model))),
        norm_final(register_module("norm_final", torch::nn::LayerNorm(torch::nn::LayerNormOptions({d_model}).eps(norm_eps)))),
        linear_final(register_module("linear_final", torch::nn::Linear(d_model, vocab_size))),
        block_size(block_size) {
    for (int i = 0; i < n_layer; ++i) {
      block_layers.push_back(register_module("block" + std::to_string(i), Block(d_model, n_head, norm_eps, dropout)));
    }
    _init_weights(this);
  }

  void _init_weights(torch::nn::Module* module) {
    auto parameters = module->named_parameters();
    for (auto& param : parameters) {
      if (param.key().find("weight") != std::string::npos) {
        torch::nn::init::normal_(param.value(), 0.0, 0.02);
      } else if (param.key().find("bias") != std::string::npos) {
        torch::nn::init::zeros_(param.value());
      }
    }
  }

  std::pair<torch::Tensor, torch::Tensor> forward(torch::Tensor idx, torch::Tensor targets = torch::Tensor()) {
    torch::Tensor toked_model_out = toked_model->forward(idx);
    // positional ids must live on the same device as the input
    torch::Tensor pos_encod_out = pos_encod->forward(
        torch::arange(idx.size(1), torch::TensorOptions().dtype(torch::kLong).device(idx.device())));
    torch::Tensor x = toked_model_out + pos_encod_out;

    for (auto& block : block_layers) {
      x = block->forward(x);
    }

    torch::Tensor logits = linear_final->forward(norm_final->forward(x));

    if (!targets.defined()) {
      return {logits, torch::Tensor()};
    } else {
      logits = logits.view({-1, logits.size(-1)});
      targets = targets.view({-1});
      torch::Tensor loss = torch::nn::functional::cross_entropy(logits, targets);
      return {logits, loss};
    }
  }

  // fixed: this returns a filtered logits tensor (the original declared a beam-list type)
  torch::Tensor top_k_filtering(torch::Tensor logits, int top_k) {
    torch::Tensor top_values, top_indices;
    std::tie(top_values, top_indices) = torch::topk(logits, top_k, -1);

    torch::Tensor min_value = top_values.index({torch::indexing::Slice(), top_k - 1}).unsqueeze(-1);
    torch::Tensor filtered_logits = torch::where(
        logits < min_value, torch::full_like(logits, -std::numeric_limits<float>::infinity()), logits);
    return filtered_logits;
  }

  // beam-style sampling; assumes a batch of one prompt
  std::vector<std::vector<std::pair<torch::Tensor, float>>> complex_generate(torch::Tensor idx, int max_new_tokens, float temperature = 1.0, int top_k = 3, int beam_width = 5) {
    std::vector<std::vector<std::pair<torch::Tensor, float>>> completed_beams;
    std::vector<std::pair<torch::Tensor, float>> beam = {std::make_pair(idx.clone(), 0.0f)};

    for (int i = 0; i < max_new_tokens; ++i) {
      std::vector<std::pair<torch::Tensor, float>> new_beam;

      for (auto& beam_item : beam) {
        torch::Tensor logits, loss;
        std::tie(logits, loss) = forward(beam_item.first);
        logits = logits.index({torch::indexing::Slice(), -1});  // last-token predictions

        // temperature scaling and top-k filtering happen on logits, before softmax
        // (the original filtered the probabilities after softmax, leaving -inf entries)
        logits = logits / temperature;
        if (top_k > 0) {
          logits = top_k_filtering(logits, top_k);
        }
        torch::Tensor probs = torch::nn::functional::softmax(logits, -1);

        // sample beam_width candidate continuations
        torch::Tensor sampled_idx = torch::multinomial(probs, beam_width, true);

        for (int j = 0; j < beam_width; ++j) {
          torch::Tensor next_tok = sampled_idx.index({torch::indexing::Slice(), j}).unsqueeze(-1);
          torch::Tensor new_idx = torch::cat({beam_item.first, next_tok}, 1);
          float new_log_prob = beam_item.second +
              std::log(probs.index({0, next_tok.index({0, 0}).item<int64_t>()}).item<float>());
          new_beam.push_back(std::make_pair(new_idx, new_log_prob));
        }
      }

      // Sort new beam by log probabilities and only keep the top beams
      std::sort(new_beam.begin(), new_beam.end(), [](const std::pair<torch::Tensor, float>& a, const std::pair<torch::Tensor, float>& b) {
        return a.second > b.second;
      });
      beam.assign(new_beam.begin(), new_beam.begin() + beam_width);
    }

    completed_beams.push_back(beam);
    return completed_beams;
  }

private:
  torch::nn::Embedding toked_model, pos_encod;
  std::vector<Block> block_layers;
  torch::nn::LayerNorm norm_final;
  torch::nn::Linear linear_final;
  int block_size;
};

TORCH_MODULE(Enigma);

// estimate_loss() was called in main() but never defined in the original file;
// a minimal stand-in that averages the loss over a few held-out batches:
std::map<std::string, float> estimate_loss(Enigma& model) {
  torch::NoGradGuard no_grad;
  std::map<std::string, float> out;
  for (const auto& split : {std::string("train"), std::string("val")}) {
    float total = 0.0f;
    for (int k = 0; k < eval_iters; ++k) {
      auto [X, Y] = get_batch(split);
      auto [logits, loss] = model->forward(X, Y);
      total += loss.item<float>();
    }
    out[split] = total / eval_iters;
  }
  return out;
}

int main() {
  // Set seed
  torch::manual_seed(1400);

  // Create model
  Enigma model(vocab_size, block_size, d_model, n_layer, n_head, dropout, norm_eps);
  model->to(device);

  // Define optimizer
  torch::optim::AdamW optimizer(model->parameters(), torch::optim::AdamWOptions(learning_rate));

  // Training loop
  std::vector<float> train_losses, val_losses;
  for (int iter = 0; iter < max_iters; ++iter) {
    if (iter % eval_interval == 0 || iter == max_iters - 1) {
      // Evaluate and print losses
      auto losses = estimate_loss(model);
      std::cout << "step " << iter << ": train loss " << losses["train"] << ", val loss " << losses["val"] << std::endl;

      // Save losses for plotting
      train_losses.push_back(losses["train"]);
      val_losses.push_back(losses["val"]);
    }

    // Get batch, forward pass, loss calculation, backward pass, optimizer step
    auto [xb, yb] = get_batch("train");
    torch::Tensor logits, loss;
    std::tie(logits, loss) = model->forward(xb, yb);

    optimizer.zero_grad();
    loss.backward();
    optimizer.step();
  }

  return 0;
}
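The top_k_filtering above keeps the k largest logits per row and floors the rest at negative infinity, so the subsequent softmax assigns them zero probability. A compact PyTorch sketch of the same idea (it mirrors generate.py's _top_k_filtering):

import torch

def top_k_filtering(logits: torch.Tensor, top_k: int) -> torch.Tensor:
  # keep the k largest logits per row; everything else becomes -inf,
  # so softmax gives those tokens zero probability mass
  values, _ = torch.topk(logits, top_k, dim=-1)
  min_value = values[:, -1].unsqueeze(-1)
  return torch.where(logits < min_value, torch.full_like(logits, float('-inf')), logits)

probs = torch.softmax(top_k_filtering(torch.randn(1, 10), top_k=3), dim=-1)
print((probs > 0).sum().item())  # 3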
enigma/generate.py
ADDED
@@ -0,0 +1,126 @@
import os
current_directory = os.path.dirname(os.path.abspath(__file__))
os.chdir(current_directory)

with open('../parquet files/new_dna.txt', 'r', encoding='utf-8') as file:
  captions = file.read()

print(f"{(len(captions)/1e6):.2f} million letters")

from tokenizer import PerCharTokenizer

tokenizer = PerCharTokenizer()
vocab_size = tokenizer.vocab_size

import torch
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'

from model import Transformer

class Generator(Transformer):
  def __init__(self, vocab_size):
    # Transformer.__init__ requires vocab_size (the original called it with no
    # arguments); it also sets self.block_size
    super().__init__(vocab_size)
    self.vocab_size = vocab_size

  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=0):
    """
    generate new tokens using the trained model

    Args:
      - idx (Tensor): input tensor representing initial token indices
      - max_new_tokens (int): max no of new tokens to generate
      - temperature (float): softmax temperature for sampling
      - top_k (int): no of top tokens to consider in sampling

    Returns:
      - generated_tokens (list): list of generated token indices
    """
    generated_tokens = []

    for _ in range(max_new_tokens):
      idx_cond = idx[:, -self.block_size:]
      logits, _ = self(idx_cond)
      logits = logits[:, -1, :]

      scaled_logits = logits / temperature
      if top_k > 0:
        scaled_logits = self._top_k_filtering(scaled_logits, top_k)

      probs = F.softmax(scaled_logits, dim=-1)
      sampled_idx = torch.multinomial(probs, num_samples=1)
      generated_tokens.append(sampled_idx.item())
      idx = torch.cat((idx, sampled_idx), dim=1)

    return generated_tokens

  def generate_masked_tokens(self, idx, masked_indices, temperature=1.0, top_k=0):
    """
    Generate predictions for masked tokens using the trained model.

    Args:
      - idx (Tensor): input tensor representing token indices
      - masked_indices (Tensor): tensor of indices indicating masked positions
      - temperature (float): softmax temperature for sampling
      - top_k (int): no of top tokens to consider in sampling

    Returns:
      - predicted_tokens (Tensor): tensor of predicted token indices
    """
    B, T = idx.shape

    toked_model = self.toked_model(idx)
    pos_encod = self.pos_encod(torch.arange(T, device=device))
    x = toked_model + pos_encod

    for layer in self.enc_layer:
      x_out = layer(x)

    for layer in self.dec_layer:
      x_final = layer(x, x_out)

    x_masked = x_final.clone()
    x_masked[masked_indices] = self.toked_model(torch.tensor([6], device=device))

    x_masked = self.norm_final(x_masked)
    logits = self.linear_final(x_masked)

    masked_logits = logits[masked_indices].view(-1, logits.size(-1))
    scaled_logits = masked_logits / temperature
    if top_k > 0:
      scaled_logits = self._top_k_filtering(scaled_logits, top_k)

    probs = F.softmax(scaled_logits, dim=-1)
    predicted_indices = torch.argmax(probs, dim=-1)

    return predicted_indices

  def _top_k_filtering(self, logits, top_k):
    """
    filter logits to keep only the top-k tokens

    Args:
      - logits (Tensor): input tensor representing unscaled logits
      - top_k (int): no of top tokens to keep

    Returns:
      - filtered_logits (Tensor): filtered logits with only top-k tokens remaining
    """
    values, indices = torch.topk(logits, top_k, dim=-1)
    min_value = values[:, -1].unsqueeze(-1).expand_as(logits)
    filtered_logits = torch.where(logits < min_value, torch.ones_like(logits) * -float('inf'), logits)

    return filtered_logits

# generate() is an instance method, so build a Generator and load the checkpoint
# into it (the original loaded the weights into a bare Transformer and then
# called Generator.generate on the class itself)
model = Generator(vocab_size=vocab_size)
checkpoint_path = '../trained models/enigma_47m.pth'
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint)
m = model.to(device)

target_text = "AGTTCTGCGAT"
context = torch.tensor([tokenizer.encode(target_text)], dtype=torch.long, device=device)
generated_output = tokenizer.decode(m.generate(context, max_new_tokens=10, temperature=0.5, top_k=5))
print(f"{target_text}{generated_output}")
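generate() divides the logits by temperature before the softmax; a tiny standalone illustration of what that does to the sampling distribution (illustrative values only):

import torch
from torch.nn import functional as F

logits = torch.tensor([[2.0, 1.0, 0.5]])
for t in (0.5, 1.0, 2.0):
  # lower temperature sharpens the distribution toward the argmax,
  # higher temperature flattens it toward uniform
  print(t, F.softmax(logits / t, dim=-1))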
enigma/model.py
ADDED
@@ -0,0 +1,388 @@
1 |
+
"""
|
2 |
+
transformer based model, but with few minimal tweaks
|
3 |
+
trained a 2.5billion parameters model with current set configurations
|
4 |
+
"""
|
5 |
+
|
6 |
+
import torch
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
current_directory = os.path.dirname(os.path.abspath(__file__))
|
10 |
+
os.chdir(current_directory)
|
11 |
+
|
12 |
+
import torch.nn as nn
|
13 |
+
from torch.nn import functional as F
|
14 |
+
|
15 |
+
with open('config_enigma.json', 'r', encoding='utf-8') as file:
|
16 |
+
params = json.load(file)
|
17 |
+
|
18 |
+
batch_size = params['batch_size']
|
19 |
+
block_size = params['block_size']
|
20 |
+
n_head = params['n_head']
|
21 |
+
d_model = params['d_model']
|
22 |
+
n_layers = params['n_layer']
|
23 |
+
dropout = params['dropout']
|
24 |
+
norm_eps = params['norm_eps']
|
25 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
26 |
+
|
27 |
+
class AttentionHead(nn.Module):
|
28 |
+
"""
|
29 |
+
initialize a single head of self attention.
|
30 |
+
|
31 |
+
Args:
|
32 |
+
- d_model (int): dimensionality of the model's hidden layers
|
33 |
+
- head_size (int): dimensionality of each attention head
|
34 |
+
- dropout (float): dropout probability
|
35 |
+
- block_size (int): the maximum sequence length for positional encoding
|
36 |
+
"""
|
37 |
+
def __init__(self, d_model, head_size, dropout, block_size):
|
38 |
+
super().__init__()
|
39 |
+
self.key = nn.Linear(d_model, head_size, bias=True)
|
40 |
+
self.query = nn.Linear(d_model, head_size, bias=True)
|
41 |
+
self.value = nn.Linear(d_model, head_size, bias=False)
|
42 |
+
self.dropout = nn.Dropout(dropout)
|
43 |
+
self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
|
44 |
+
|
45 |
+
self.rel_pos_emb = nn.Parameter(torch.randn(block_size, block_size, head_size))
|
46 |
+
|
47 |
+
def forward(self, x, mask=False):
|
48 |
+
"""
|
49 |
+
forward pass of a single attention head.
|
50 |
+
|
51 |
+
Args:
|
52 |
+
- x (Tensor): input tensor.
|
53 |
+
- mask (bool): flag indicating whether to apply masking
|
54 |
+
|
55 |
+
Returns:
|
56 |
+
- out (Tensor): output tensor after self attention
|
57 |
+
"""
|
58 |
+
B, T, C = x.shape
|
59 |
+
key = self.key(x)
|
60 |
+
query = self.query(x)
|
61 |
+
|
62 |
+
scores = torch.matmul(query, key.transpose(-2, -1)) / (key.shape[-1] ** -0.5)
|
63 |
+
rel_pos_scores = torch.einsum('btc,tvc->btv', query, self.rel_pos_emb[:T, :T])
|
64 |
+
scores += rel_pos_scores
|
65 |
+
|
66 |
+
if mask:
|
67 |
+
scores = scores.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
|
68 |
+
|
69 |
+
weights = F.softmax(scores, dim=-1)
|
70 |
+
weights = self.dropout(weights)
|
71 |
+
|
72 |
+
value = self.value(x)
|
73 |
+
out = torch.matmul(weights, value)
|
74 |
+
return out
|
75 |
+
|
76 |
+
class MultiHeadAttention(nn.Module):
|
77 |
+
"""
|
78 |
+
initialize a multi-head attention module.
|
79 |
+
|
80 |
+
Args:
|
81 |
+
- d_model (int): dimensionality of the model's hidden layers
|
82 |
+
- n_head (int): no of attention heads
|
83 |
+
- dropout (float): dropout probability
|
84 |
+
- block_size (int): context length
|
85 |
+
"""
|
86 |
+
def __init__(self, d_model, n_head, dropout, block_size):
|
87 |
+
head_size = d_model // n_head
|
88 |
+
super().__init__()
|
89 |
+
self.heads = nn.ModuleList([AttentionHead(d_model=d_model, dropout=dropout, head_size=head_size, block_size=block_size) for _ in range(n_head)])
|
90 |
+
self.proj = nn.Linear(n_head * head_size, d_model)
|
91 |
+
self.dropout = nn.Dropout(dropout)
|
92 |
+
|
93 |
+
def forward(self, x, mask):
|
94 |
+
"""
|
95 |
+
forward pass of the multi-head attention module
|
96 |
+
|
97 |
+
Args:
|
98 |
+
- x (Tensor): input tensor
|
99 |
+
- mask (bool): flag indicating whether to apply masking
|
100 |
+
|
101 |
+
Returns:
|
102 |
+
- out (Tensor): output tensor after multi-head attention
|
103 |
+
|
104 |
+
"""
|
105 |
+
out = torch.cat([h(x, mask=mask) for h in self.heads], dim=-1)
|
106 |
+
out = self.dropout(self.proj(out))
|
107 |
+
return out
|
108 |
+
|
109 |
+
class FeedForward(nn.Module):
|
110 |
+
"""
|
111 |
+
initialize a feedforward network module
|
112 |
+
|
113 |
+
Args:
|
114 |
+
- d_model (int): the dimensionality of the model's hidden layers
|
115 |
+
- dropout (float): dropout probability
|
116 |
+
|
117 |
+
"""
|
118 |
+
def __init__(self, d_model, dropout):
|
119 |
+
super().__init__()
|
120 |
+
self.net = nn.Sequential(
|
121 |
+
nn.Linear(d_model, 10*d_model),
|
122 |
+
nn.GELU(),
|
123 |
+
nn.Linear(10*d_model, d_model),
|
124 |
+
nn.Dropout(dropout)
|
125 |
+
)
|
126 |
+
|
127 |
+
def forward(self, x):
|
128 |
+
"""
|
129 |
+
forward pass of the feedforward network module
|
130 |
+
|
131 |
+
Args:
|
132 |
+
- x (Tensor): input tensor
|
133 |
+
|
134 |
+
Returns:
|
135 |
+
- out (Tensor): output tensor after passing through the feedforward network
|
136 |
+
"""
|
137 |
+
return self.net(x)

class EncoderNetwork(nn.Module):
  """
  initialize an encoder network module

  Args:
  - d_model (int): dimensionality of the model's hidden layers
  - n_head (int): number of attention heads in multi-head attention layers
  - norm_eps (float): epsilon value for layer normalization
  - dropout (float): dropout probability
  - block_size (int): the maximum sequence length for positional encoding
  """
  def __init__(self, d_model, n_head, norm_eps, dropout, block_size):
    super().__init__()
    self.s_att = MultiHeadAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)
    self.ffwd = FeedForward(d_model, dropout)
    self.dropout = nn.Dropout(dropout)
    self.norm1 = nn.LayerNorm(d_model, eps=norm_eps)
    self.norm2 = nn.LayerNorm(d_model, eps=norm_eps)

  def forward(self, src):
    """
    forward pass of the encoder network module.

    Args:
    - src (Tensor): input tensor representing source data

    Returns:
    - src (Tensor): output tensor after passing through the encoder network
    """
    src2 = self.s_att(src, mask=False)   # unmasked self-attention
    src = src + self.dropout(src2)
    src = self.norm1(src)

    src2 = self.ffwd(src)                # position-wise feed-forward
    src = src + self.dropout(src2)
    src = self.norm2(src)

    return src
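
# note (descriptive, not part of the original file): both sub-blocks above are
# post-norm -- the residual is added first and LayerNorm is applied after,
# matching the original "Attention Is All You Need" ordering rather than the
# pre-norm style used in GPT-like blocks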

class DecoderNetwork(nn.Module):
  """
  initialize a decoder network module

  Args:
  - d_model (int): dimensionality of the model's hidden layers
  - n_head (int): number of attention heads in multi-head attention layers
  - norm_eps (float): epsilon value for layer normalization
  - dropout (float): dropout probability
  - block_size (int): the maximum sequence length for positional encoding
  """
  def __init__(self, d_model, n_head, norm_eps, dropout, block_size):
    super().__init__()
    self.s_att = MultiHeadAttention(n_head=n_head, d_model=d_model, dropout=dropout, block_size=block_size)
    self.ffwd = FeedForward(d_model, dropout)
    self.dropout = nn.Dropout(dropout)
    self.norm1 = nn.LayerNorm(d_model, eps=norm_eps)
    self.norm2 = nn.LayerNorm(d_model, eps=norm_eps)

  def forward(self, src, att):
    """
    forward pass of the decoder network module.

    Args:
    - src (Tensor): input tensor, same as the encoder's inputs
    - att (Tensor): encoder's attention output

    Returns:
    - src_f (Tensor): final output tensor
    """
    src2 = self.s_att(src, mask=True)    # masked self-attention over the inputs
    src = src + self.dropout(src2)
    src = src + self.norm1(src)

    att = src + att                      # mix in the encoder's output
    att2 = self.s_att(att, mask=False)   # unmasked attention over the mixed stream
    att2 = att + self.dropout(att2)
    trg = att2 + self.norm1(att2)

    src_f2 = self.ffwd(self.norm2(trg))
    src_f = trg + self.dropout(src_f2)   # residual connection around the feed-forward
    src_f = self.norm2(src_f)

    return src_f
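
# note (descriptive, not part of the original file): instead of a separate
# cross-attention module, this decoder reuses the same multi-head layer twice --
# first masked over `src`, then unmasked over `src + att`, where `att` is the
# encoder output handed in by the Transformer below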

class Transformer(nn.Module):
  """
  initialize a Transformer model

  Args:
  - vocab_size (int): size of the vocabulary
  - d_model (int): dimensionality of the model's hidden layers
  - block_size (int): maximum sequence length for positional encoding/context length
  - n_layers (int): number of encoder and decoder layers in the Transformer
  - n_head (int): number of attention heads in multi-head attention layers
  - norm_eps (float): epsilon value for layer normalization
  - dropout (float): dropout probability
  """
  def __init__(self, vocab_size):
    super().__init__()
    self.block_size = block_size
    self.toked_model = nn.Embedding(vocab_size, d_model)
    self.pos_encod = nn.Embedding(block_size, d_model)
    self.enc_layer = nn.ModuleList([EncoderNetwork(n_head=n_head, norm_eps=norm_eps, block_size=block_size, dropout=dropout, d_model=d_model) for _ in range(n_layers)])
    self.dec_layer = nn.ModuleList([DecoderNetwork(n_head=n_head, norm_eps=norm_eps, block_size=block_size, dropout=dropout, d_model=d_model) for _ in range(n_layers)])

    self.norm_final = nn.LayerNorm(d_model)
    self.linear_final = nn.Linear(d_model, vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.apply(self._init_weights)

  def _init_weights(self, module):
    """
    initialize weights of linear and embedding layers

    Args:
    - module (nn.Module): the module to initialize weights for
    """
    if isinstance(module, nn.Linear):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, idx, targets=None):
    """
    forward pass of the transformer model

    Args:
    - idx (Tensor): input tensor representing token indices
    - targets (Tensor): target tensor for computing loss during training

    Returns:
    - logits (Tensor): output logits from the final linear layer
    - loss (Tensor): optional. computed cross-entropy loss if targets are provided, else None
    """
    B, T = idx.shape

    toked_model = self.toked_model(idx)
    pos_encod = self.pos_encod(torch.arange(T, device=device))
    x = toked_model + pos_encod

    # stack the encoder layers, feeding each layer's output into the next
    x_out = x
    for layer in self.enc_layer:
      x_out = layer(x_out)

    # stack the decoder layers the same way, conditioning on the encoder output
    x_final = x
    for layer in self.dec_layer:
      x_final = layer(x_final, x_out)

    x_final = self.norm_final(x_final)
    logits = self.linear_final(x_final)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss
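
  # shape sketch for the loss computation above (the numbers are illustrative):
  # with B=4, T=8 and vocab size C=10, logits (4, 8, 10) flatten to (32, 10) and
  # targets (4, 8) to (32,), the pair F.cross_entropy expects for per-token
  # next-token prediction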

  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=0):
    """
    generate new tokens using the trained model

    Args:
    - idx (Tensor): input tensor representing initial token indices
    - max_new_tokens (int): max number of new tokens to generate
    - temperature (float): softmax temperature for sampling
    - top_k (int): number of top tokens to consider in sampling

    Returns:
    - generated_tokens (list): list of generated token indices
    """
    generated_tokens = []

    for _ in range(max_new_tokens):
      idx_cond = idx[:, -self.block_size:]   # crop the context to the block size
      logits, _ = self(idx_cond)
      logits = logits[:, -1, :]              # keep only the last time step

      scaled_logits = logits / temperature
      if top_k > 0:
        scaled_logits = self._top_k_filtering(scaled_logits, top_k)

      probs = F.softmax(scaled_logits, dim=-1)
      sampled_idx = torch.multinomial(probs, num_samples=1)
      generated_tokens.append(sampled_idx.item())
      idx = torch.cat((idx, sampled_idx), dim=1)

    return generated_tokens
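
  # usage sketch (assumes a trained model on `device`; the values are made up):
  #   ctx = torch.zeros((1, 1), dtype=torch.long, device=device)   # seed context
  #   out = model.generate(ctx, max_new_tokens=50, temperature=0.8, top_k=5)
  #   # `out` is a plain python list of sampled token ids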

  def generate_masked_tokens(self, idx, masked_indices, temperature=1.0, top_k=0):
    """
    generate predictions for masked tokens using the trained model.

    Args:
    - idx (Tensor): input tensor representing token indices
    - masked_indices (Tensor): tensor of indices indicating masked positions
    - temperature (float): softmax temperature for sampling
    - top_k (int): number of top tokens to consider in sampling

    Returns:
    - predicted_tokens (Tensor): tensor of predicted token indices
    """
    B, T = idx.shape

    toked_model = self.toked_model(idx)
    pos_encod = self.pos_encod(torch.arange(T, device=device))
    x = toked_model + pos_encod

    x_out = x
    for layer in self.enc_layer:
      x_out = layer(x_out)

    x_final = x
    for layer in self.dec_layer:
      x_final = layer(x_final, x_out)

    # embed a hard-coded mask-token id (6) at the masked positions
    x_masked = x_final.clone()
    x_masked[masked_indices] = self.toked_model(torch.tensor([6], device=device))

    x_masked = self.norm_final(x_masked)
    logits = self.linear_final(x_masked)

    masked_logits = logits[masked_indices].view(-1, logits.size(-1))
    scaled_logits = masked_logits / temperature
    if top_k > 0:
      scaled_logits = self._top_k_filtering(scaled_logits, top_k)

    probs = F.softmax(scaled_logits, dim=-1)
    predicted_indices = torch.argmax(probs, dim=-1)

    return predicted_indices

  def _top_k_filtering(self, logits, top_k):
    """
    filter logits to keep only the top-k tokens

    Args:
    - logits (Tensor): input tensor representing unscaled logits
    - top_k (int): number of top tokens to keep

    Returns:
    - filtered_logits (Tensor): filtered logits with only top-k tokens remaining
    """
    values, indices = torch.topk(logits, top_k, dim=-1)
    min_value = values[:, -1].unsqueeze(-1).expand_as(logits)   # smallest kept logit per row
    filtered_logits = torch.where(logits < min_value, torch.ones_like(logits) * -float('inf'), logits)

    return filtered_logits
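
# worked example for _top_k_filtering (illustrative, not part of the original
# file): with logits [[1.0, 3.0, 2.0, 0.5]] and top_k=2, torch.topk keeps
# [3.0, 2.0]; min_value broadcasts 2.0 across the row, and torch.where maps
# everything below it to -inf, giving [[-inf, 3.0, 2.0, -inf]] -- softmax then
# assigns zero probability outside the top-k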
enigma/run.py
ADDED
@@ -0,0 +1,100 @@
"""
use this file to train the model

working:
- imports various dependencies first, and then loads the training data
- tokenizes it on a per-character basis
- loads the required hyper-parameters and the model file
- trains it till 'max_iters' and saves the model state, and generates outputs

with the current configuration, the model can reach up to ~60 million parameters
and approach ~99% accuracy on next-token prediction
"""

import torch
import json
import os
current_directory = os.path.dirname(os.path.abspath(__file__))
os.chdir(current_directory)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

with open('../parquet files/new_dna.txt', 'r', encoding='utf-8') as file:
  captions = file.read()

print(f"{(len(captions)/1e6):.2f} million letters")

from ..tokenizer import PerCharTokenizer

tokenizer = PerCharTokenizer()
vocab_size = tokenizer.vocab_size
# Train and test splits
data = torch.tensor(tokenizer.encode(captions), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

with open('config_enigma.json', 'r', encoding='utf-8') as file:
  params = json.load(file)

# required parameters
batch_size = params['batch_size']
block_size = params['block_size']
max_iters = params['max_iters']
eval_interval = params['eval_interval']
eval_iters = params['eval_iters']
learning_rate = params['learning_rate']

torch.manual_seed(1400)
# data loading
def get_batch(split):
  # generate a small batch of data of inputs x and targets y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x, y
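
# illustrative batch (hypothetical data, not part of the original file): if
# data were [5, 1, 4, 2, 9, ...] and block_size were 4, index i=0 would give
#   x = [5, 1, 4, 2]   and   y = [1, 4, 2, 9]
# so y[t] is always the next token the model must predict from x[:t+1]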

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

from model import Transformer
model = Transformer(vocab_size=vocab_size)
m = model.to(device)

# number of parameters
n_param = sum(p.numel() for p in m.parameters())/1e6
print(f"{n_param:.2f} million")
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
steps = []
train_losses = []
val_losses = []

for iter in range(max_iters):

  if iter % eval_interval == 0 or iter == max_iters - 1:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    steps.append(iter)
    train_losses.append(losses['train'])
    val_losses.append(losses['val'])

  xb, yb = get_batch('train')
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

torch.save(model.state_dict(), f'enigma_{n_param:.0f}m.pth')
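
# minimal reload sketch for the checkpoint above (assumes the same config and
# vocab used during training):
#   model = Transformer(vocab_size=vocab_size)
#   model.load_state_dict(torch.load(f'enigma_{n_param:.0f}m.pth'))
#   model.eval()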