aliabd committed
Commit · c6e7238
1 Parent(s): c528e7b
full working demo
Browse files
- .idea/gpt-neo.iml +8 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/modules.xml +8 -0
- CODEOWNERS +1 -0
- Dockerfile +15 -0
- GPTNeo_example_notebook.ipynb +0 -0
- LICENSE +21 -0
- app.py +12 -0
- configs.py +47 -0
- configs/dataset_configs/example.json +8 -0
- configs/dataset_configs/openwebtext2_new_inputs.json +9 -0
- configs/dataset_configs/pile.json +9 -0
- configs/gpt2_small.json +36 -0
- configs/gpt3_13B_256.json +40 -0
- configs/gpt3_13B_256_Pile.json +38 -0
- configs/gpt3_2-7B_256.json +38 -0
- configs/gpt3_6-7B_256.json +36 -0
- configs/gpt3_PAR_small_256.json +36 -0
- configs/gpt3_XL_256_Pile.json +37 -0
- configs/gpt3_large_256.json +39 -0
- configs/gpt3_medium_256.json +36 -0
- configs/gpt3_small_256.json +36 -0
- data/create_tfrecords.py +263 -0
- data/encoders.py +28 -0
- data/train_tokenizer.py +73 -0
- docker-compose.yml +67 -0
- encoders.py +28 -0
- export.py +14 -0
- gradio/demo.py +12 -0
- inputs.py +384 -0
- main.py +257 -0
- model_fns.py +305 -0
- models/activations.py +95 -0
- models/gpt2/gpt2.py +217 -0
- models/layers.py +357 -0
- models/utils.py +124 -0
- optimizers.py +176 -0
- requirements.txt +18 -0
- run_experiment.py +265 -0
- sample.py +218 -0
- tasks.py +116 -0
- test_models.py +180 -0
- utils.py +291 -0
.idea/gpt-neo.iml
ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/gpt-neo.iml" filepath="$PROJECT_DIR$/.idea/gpt-neo.iml" />
+    </modules>
+  </component>
+</project>
CODEOWNERS
ADDED
@@ -0,0 +1 @@
+* EleutherAI/pm-gptneo
Dockerfile
ADDED
@@ -0,0 +1,15 @@
+FROM gcr.io/deeplearning-platform-release/tf-cpu.1-15
+
+WORKDIR /neogpt
+
+# Make RUN commands use `bash --login`:
+SHELL ["/bin/bash", "--login", "-c"]
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update -y && apt-get install tmux -y
+RUN conda install gcc_linux-64 gxx_linux-64 -y
+ADD requirements.txt .
+RUN pip install -r requirements.txt
+RUN apt-get install screen htop -y
+RUN python -m pip install tensorboard==1.15 cloud_tpu_profiler==1.15
+
+CMD tmux
GPTNeo_example_notebook.ipynb
ADDED
The diff for this file is too large to render.
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 EleutherAI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
app.py
ADDED
@@ -0,0 +1,12 @@
+import gradio as gr
+
+title = "GPT-Neo Demo"
+description = "demo for GPT-Neo by EleutherAI for text generation. To use it, simply add your text, or click one of the examples to load them. Read more at the links below."
+article = "<p style='text-align: center'><a href='http://github.com/eleutherai/gpt-neo'>GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow</a></p>"
+examples = [
+    ['The tower is 324 metres (1,063 ft) tall,'],
+    ["The Moon's orbit around Earth has"],
+    ["The smooth Borealis basin in the Northern Hemisphere covers 40%"]
+]
+
+gr.Interface.load("huggingface/EleutherAI/gpt-neo-2.7B", inputs=gr.inputs.Textbox(lines=5, label="Input Text"), title=title, description=description, article=article, examples=examples).launch()
configs.py
ADDED
@@ -0,0 +1,47 @@
+import json
+from pathlib import Path
+from collections import defaultdict
+
+DATASETS = {}
+
+for path in Path("configs/dataset_configs").glob("*.json"):
+    dataset_id = path.stem
+    DATASETS[dataset_id] = json.loads(path.read_text())
+
+
+def fetch_model_params(model):
+    model_path = model if model.endswith(".json") else f"configs/{model}.json"
+    with open(model_path) as f:
+        params = json.load(f)
+
+    dataset_ids = []
+    for d in params.get("datasets"):
+        if isinstance(d, list):
+            dataset_ids.append(d[0])
+        else:
+            dataset_ids.append(d)
+    no_datasets = params.get("no_dataset", False)
+    assert no_datasets or len(dataset_ids) > 0, "You must specify at least one dataset id in the model config"
+
+    datasets = {}
+    last_dataset = None
+    for dataset_id in dataset_ids:
+        assert dataset_id in DATASETS, f"Dataset '{dataset_id}' was not found under dataset_configs/ folder. Please follow the example.json in that folder."
+        dataset = DATASETS[dataset_id]
+        assert params["n_vocab"] >= dataset["n_vocab"], f"The embedding table size '{params['n_vocab']}' must be greater or equal to the vocab size used to encode the dataset '{dataset_id}' ({dataset['n_vocab']})"
+        datasets[dataset_id] = dataset
+        last_dataset = dataset
+
+    if last_dataset is not None:
+        params["padding_id"] = last_dataset.get("padding_id", 0)
+        params["eos_id"] = last_dataset.get("eos_id", 1)
+
+    params["dataset_configs"] = datasets
+
+    # Set some other parameter defaults
+    params["mlm_training"] = params.get("mlm_training") == True
+    params["causal"] = not params["mlm_training"]
+
+    # Set all other parameter values to default to None
+    params = defaultdict(lambda: None, params)
+    return params
configs/dataset_configs/example.json
ADDED
@@ -0,0 +1,8 @@
+{
+    "n_vocab": 32768,
+    "path": "./tfrecords/openwebtext_*.tfrecords",
+    "eval_path": "",
+    "tokenizer_path": "./datasets/openwebtext/byte-level-bpe.tokenizer.json",
+    "eos_id": 1,
+    "padding_id": 0
+}
configs/dataset_configs/openwebtext2_new_inputs.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "n_vocab": 50257,
+    "path": "gs://neo-datasets/openwebtext2_new_inputs/train/*.tfrecords",
+    "eval_path": "gs://neo-datasets/openwebtext2_new_inputs/eval/*.tfrecords",
+    "tokenizer_is_pretrained": true,
+    "tokenizer_path": "gpt2",
+    "eos_id": 50256,
+    "padding_id": 50257
+}
configs/dataset_configs/pile.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "n_vocab": 50257,
+    "path": "gs://neo-datasets/pile/pile_*.tfrecords",
+    "eval_path": "gs://neo-datasets/pile_val.tfrecords",
+    "tokenizer_is_pretrained": true,
+    "tokenizer_path": "gpt2",
+    "eos_id": 50256,
+    "padding_id": 50257
+}
configs/gpt2_small.json
ADDED
@@ -0,0 +1,36 @@
+{
+    "n_head": 6,
+    "n_vocab": 50257,
+    "embed_dropout": 0.1,
+    "lr": 0.0006,
+    "lr_decay": "cosine",
+    "warmup_steps": 3000,
+    "beta1": 0.9,
+    "beta2": 0.95,
+    "epsilon": 1e-8,
+    "opt_name": "adam",
+    "weight_decay": 0,
+    "train_batch_size": 512,
+    "attn_dropout": 0.1,
+    "train_steps": 1000000,
+    "lr_decay_end": 300000,
+    "eval_steps": 30,
+    "predict_steps": 0,
+    "res_dropout": 0.1,
+    "eval_batch_size": 128,
+    "predict_batch_size": 8,
+    "iterations": 2500,
+    "n_embd": 768,
+    "datasets": ["openwebtext2_new_inputs"],
+    "model_path": "gs://neo-models/GPT2_SMALL",
+    "n_ctx": 1024,
+    "n_layer": 12,
+    "scale_by_depth": true,
+    "scale_by_in": false,
+    "attention_types": [[["global"], 12]],
+    "activation_function": "gelu",
+    "mesh_shape": "all:64",
+    "layout": "batch:all",
+    "recompute_grad": false,
+    "gradient_clipping": 1.0
+}
configs/gpt3_13B_256.json
ADDED
@@ -0,0 +1,40 @@
+{
+    "n_head": 40,
+    "n_vocab": 50257,
+    "embed_dropout": 0,
+    "lr": 0.0001,
+    "lr_decay": "cosine",
+    "warmup_steps": 3000,
+    "beta1": 0.9,
+    "beta2": 0.95,
+    "epsilon": 1e-8,
+    "ada_epsilon1": 1e-30,
+    "ada_epsilon2": 1e-3,
+    "opt_name": "adam",
+    "weight_decay": 0.10,
+    "train_batch_size": 1024,
+    "attn_dropout": 0,
+    "train_steps": 143075,
+    "eval_steps": 0,
+    "predict_steps": 1,
+    "res_dropout": 0,
+    "eval_batch_size": 128,
+    "predict_batch_size": 1,
+    "iterations": 500,
+    "n_embd": 5120,
+    "datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
+    "model_path": "gs://neo-models/GPT3_13B",
+    "n_ctx": 2048,
+    "n_layer": 40,
+    "scale_by_depth": true,
+    "scale_by_in": false,
+    "attention_types": [[["global", "local"], 20]],
+    "mesh_shape": "x:16,y:16",
+    "layout": "batch:x,embd:y,memory_length:y",
+    "activation_function": "gelu",
+    "recompute_grad": true,
+    "gradient_clipping": 1.0,
+    "tokens_per_mb_per_replica": 2048,
+    "precision": "bfloat16"
+}
+
configs/gpt3_13B_256_Pile.json
ADDED
@@ -0,0 +1,38 @@
+
+{
+    "n_head": 40,
+    "n_vocab": 50257,
+    "embed_dropout": 0,
+    "lr": 0.0001,
+    "lr_decay": "cosine",
+    "warmup_steps": 3000,
+    "beta1": 0.9,
+    "beta2": 0.95,
+    "epsilon": 1e-8,
+    "opt_name": "adam",
+    "weight_decay": 0.1,
+    "train_batch_size": 1024,
+    "attn_dropout": 0,
+    "train_steps": 286150,
+    "eval_steps": 10,
+    "predict_steps": 1,
+    "res_dropout": 0,
+    "eval_batch_size": 512,
+    "predict_batch_size": 1,
+    "iterations": 500,
+    "n_embd": 5120,
+    "datasets": [["pile", 25, "documents_random", 1.0]],
+    "model_path": "gs://neo-models/GPT3_13B_Pile",
+    "n_ctx": 2048,
+    "n_layer": 40,
+    "scale_by_depth": true,
+    "scale_by_in": false,
+    "attention_types": [[["global"], 40]],
+    "mesh_shape": "x:16,y:16",
+    "layout": "batch:x,memory_length:y,embd:y",
+    "activation_function": "gelu",
+    "recompute_grad": true,
+    "gradient_clipping": 1.0,
+    "tokens_per_mb_per_replica": 2048,
+    "precision": "bfloat16"
+}
configs/gpt3_2-7B_256.json
ADDED
@@ -0,0 +1,38 @@
+{
+    "n_head": 32,
+    "n_vocab": 50257,
+    "embed_dropout": 0,
+    "lr": 0.00016,
+    "lr_decay": "cosine",
+    "warmup_steps": 3000,
+    "beta1": 0.9,
+    "beta2": 0.95,
+    "epsilon": 1e-8,
+    "ada_epsilon1": 1e-30,
+    "ada_epsilon2": 1e-3,
+    "opt_name": "adam",
+    "weight_decay": 0.10,
+    "train_batch_size": 512,
+    "attn_dropout": 0,
+    "train_steps": 286150,
+    "eval_steps": 0,
+    "predict_steps": 1,
+    "res_dropout": 0,
+    "eval_batch_size": 128,
+    "predict_batch_size": 1,
+    "iterations": 500,
+    "n_embd": 2560,
+    "datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
+    "model_path": "gs://neo-models/GPT3_2-7B",
+    "n_ctx": 2048,
+    "n_layer": 32,
+    "scale_by_depth": true,
+    "scale_by_in": false,
+    "attention_types": [[["global"], 32]],
+    "mesh_shape": "x:128,y:2",
+    "layout": "embd:y,batch:x",
+    "activation_function": "gelu",
+    "recompute_grad": true,
+    "gradient_clipping": 1.0
+}
+
configs/gpt3_6-7B_256.json
ADDED
@@ -0,0 +1,36 @@
+{
+    "n_head": 32,
+    "n_vocab": 50257,
+    "embed_dropout": 0,
+    "lr": 0.00012,
+    "lr_decay": "cosine",
+    "warmup_steps": 3000,
+    "beta1": 0.9,
+    "beta2": 0.95,
+    "epsilon": 1e-8,
+    "opt_name": "adam",
+    "weight_decay": 0.10,
+    "train_batch_size": 1024,
+    "attn_dropout": 0,
+    "train_steps": 143075,
+    "eval_steps": 0,
+    "predict_steps": 1,
+    "res_dropout": 0,
+    "eval_batch_size": 128,
+    "predict_batch_size": 1,
+    "iterations": 500,
+    "n_embd": 4096,
+    "datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
+    "model_path": "gs://neo-models/GPT3_6-7B",
+    "n_ctx": 2048,
+    "n_layer": 32,
+    "scale_by_depth": true,
+    "scale_by_in": false,
+    "attention_types": [[["global"], 32]],
+    "mesh_shape": "x:128,y:2",
+    "layout": "embd:y,batch:x",
+    "activation_function": "gelu",
+    "recompute_grad": true,
+    "gradient_clipping": 1.0
+}
+
configs/gpt3_PAR_small_256.json
ADDED
@@ -0,0 +1,36 @@
+{
+    "n_head": 12,
+    "n_vocab": 50304,
+    "embed_dropout": 0,
+    "lr": 0.0006,
+    "lr_decay": "cosine",
+    "warmup_steps": 3000,
+    "beta1": 0.9,
+    "beta2": 0.95,
+    "epsilon": 1e-8,
+    "opt_name": "adam",
+    "weight_decay": 0.10,
+    "train_batch_size": 256,
+    "attn_dropout": 0,
+    "train_steps": 572300,
+    "eval_steps": 0,
+    "predict_steps": 1,
+    "res_dropout": 0,
+    "eval_batch_size": 64,
+    "predict_batch_size": 1,
+    "iterations": 1000,
+    "n_embd": 768,
+    "datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
+    "model_path": "gs://neo-models/GPT3_PAR_SMALL",
+    "n_ctx": 2048,
+    "n_layer": 19,
+    "scale_by_depth": true,
+    "scale_by_in": false,
+    "attention_types": [[["global", "none", "none"], 5], [["none"], 4]],
+    "mesh_shape": "x:64,y:4",
+    "layout": "batch:x,heads:y,vocab:y,intermediate_expanded:y",
+    "activation_function": "gelu",
+    "recompute_grad": false,
+    "gradient_clipping": 1.0
+}
+
configs/gpt3_XL_256_Pile.json
ADDED
@@ -0,0 +1,37 @@
+{
+    "n_head": 32,
+    "n_vocab": 50257,
+    "embed_dropout": 0,
+    "lr": 0.0002,
+    "lr_decay": "cosine",
+    "warmup_steps": 3000,
+    "beta1": 0.9,
+    "beta2": 0.95,
+    "epsilon": 1e-8,
+    "opt_name": "adam",
+    "weight_decay": 0.1,
+    "train_batch_size": 512,
+    "attn_dropout": 0,
+    "train_steps": 286150,
+    "eval_steps": 10,
+    "predict_steps": 1,
+    "res_dropout": 0,
+    "eval_batch_size": 512,
+    "predict_batch_size": 1,
+    "iterations": 500,
+    "n_embd": 2048,
+    "datasets": [["pile", 25, "documents_random", 1.0]],
+    "model_path": "gs://neo-models/GPT3_XL_Pile",
+    "n_ctx": 2048,
+    "n_layer": 24,
+    "scale_by_depth": true,
+    "scale_by_in": false,
+    "attention_types": [[["global"], 24]],
+    "mesh_shape": "x:128,y:2",
+    "layout": "batch:x,memory_length:y,embd:y",
+    "activation_function": "gelu",
+    "recompute_grad": true,
+    "gradient_clipping": 1.0,
+    "tokens_per_mb_per_replica": 2048,
+    "precision": "bfloat16"
+}
configs/gpt3_large_256.json
ADDED
@@ -0,0 +1,39 @@
+{
+    "n_head": 16,
+    "n_vocab": 50304,
+    "embed_dropout": 0,
+    "lr": 0.00025,
+    "lr_decay": "cosine",
+    "warmup_steps": 3000,
+    "beta1": 0.9,
+    "beta2": 0.95,
+    "epsilon": 1e-8,
+    "ada_epsilon1": 1e-30,
+    "ada_epsilon2": 1e-3,
+    "opt_name": "adam",
+    "weight_decay": 0.10,
+    "train_batch_size": 256,
+    "attn_dropout": 0,
+    "train_steps": 572300,
+    "eval_steps": 0,
+    "predict_steps": 1,
+    "res_dropout": 0,
+    "eval_batch_size": 64,
+    "predict_batch_size": 1,
+    "iterations": 2500,
+    "n_embd": 1536,
+    "datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
+    "model_path": "gs://neo-models/GPT3_LARGE",
+    "n_ctx": 2048,
+    "n_layer": 24,
+    "scale_by_depth": true,
+    "scale_by_in": false,
+    "attention_types": [[["global"], 24]],
+    "mesh_shape": "x:64,y:4",
+    "layout": "batch:x,vocab:y,heads:y",
+    "activation_function": "gelu",
+    "recompute_grad": true,
+    "gradient_clipping": 1.0,
+    "tokens_per_mb_per_replica": 2048
+}
+
configs/gpt3_medium_256.json
ADDED
@@ -0,0 +1,36 @@
+{
+    "n_head": 16,
+    "n_vocab": 50304,
+    "embed_dropout": 0,
+    "lr": 0.0003,
+    "lr_decay": "cosine",
+    "warmup_steps": 3000,
+    "beta1": 0.9,
+    "beta2": 0.95,
+    "epsilon": 1e-8,
+    "opt_name": "adam",
+    "weight_decay": 0.10,
+    "train_batch_size": 256,
+    "attn_dropout": 0,
+    "train_steps": 572300,
+    "eval_steps": 0,
+    "predict_steps": 1,
+    "res_dropout": 0,
+    "eval_batch_size": 64,
+    "predict_batch_size": 1,
+    "iterations": 2500,
+    "n_embd": 1024,
+    "datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
+    "model_path": "gs://neo-models/GPT3_MEDIUM",
+    "n_ctx": 2048,
+    "n_layer": 24,
+    "scale_by_depth": true,
+    "scale_by_in": false,
+    "attention_types": [[["global"], 24]],
+    "mesh_shape": "x:64,y:4",
+    "layout": "batch:x,heads:y,vocab:y",
+    "activation_function": "gelu",
+    "recompute_grad": false,
+    "gradient_clipping": 1.0
+}
+
configs/gpt3_small_256.json
ADDED
@@ -0,0 +1,36 @@
+{
+    "n_head": 12,
+    "n_vocab": 50304,
+    "embed_dropout": 0,
+    "lr": 0.0006,
+    "lr_decay": "cosine",
+    "warmup_steps": 3000,
+    "beta1": 0.9,
+    "beta2": 0.95,
+    "epsilon": 1e-8,
+    "opt_name": "adam",
+    "weight_decay": 0.10,
+    "train_batch_size": 256,
+    "attn_dropout": 0,
+    "train_steps": 572300,
+    "eval_steps": 0,
+    "predict_steps": 1,
+    "res_dropout": 0,
+    "eval_batch_size": 64,
+    "predict_batch_size": 1,
+    "iterations": 2500,
+    "n_embd": 768,
+    "datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
+    "model_path": "gs://neo-models/GPT3_SMALL",
+    "n_ctx": 2048,
+    "n_layer": 12,
+    "scale_by_depth": true,
+    "scale_by_in": false,
+    "attention_types": [[["global"], 12]],
+    "mesh_shape": "x:64,y:4",
+    "layout": "batch:x,heads:y,vocab:y,intermediate_expanded:y",
+    "activation_function": "gelu",
+    "recompute_grad": false,
+    "gradient_clipping": 1.0
+}
+
data/create_tfrecords.py
ADDED
@@ -0,0 +1,263 @@
+import argparse
+import os
+from pathlib import Path
+
+import ftfy
+import tensorflow as tf
+from lm_dataformat import Reader
+from tokenizers import Tokenizer
+from transformers import GPT2TokenizerFast
+from tqdm import tqdm
+import logging
+from multiprocessing import Pool, cpu_count
+from itertools import repeat
+import re
+
+logging.getLogger("transformers").setLevel(logging.ERROR)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input_dir", type=str, help="Path to where your files are located. Files ending in .zst are "
+                                                  "treated as archives, all others as raw text.")
+parser.add_argument("--files_per", type=int, default=100000, help="Text files per tfrecord")
+parser.add_argument("--name", type=str, default="openwebtext",
+                    help="Name of output files will be name_i.tfrecords where i is the number of the file")
+parser.add_argument("--output_dir", type=str, default="./tfrecords", help="Where to put tfrecords")
+parser.add_argument("--encoder_path", type=str,
+                    help="Path to encoder files, or leave unspecified to use GPT2 tokenizer")
+parser.add_argument("--minimum_size", type=int, default=100, help="Minimum size a document has to be to be included")
+parser.add_argument("--ftfy", action="store_false", help="normalize with ftfy")
+parser.add_argument("--wikitext-detokenize", action="store_false", help="use wikitext detokenizer")
+parser.add_argument("--separator", nargs="+", type=int, default=[50256],
+                    help="separator to place between files in chunk mode")
+parser.add_argument("--chunk_size", type=int, default=2048, help="How big a chunk should be in chunk mode. "
+                                                                 "Should equal your model's context size")
+parser.add_argument("--write_dataset_config", action="store_true", help="Write the dataset config file on completion")
+parser.add_argument("--processes", type=int, default=0, help="Number of processes to use. Defaults to cpu count.")
+
+args = parser.parse_args()
+if not args.output_dir.endswith("/"):
+    args.output_dir = args.output_dir + "/"
+if not args.input_dir.endswith("/"):
+    args.input_dir = args.input_dir + "/"
+assert len(args.separator) == 1
+
+
+def wikitext_detokenizer(string):
+    # contractions
+    string = string.replace("s '", "s'")
+    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
+    # number separators
+    string = string.replace(" @-@ ", "-")
+    string = string.replace(" @,@ ", ",")
+    string = string.replace(" @.@ ", ".")
+    # punctuation
+    string = string.replace(" : ", ": ")
+    string = string.replace(" ; ", "; ")
+    string = string.replace(" . ", ". ")
+    string = string.replace(" ! ", "! ")
+    string = string.replace(" ? ", "? ")
+    string = string.replace(" , ", ", ")
+    # double brackets
+    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
+    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
+    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
+    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
+    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
+    # miscellaneous
+    string = string.replace("= = = =", "====")
+    string = string.replace("= = =", "===")
+    string = string.replace("= =", "==")
+    string = string.replace(" " + chr(176) + " ", chr(176))
+    string = string.replace(" \n", "\n")
+    string = string.replace("\n ", "\n")
+    string = string.replace(" N ", " 1 ")
+    string = string.replace(" 's", "'s")
+
+    return string
+
+
+def _int64_feature(value):
+    """
+    Returns an int64_list from a bool / enum / int / uint.
+    """
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+
+
+def write_to_file(writer, data):
+    """
+    writes data to tfrecord file
+    """
+    feature = {
+        "text": _int64_feature(data)
+    }
+    tf_example = tf.train.Example(features=tf.train.Features(feature=feature))
+    writer.write(tf_example.SerializeToString())
+
+
+def get_tokenizer(args):
+    if args.encoder_path is None:
+        return GPT2TokenizerFast.from_pretrained('gpt2')
+    else:
+        return Tokenizer.from_file(args.encoder_path)
+
+
+def split_list(l, n):
+    # splits list/string into n size chunks
+    return [l[i:i + n] for i in range(0, len(l), n)]
+
+
+def archive_to_tokens(f, encoder, args):
+    # Generator that yields the contents of the files in an archive
+    # if data_to_prepend is not None, prepend data_to_prepend + a EOS separator to the encoded data
+    reader = Reader(f)
+    for doc in reader.stream_data(threaded=False):
+        if args.ftfy:  # fix text with ftfy if specified
+            doc = ftfy.fix_text(doc, normalization='NFKC')
+        if args.wikitext_detokenize:
+            doc = wikitext_detokenizer(doc)
+        doc = encoder.encode(doc) + args.separator  # read document from lmd and append separator token
+        yield split_list(doc, args.chunk_size)  # split into n_ctx + 1 size chunks
+
+
+def write_files(files, files_per, output_dir, out_name, start_no, write_remainder=False, process_no=None):
+    # writes a list of files to .tfrecords
+    if files == None:
+        return
+    chunks = split_list(files, files_per)
+
+    if len(chunks[-1]) != files_per and not write_remainder:  # pop the last file if it's length != files per
+        remainder = chunks.pop(-1)
+    else:
+        remainder = None  # assuming files = remainder from an old chunk here
+        files_per = len(chunks[-1])
+
+    for files in chunks:
+        fp = f"{output_dir}/{out_name}_{start_no}"
+        if process_no is not None:
+            fp += f"_{process_no}"
+        fp += f"_{files_per}"  # add number of files in tfrecord to end of fp
+        fp += ".tfrecords"
+        with tf.io.TFRecordWriter(fp) as writer:
+            for f in files:
+                write_to_file(writer, f)
+        start_no += 1
+    return start_no, remainder
+
+
+def get_files(input_dir, filetypes=None):
+    # gets all files of <filetypes> in input_dir
+    if filetypes == None:
+        filetypes = ["jsonl.zst", ".txt", ".xz", ".tar.gz"]
+    files = [list(Path(input_dir).glob(f"*{ft}")) for ft in filetypes]
+    return [str(item) for sublist in files for item in sublist]  # flatten list of list -> list and stringify Paths
+
+
+def read_checkpoint(checkpoint_path, resume_from_checkpoint=True):
+    # init checkpointing
+    if resume_from_checkpoint and os.path.isfile(checkpoint_path):
+        try:
+            resume_files_processed, tfrecord_count = [int(i) for i in open(checkpoint_path, "r").read().split(", ")]
+            print(f"\nResuming from tfrecord no. {tfrecord_count} / file no. {resume_files_processed}")
+            return resume_files_processed, tfrecord_count
+        except:
+            pass
+    return 0, 0
+
+
+def create_tfrecords(params, write_remainder=True, write_every_n_files=1, save_checkpoints=False,
+                     resume_from_checkpoint=False, display_pbar=False):
+    # iterates through files in input_dir, splitting into <args.chunk_size> chunks and saving a tfrecords file every <args.files_per> chunks.
+    files, args, process_no = params
+    enc = get_tokenizer(args)  # get tokenizer
+
+    # init metadata
+    discarded_files = 0
+    files_processed = 0
+    pbar = tqdm(desc=f"Writing TFRecord Files to {args.output_dir}. Parsed 0 input files. files_written ",
+                disable=not display_pbar)
+    checkpoint_path = f"{args.output_dir}/checkpoint.txt"
+    resume_files_processed, tfrecord_count = read_checkpoint(checkpoint_path, resume_from_checkpoint)
+
+    data_to_prepend = []
+    tokenized_files_array = []
+
+    for f in files:
+        for tokenized_files in archive_to_tokens(f, enc, args):
+            files_processed += 1
+            if files_processed < resume_files_processed:
+                continue  # resume from checkpoint
+
+            # if the last chunk < chunk size, but > minimum_size, take it and append it to the beginning of the next file
+            n_tokens = len(tokenized_files[-1])
+            if n_tokens < args.chunk_size:
+                data = tokenized_files.pop(-1)
+                if n_tokens >= args.minimum_size:
+                    data_to_prepend.extend(data)
+                else:
+                    discarded_files += 1
+
+            if len(data_to_prepend) >= args.chunk_size:
+                # if length of data_to_prepend becomes greater than chunk size, add concatted files to tokenized files
+                tokenized_files_array.append(data_to_prepend[:args.chunk_size])
+                data_to_prepend = data_to_prepend[args.chunk_size:]
+            # add tokenized files > chunk size to main array
+            tokenized_files_array.extend(tokenized_files)
+
+            if len(tokenized_files_array) >= args.files_per * write_every_n_files:  # write every n files
+                _tfrecord_count, remainder = write_files(tokenized_files_array, files_per=args.files_per,
+                                                         output_dir=args.output_dir, out_name=args.name,
+                                                         start_no=tfrecord_count, process_no=process_no)
+                pbar.update(_tfrecord_count - tfrecord_count)  # update progress bar
+                pbar.set_description(
+                    f"Writing TFRecord Files to {args.output_dir}. Parsed {files_processed} input files. files_written ")
+                tfrecord_count = _tfrecord_count
+                tokenized_files_array = remainder if remainder is not None else []  # add remaining files to next chunk
+                with open(checkpoint_path, "w") as checkpoint_file:
+                    checkpoint_file.write(f"{files_processed}, {tfrecord_count}")
+
+    if len(tokenized_files_array) >= args.files_per:  # also write at end
+        _tfrecord_count, remainder = write_files(tokenized_files_array, files_per=args.files_per,
+                                                 output_dir=args.output_dir, out_name=args.name,
+                                                 start_no=tfrecord_count, process_no=process_no)
+        pbar.update(_tfrecord_count - tfrecord_count)
+        pbar.set_description(
+            f"Writing TFRecord Files to {args.output_dir}. Parsed {files_processed} input files. files_written ")
+        tfrecord_count = _tfrecord_count
+        with open(checkpoint_path, "w") as checkpoint_file:
+            checkpoint_file.write(f"{files_processed}, {tfrecord_count}")
+    else:
+        remainder = tokenized_files_array  # add remaining to remainder
+
+    if write_remainder:
+        # write out the remaining files even if there's less than files_per
+        write_files(remainder, files_per=args.files_per, output_dir=args.output_dir, out_name=args.name,
+                    start_no=tfrecord_count, write_remainder=True)
+
+    successful_files = files_processed - discarded_files
+    return {"discarded": discarded_files, "processed": files_processed, "successful": successful_files}
+
+
+def create_tfrecords_mp(files, args):
+    files = split_list(files, len(files) // args.processes)
+    with Pool(processes=args.processes) as pool:
+        pbar = tqdm(pool.imap(create_tfrecords, zip(files, repeat(args), range(len(files)))))
+        meta = {"discarded": 0, "processed": 0, "successful": 0}
+        for results in pbar:
+            pbar.update()
+            for k, v in results.items():
+                meta[k] += v  # update metadata
+        return meta
+
+
+if __name__ == "__main__":
+    os.makedirs(args.output_dir, exist_ok=True)  # make output dir if it doesn't exist
+    files = get_files(args.input_dir)
+    args.chunk_size += 1  # we shift the data by 1 to the right for targets, so increment the chunk size here
+
+    if args.processes == 0:
+        args.processes = cpu_count()
+    if args.processes > 1:
+        results = create_tfrecords_mp(files, args)
+    else:
+        results = create_tfrecords((files, args, 0), display_pbar=True)
+    print(results)
data/encoders.py
ADDED
@@ -0,0 +1,28 @@
+from tokenizers import Tokenizer
+from transformers import GPT2Tokenizer, GPT2TokenizerFast
+
+def fetch_encoder(params):
+    no_dataset = params.get('no_dataset', False)
+    if no_dataset:
+        return None
+
+    dataset = next(iter(params['dataset_configs'].values()))  # Get the first value from the dict
+    path = dataset["tokenizer_path"]
+    is_pretrained = dataset.get("tokenizer_is_pretrained", False)
+
+    if is_pretrained:
+        tok = GPT2TokenizerFast.from_pretrained(path)
+
+        # Will add a padding token id of 50257 at run-time
+        tok.add_special_tokens({'pad_token': '<|padding|>'})
+        return tok
+
+    return Tokenizer.from_file(path)
+
+
+# GPT2Tokenizer and Tokenizer have different ways of fetching token ids
+def encode(encoder, text):
+    result = encoder.encode(text)
+    if isinstance(result, list):
+        return result
+    return result.ids
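Not part of the commit: a minimal usage sketch of fetch_encoder/encode, assuming a params dict shaped like the output of configs.fetch_model_params with a pretrained-GPT2 dataset config.

    from data.encoders import fetch_encoder, encode

    params = {
        "no_dataset": False,
        "dataset_configs": {
            "openwebtext2_new_inputs": {
                "tokenizer_is_pretrained": True,
                "tokenizer_path": "gpt2",
            }
        },
    }
    enc = fetch_encoder(params)          # GPT2TokenizerFast with an added <|padding|> token
    ids = encode(enc, "Hello GPT-Neo")   # plain list of token ids regardless of tokenizer type
    print(len(ids))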
data/train_tokenizer.py
ADDED
@@ -0,0 +1,73 @@
+import os
+import random
+import argparse
+import shutil
+from glob import glob
+from pathlib import Path
+
+from lm_dataformat import Reader
+from tokenizers import (Tokenizer, decoders, models, pre_tokenizers,
+                        processors, trainers)
+from tokenizers.normalizers import NFKC
+from tqdm import tqdm
+
+# parser
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--base_dir", type=str, help="Path to where your files are located. Files ending in .zst are treated as \
+                    archives, all others as raw text.")
+parser.add_argument("--output_dir", type=str, default="tokenizers", help="Where to put the tokenizer")
+parser.add_argument("--file_type", type=str, choices=["xz", "txt"], default="xz", help="Extension of file to parse")
+parser.add_argument("--vocab_size", type=int, help="Size of vocabulary", required = True)
+args = parser.parse_args()
+
+# main script
+
+data_path = Path(args.base_dir)
+archives = glob(str(data_path / f"*.{args.file_type}"))
+
+out_path = Path(args.output_dir)
+
+if os.path.exists(out_path):
+    shutil.rmtree(out_path)
+
+if not out_path.is_dir():
+    out_path.mkdir()
+
+for arch in tqdm(archives):
+    name = os.path.basename(arch).split(".")[0] + ".txt"
+    fp = out_path / name
+
+    if args.file_type == 'xz':
+        g = Reader(arch).stream_data()
+
+        with open(fp, "w") as f:
+            for s in g:
+                f.write(s)
+                f.write("\n\n")
+    elif args.file_type == 'txt':
+        shutil.copyfile(str(arch), str(fp))
+
+data_files = glob(str(out_path / "*.txt"))
+data_files = random.sample(data_files, int(0.2 * len(data_files)))
+
+assert len(data_files) > 0, 'No data files found'
+
+# Initialize a tokenizer
+tokenizer = Tokenizer(models.BPE())
+
+# Customize pre-tokenization and decoding
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
+tokenizer.decoder = decoders.ByteLevel()
+tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
+tokenizer.normalizer = NFKC()
+
+# And then train
+trainer = trainers.BpeTrainer(vocab_size=args.vocab_size, min_frequency=2, special_tokens=["<|endoftext|>", "<|padding|>"])
+tokenizer.train(trainer, data_files)
+
+# And Save it
+tokenizer_path = out_path / "byte-level-bpe.tokenizer.json"
+tokenizer.save(str(tokenizer_path), pretty=True)
+
+print(f'tokenizer saved at {str(tokenizer_path)}')
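Not part of the commit: a hypothetical follow-up that loads the tokenizer the script saved and round-trips a string. The path assumes the default --output_dir of "tokenizers".

    from tokenizers import Tokenizer

    tok = Tokenizer.from_file("tokenizers/byte-level-bpe.tokenizer.json")
    ids = tok.encode("Hello world").ids   # Encoding object -> list of token ids
    print(tok.decode(ids))                # should give back (roughly) "Hello world"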
docker-compose.yml
ADDED
@@ -0,0 +1,67 @@
+version: '3'
+services:
+
+  mongo:
+    image: mongo
+    ports:
+      - 127.0.0.1:27017:27017
+    environment:
+      MONGO_INITDB_ROOT_USERNAME: user
+      MONGO_INITDB_ROOT_PASSWORD: password
+      MONGO_INITDB_DATABASE: db
+    expose:
+      - 27017
+    networks:
+      - omniboard
+    volumes:
+      - ./data:/data/db
+
+  mongoClientTemp:
+    image: mongo:latest
+    container_name: mongoClientTemp
+    links:
+      - mongo:mongo
+    command: mongo --host mongo -u user -p password --eval "db.getSiblingDB('db').createUser({user:'readonly', pwd:'password', roles:[{role:'read',db:'db'}]});"
+    depends_on:
+      - mongo
+    networks:
+      - omniboard
+
+  omniboard_readonly:
+    #image: vivekratnavel/omniboard:latest
+    build: https://github.com/lucidrains/omniboard.git
+    command: ["--mu", "mongodb://readonly:password@mongo:27017/db"]
+    ports:
+      - 0.0.0.0:8081:9000
+    networks:
+      - omniboard
+    depends_on:
+      - mongo
+
+  omniboard:
+    #image: vivekratnavel/omniboard:latest
+    build: https://github.com/lucidrains/omniboard.git
+    command: ["--mu", "mongodb://user:password@mongo:27017/db?authSource=admin"]
+    expose:
+      - 9000
+    networks:
+      - omniboard
+    depends_on:
+      - mongo
+
+  nginx:
+    image: dhswt/nginx-basic-auth:1.3
+    environment:
+      - HTPASSWD=isaac: #put passwd here
+      - FORWARD_HOST=omniboard
+      - FORWARD_PORT=9000
+    networks:
+      - omniboard
+    depends_on:
+      - omniboard
+    ports:
+      - 0.0.0.0:8080:80
+    expose:
+      - 8080
+networks:
+  omniboard:
encoders.py
ADDED
@@ -0,0 +1,28 @@
+from tokenizers import Tokenizer
+from transformers import GPT2Tokenizer, GPT2TokenizerFast
+
+def fetch_encoder(params):
+    no_dataset = params.get('no_dataset', False)
+    if no_dataset:
+        return None
+
+    dataset = next(iter(params['dataset_configs'].values()))  # Get the first value from the dict
+    path = dataset["tokenizer_path"]
+    is_pretrained = dataset.get("tokenizer_is_pretrained", False)
+
+    if is_pretrained:
+        tok = GPT2TokenizerFast.from_pretrained(path)
+
+        # Will add a padding token id of 50257 at run-time
+        tok.add_special_tokens({'pad_token': '<|padding|>'})
+        return tok
+
+    return Tokenizer.from_file(path)
+
+
+# GPT2Tokenizer and Tokenizer have different ways of fetching token ids
+def encode(encoder, text, gpt=True):
+    result = encoder.encode(text)
+    if isinstance(result, list):
+        return result
+    return result.ids
export.py
ADDED
@@ -0,0 +1,14 @@
+import tensorflow.compat.v1 as tf
+
+def export_model(estimator, export_dir, params,
+                 checkpoint_path=None):
+
+
+    def serving_input_receiver_fn():
+        t = tf.placeholder(dtype=tf.int64,
+                           shape=[1, params["n_ctx"]],
+                           name='input_example_tensor')
+        return tf.estimator.export.ServingInputReceiver(t, t)
+
+    return estimator.export_saved_model(
+        export_dir, serving_input_receiver_fn, checkpoint_path=checkpoint_path)
gradio/demo.py
ADDED
@@ -0,0 +1,12 @@
+import gradio as gr
+
+title = "GPT-Neo Demo"
+description = "demo for GPT-Neo by EleutherAI for text generation. To use it, simply add your text, or click one of the examples to load them. Read more at the links below."
+article = "<p style='text-align: center'><a href='http://github.com/eleutherai/gpt-neo'>GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow</a></p>"
+examples = [
+    ['The tower is 324 metres (1,063 ft) tall,'],
+    ["The Moon's orbit around Earth has"],
+    ["The smooth Borealis basin in the Northern Hemisphere covers 40%"]
+]
+
+gr.Interface.load("huggingface/EleutherAI/gpt-neo-2.7B", inputs=gr.inputs.Textbox(lines=5, label="Input Text"), title=title, description=description, article=article, examples=examples).launch()
inputs.py
ADDED
@@ -0,0 +1,384 @@
+import numpy as np
+import tensorflow.compat.v1 as tf
+from functools import partial
+from data.encoders import encode
+import random
+import re
+import logging
+from itertools import cycle
+from utils import natural_sort
+
+
+### IN USE ###
+
+def _get_number_of_documents(filename):
+    # extracts number of files from a filename formatted "<name>_<num_documents>.tfrecords."
+    # if no pattern is matched, returns None
+    match = re.search("_(\d{1,}).tfrecords$", filename)
+    return int(match.group(1)) if match is not None else match
+
+
+def _get_number_of_documents_by_iteration(filename):
+    # extracts number of files from a tfrecord document in the event it doesn't have metadata in the filename
+    # this could be very slow.
+    logging.warning(
+        "inputs/sequential_input() found no metadata found in filename - iterating through first tfrecord to find global length")
+    count = 0
+    for item in tf.io.tf_record_iterator(filename):
+        count += 1
+    return count
+
+
+def _get_skip_index(all_files, n_batches):
+    prev_cumsum = 0
+    cumsum = 0
+    global_n_documents = None
+    for count, f in cycle(enumerate(all_files)):
+        prev_cumsum = cumsum
+        if _get_number_of_documents(f) is not None:
+            cumsum += _get_number_of_documents(f)
+        elif global_n_documents is None:
+            global_n_documents = _get_number_of_documents_by_iteration(f)
+            cumsum += global_n_documents
+        else:
+            cumsum += global_n_documents
+        if cumsum == n_batches:
+            remainder = 0
+            skip_idx = count + 1
+        elif cumsum > n_batches:
+            remainder = n_batches - prev_cumsum
+            skip_idx = count
+            break
+    return skip_idx, remainder
+
+
+def _parse_function(example_proto):
+    features = {
+        "text": tf.VarLenFeature(tf.int64)
+    }
+    parsed_features = tf.parse_single_example(example_proto, features)
+    return tf.sparse.to_dense(parsed_features["text"], parsed_features["text"].dense_shape[0])
+
+
+def autoregressive_sample_text(params, x):
+    vals1 = x[:params["n_ctx"]]
+    vals2 = x[1:params["n_ctx"] + 1]
+
+    vals1 = tf.reshape(vals1, [params["n_ctx"]])
+    vals2 = tf.reshape(vals2, [params["n_ctx"]])
+    vals1 = tf.cast(vals1, dtype=tf.int32)
+    vals2 = tf.cast(vals2, dtype=tf.int32)
+    return vals1, vals2
+
+
+def sequential_input(params, global_step=None, eval=False):
+    """
+    Input fn that reads tfrecords encoded with a fixed chunk size (== n_ctx + 1), and that either:
+
+    - has the number of documents for each tfrecord file encoded in the title in the format
+      <name>_<n_documents>.tfrecords.
+
+    OR
+
+    - has a fixed number of documents per tfrecord file.
+
+    If the glob pattern above isn't matched, we assume that each document has the same number of samples as the first tfrecord read.
+    If this isn't the case, it may result in errors, or some samples being missed.
+
+    This means we can calculate the number of samples we've seen so far using the global step,
+    and can use dataset.skip() to iterate through the list of filenames, as opposed to the whole dataset, which is incredibly inefficient.
+
+    If training is starting and stopping often, as with TPU pre-emption, reading the whole dataset sequentially appears to improve model
+    performance, as it results in less repeated data.
+    """
+    if not eval:
+        assert global_step is not None
+    logging.warning(
+        "Changing batch size with sequential_input() will result in some data being skipped or repeated. Please ensure your batch size stays constant throughout training.")
+    batch_size = params['eval_batch_size' if eval else 'train_batch_size']
+
+    filenames = []
+    for dataset_config in params['dataset_configs'].values():  # iterate through each dataset and read params
+        path_key = 'path' if not eval else 'eval_path'
+        path = dataset_config[path_key]
+        filenames.extend(
+            tf.io.gfile.glob(path))  # then glob all files that fit the pattern specified in dataset_configs
+
+    filenames = natural_sort(filenames)
+    shuffle_filenames = params.get("shuffle_input_filenames", True)
+    if shuffle_filenames:
+        seed = params.get('seed', 1)  # shuffle deterministically
+        random.seed(seed)
+        random.shuffle(filenames)
+
+    dataset = tf.data.Dataset.from_tensor_slices(filenames).repeat()  # repeat filenames to infinity
+
+    if not eval:
+        # skip forward first in the filenames list, then skip the remaining amount in the parsed tfrecords files
+        skip_idx, remainder = _get_skip_index(filenames, n_batches=global_step * params[
+            "train_batch_size"])  # TODO: fix for > 1 epoch
+        dataset = dataset.skip(skip_idx)  # skip to skip idx
+
+        # read tfrecord examples and skip remainder
+        dataset = dataset.apply(tf.data.TFRecordDataset)
+        dataset = dataset.skip(remainder)
+    else:
+        # shuffle filenames if in eval mode
+        dataset = dataset.shuffle(len(filenames))
+        dataset = dataset.apply(tf.data.TFRecordDataset)
+
+    # parse the tokenized data from the tfrecord files and shuffle
+    dataset = dataset.map(_parse_function, num_parallel_calls=1)
+    dataset = dataset.map(partial(autoregressive_sample_text, params), num_parallel_calls=1)
+
+    # batch data and repeat to infinity
+    dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(params["iterations"] * 2)
+    return dataset.repeat()
+
+
+def pred_input(params, logger, enc=None,
+               path_to_prompt=""):
+    unicorns = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
+               "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
+               "researchers was the fact that the unicorns spoke perfect English."
+
+    text = unicorns if path_to_prompt == "" else open(path_to_prompt, "r").read()
+    tokens = encode(enc, text)
+
+    if len(tokens) > params["n_ctx"]:
+        logger.info("The length of your input prompt is longer than the model's context length - truncating input.")
+        tokens = tokens[len(tokens) - params["n_ctx"]:]
+    if len(tokens) < params["n_ctx"]:
+        tokens = tf.pad(tokens, [[0, params["n_ctx"] - len(tokens)]], constant_values=params["padding_id"])
+    t = tf.broadcast_to(tokens, [params["batch_size"], params["n_ctx"]])
+    dataset = tf.data.Dataset.from_tensors(t)
+
+    def _dummy_labels(x):
+        return x, x
+
+    dataset = dataset.map(_dummy_labels)
+    return dataset
+
+
+def handle_pred_output(predictions, logger, enc, params, out_name="test"):
+    with tf.gfile.Open(f"{out_name}.txt", "w") as f:
+        for i, p in enumerate(predictions):
+            p = p["outputs"]
+
+            # remove eos + padding ids from output
+            idx = np.argmax(p == params['eos_id'])
+            if idx > 0:
+                p = p[:idx]
+            idx = np.argmax(p == params['padding_id'])
+            if idx > 0:
+                p = p[:idx]
+
+            text = enc.decode(p)
+            f.write("=" * 40 + " SAMPLE " + str(i) + " " + "=" * 40 + "\n")
+            f.write(text)
+            f.write("\n" + "=" * 80 + "\n")
+
+            logger.info("=" * 40 + " SAMPLE " + str(i) + " " + "=" * 40 + "\n")
+            logger.info(text)
+            logger.info("\n" + "=" * 80 + "\n")
+
+
+### DEPRECATED ###
+
+def generic_text(params, eval=False, sample_text_fn=None, **kwargs):
+    logging.warning("DEPRECATION WARNING: generic_text will be phased out in future versions.")
+    i = 0 if not eval else 1
+
+    weights = []
+    datasets = []
+
+    for dataset in params["datasets"]:
+        dataset_id, stitch, datatype, weight = dataset
+
+        assert dataset_id in params[
+            'dataset_configs'], f'Unknown dataset id {dataset_id} given. Please make sure your dataset ids contain that configuration'
+        dataset_config = params['dataset_configs'][dataset_id]
+
+        path_key = 'path' if not eval else 'eval_path'
+        path = dataset_config[path_key]
+
+        datasets.append(text_dataset(
+            tf.io.gfile.glob(path),
+            params,
+            stitch=stitch,
+            datatype=datatype,
+            batch=False,
+            sample_text_fn=sample_text_fn
+        ))
+
+        weights.append(weight)
+
+    batch_size = params['eval_batch_size' if eval else 'train_batch_size']
+
+    seed = params.get('seed', None)
+    dataset = tf.data.experimental.sample_from_datasets(datasets, weights=weights, seed=seed)
+    dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(params["iterations"] * 2)
+    return dataset
+
+
+def text_dataset(files, params, stitch, datatype, batch=True, sample_text_fn=None):
+    seed = params.get('seed', None)
+    deterministic = seed is not None
+    num_parallel_calls = 1 if deterministic else tf.data.experimental.AUTOTUNE
+
+    dataset = tf.data.Dataset.from_tensor_slices(files)
+
+    if deterministic:
+        dataset = dataset.interleave(tf.data.TFRecordDataset, cycle_length=4)
+    else:
+        dataset = dataset.apply(
+            tf.data.experimental.parallel_interleave(tf.data.TFRecordDataset, cycle_length=4, sloppy=False))
+
+    if "documents" in datatype:
+        def _parse_function(example_proto):
+            features = {
+                # "hash": tf.VarLenFeature(tf.string),
+                "text": tf.VarLenFeature(tf.int64)
+            }
+            parsed_features = tf.parse_single_example(example_proto, features)
+            return parsed_features["text"], parsed_features["text"].dense_shape[0]
+    else:
+        def _parse_function(example_proto):
+            features = {
+                "text": tf.VarLenFeature(tf.int64)
+            }
+            parsed_features = tf.parse_single_example(example_proto, features)
+            return parsed_features["text"]  # Assuming the text is not sparse
+
+    dataset = dataset.map(_parse_function, num_parallel_calls=1)
+
+    # Subsample method
+    if "documents" in datatype:
+        # Since samples can be less than the correct length, and TPUs don't like variable lengths, this function stitches together enough samples
+        # to have a text at least 1024 tokens long. For this to work the stitch parameter must be correctly tuned so that
+        # stitch * min(characters_in_text) >= amount
+        def _stitch_text(x, y):
+            x = tf.sparse.to_dense(x)
+
+            def _get_x(i):
+                return tf.gather(x[i], tf.range(y[i]))
+
+            out = _get_x(0)
+            eos_id = params['eos_id']
+
+            for i in range(1, stitch):
+                out = tf.concat([out, [eos_id], _get_x(i)], axis=0)  # text1<|endoftext|>text2
+
+            return out
+
+        # Hack-y way to stitch together multiple texts
+
+        dataset = dataset.shuffle(1000 * stitch, seed=seed).batch(stitch, drop_remainder=True).map(_stitch_text,
+                                                                                                   num_parallel_calls=num_parallel_calls)
+
+        # Sample 1024(+1) tokens from the stitched together text
+        is_random_documents = datatype == "documents_random"
+        if sample_text_fn is not None:
+            _sample_text = partial(sample_text_fn, random_documents=is_random_documents)
+        else:
+            _sample_text = autoregressive_sample_text_random_documents if is_random_documents else autoregressive_sample_text
+            _sample_text = partial(_sample_text, params)
+
+        dataset = dataset.map(_sample_text, num_parallel_calls=num_parallel_calls)
+
+    if batch:
+        dataset = dataset.batch(params["train_batch_size"], drop_remainder=True).prefetch(params["iterations"] * 2)
+
+    dataset = dataset.repeat()
+
+    return dataset
+
+
+def autoregressive_sample_text_random_documents(params, x):
+    seed = params.get('seed', None)
+    s = tf.size(x)
+    r = tf.random.uniform([], maxval=s - (params["n_ctx"] + 1), dtype=tf.dtypes.int32, seed=seed)
+    r1 = tf.range(r, r + params["n_ctx"])
+    r2 = tf.range(r + 1, (r + 1) + params["n_ctx"])
+    r1 = tf.reshape(r1, [params["n_ctx"]])  # Somehow, this makes the compiler happy
+    r2 = tf.reshape(r2, [params[
+
"n_ctx"]]) # TPUs want constant sized input, and these reshapes makes it recognize the shape of the input
|
306 |
+
vals1 = tf.gather(x, r1)
|
307 |
+
vals2 = tf.gather(x, r2)
|
308 |
+
|
309 |
+
vals1 = tf.reshape(vals1, [params["n_ctx"]])
|
310 |
+
vals2 = tf.reshape(vals2, [params["n_ctx"]])
|
311 |
+
vals1 = tf.cast(vals1, dtype=tf.int32)
|
312 |
+
vals2 = tf.cast(vals2, dtype=tf.int32)
|
313 |
+
return vals1, vals2
|
314 |
+
|
315 |
+
|
316 |
+
def mlm_sample_text(params, x, random_documents=False):
|
317 |
+
seed = params.get('seed', None)
|
318 |
+
ctx_len = params["n_ctx"]
|
319 |
+
assert 'mlm_mask_id' in params, 'the key `mlm_mask_id` must be set on your config to do masked language model training, specifying the id of the reserved mask token'
|
320 |
+
|
321 |
+
mask_id = params['mlm_mask_id']
|
322 |
+
cls_token_id = params.get('mlm_cls_token_id', None)
|
323 |
+
num_tokens = params.get('n_vocab', None)
|
324 |
+
|
325 |
+
mask_ignore_ids = set(params.get('mlm_mask_ignore_ids', []))
|
326 |
+
mask_ignore_ids.add(cls_token_id)
|
327 |
+
|
328 |
+
mask_prob = params.get('mlm_mask_prob', 0.15)
|
329 |
+
same_token_prob = params.get('mlm_same_token_prob', 0.10)
|
330 |
+
random_token_prob = params.get('mlm_random_token_prob', 0.)
|
331 |
+
|
332 |
+
seq_len = ctx_len if cls_token_id is None else (ctx_len - 1)
|
333 |
+
|
334 |
+
if random_documents:
|
335 |
+
s = tf.size(x)
|
336 |
+
r = tf.random.uniform([], maxval=(s - seq_len), dtype=tf.dtypes.int32, seed=seed)
|
337 |
+
r1 = tf.range(r, r + seq_len)
|
338 |
+
r1 = tf.reshape(r1, [seq_len])
|
339 |
+
features = tf.gather(x, r1)
|
340 |
+
else:
|
341 |
+
features = x[:seq_len]
|
342 |
+
|
343 |
+
# add cls token id if specified by `mlm_cls_token_id`
|
344 |
+
if cls_token_id is not None:
|
345 |
+
features = tf.pad(features, [[1, 0]], constant_values=cls_token_id)
|
346 |
+
|
347 |
+
features = tf.cast(features, dtype=tf.int32)
|
348 |
+
shape = features.shape
|
349 |
+
|
350 |
+
# determine which tokens are mask-able
|
351 |
+
can_mask = tf.not_equal(features, 0)
|
352 |
+
for ignore_id in mask_ignore_ids:
|
353 |
+
can_mask &= tf.not_equal(features, ignore_id)
|
354 |
+
|
355 |
+
# generate boolean mask for masking ids
|
356 |
+
mask_mask = tf.less(tf.random.uniform(shape, minval=0., maxval=1., dtype=tf.float32, seed=seed), mask_prob)
|
357 |
+
mask_mask &= can_mask
|
358 |
+
|
359 |
+
# generate mask for actually replacing the tokens, for allowing a small number of tokens to stay the same
|
360 |
+
replace_mask = tf.less(tf.random.uniform(shape, minval=0., maxval=1., dtype=tf.float32, seed=seed),
|
361 |
+
1 - same_token_prob)
|
362 |
+
|
363 |
+
# randomly replace some tokens with random tokens before masking
|
364 |
+
if random_token_prob > 0:
|
365 |
+
random_token_mask = tf.less(tf.random.uniform(shape, minval=0., maxval=1., dtype=tf.float32, seed=seed),
|
366 |
+
random_token_prob)
|
367 |
+
random_tokens = tf.random.uniform(shape, minval=1, maxval=num_tokens, dtype=tf.dtypes.int32, seed=seed)
|
368 |
+
|
369 |
+
# make sure random tokens do not include illegal token ids specified by `mlm_mask_ignore_ids`
|
370 |
+
random_can_mask = tf.not_equal(random_tokens, 0)
|
371 |
+
for ignore_id in mask_ignore_ids:
|
372 |
+
random_can_mask &= tf.not_equal(random_tokens, ignore_id)
|
373 |
+
|
374 |
+
features = tf.where(random_token_mask & random_can_mask, random_tokens, features)
|
375 |
+
|
376 |
+
# mask the tokens
|
377 |
+
mask_tokens = tf.ones(shape, dtype=tf.int32) * mask_id
|
378 |
+
masked_features = tf.where(mask_mask & replace_mask, mask_tokens, features)
|
379 |
+
|
380 |
+
# labels will be set to 0 for all non-masked tokens
|
381 |
+
labels = tf.where(mask_mask, tf.zeros(shape, dtype=tf.int32), features)
|
382 |
+
|
383 |
+
masked_features, labels = map(lambda t: tf.reshape(t, [ctx_len]), (masked_features, labels))
|
384 |
+
return masked_features, labels
|
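As a rough, framework-free illustration of the masking recipe implemented by mlm_sample_text above, the following NumPy sketch applies the same mask_prob / same_token_prob logic to a toy sequence (all ids and probabilities here are made-up illustrative values, not taken from any config in this repo):

import numpy as np

rng = np.random.default_rng(0)
features = np.array([5, 17, 902, 3, 44, 120, 0, 0], dtype=np.int32)  # toy token ids, 0 = padding
mask_id, mask_prob, same_token_prob = 50256, 0.15, 0.10              # hypothetical values

can_mask = features != 0                                             # never mask padding
mask_mask = (rng.random(features.shape) < mask_prob) & can_mask      # positions chosen for masking
replace_mask = rng.random(features.shape) < (1 - same_token_prob)    # of those, which get the mask token

masked_features = np.where(mask_mask & replace_mask, mask_id, features)
labels = np.where(mask_mask, 0, features)                            # mirrors the tf.where call above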
main.py
ADDED
@@ -0,0 +1,257 @@
"""GPT-like model in Mesh-Tensorflow"""

from functools import partial
import mesh_tensorflow as mtf
import tensorflow.compat.v1 as tf
from tensorflow.python.tpu import tpu_config, tpu_estimator
from tensorflow_estimator.python.estimator import estimator as estimator_lib
from utils import save_config, expand_attention_types_params, yes_or_no, remove_gs_or_filepath, setup_logging, \
    check_dataset
from inputs import sequential_input, pred_input, handle_pred_output, mlm_sample_text, generic_text
from export import export_model
from model_fns import model_fn
from data.encoders import fetch_encoder
from configs import fetch_model_params
from tasks import task_descriptors
import argparse
import json
import numpy


def parse_args():
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--tpu", type=str, help="Name of TPU to train on, if any.")
    parser.add_argument("--gpu_ids", nargs="+", type=str, default=["device:GPU:0"],
                        help="If training on GPU, can specify your GPU names in a list - i.e 'device:GPU:0 device:GPU:1'")
    parser.add_argument("--model", type=str, default=None, help="JSON file that contains model parameters.")
    parser.add_argument("--steps_per_checkpoint", type=int, default=5000, help="Save a model checkpoint every X steps.")
    parser.add_argument("--auto_layout", action="store_true", help="If set, generates and prints the most memory "
                                                                   "efficient layout according to MTF auto layout.")
    parser.add_argument("--auto_layout_and_mesh_shape", action="store_true",
                        help="If set, generates and prints the most memory efficient layout and mesh shape according to"
                             " MTF auto layout.")
    parser.add_argument("--new", action="store_true", help="If set, deletes previous checkpoint, if it exists, and "
                                                           "starts a new training run")
    parser.add_argument("--predict", action="store_true", help="If set, uses the model to predict rather than train.")
    parser.add_argument("--eval", action="store_true", help="If set, run model in evaluation mode.")
    parser.add_argument("--prompt", type=str, help="path to .txt file containing a prompt for prediction. If empty, "
                                                   "defaults to unicorns.",
                        default="")
    parser.add_argument("--check_dataset", action="store_true",
                        help="If set, outputs sample from the dataset and quits.")
    parser.add_argument("--sacred_id", type=str, default="nosacred", help="Sacred run id.")
    parser.add_argument("--entmax_sampling", action="store_true", help="(experimental) use entmax sampling")
    parser.add_argument("--export", action="store_true", help="If set, will export the model.")
    args = parser.parse_args()
    assert args.model is not None, "Model must be set"
    return args


def main(args):
    # Setup logging
    logger = setup_logging(args)

    # Read params of model
    params = fetch_model_params(args.model)

    # Fetch appropriate input functions
    input_fn = params.get("input_fn", "sequential_input")
    if input_fn == "sequential_input":
        input_fn = sequential_input
    elif input_fn == "generic_text":
        input_fn = generic_text
    pred_input_fn = pred_input
    handle_pred_output_fn = handle_pred_output

    # get current step
    current_step = int(estimator_lib._load_global_step_from_checkpoint_dir(params["model_path"]))
    logger.info(f"Current step {current_step}")

    if params["mlm_training"]:
        mlm_sample_text_fn = partial(mlm_sample_text, params)
        input_fn = partial(generic_text, sample_text_fn=mlm_sample_text_fn)
        if args.check_dataset:
            check_dataset(input_fn, params)

    # Fetch encoder per params
    encoder = fetch_encoder(params)

    pred_input_fn = partial(pred_input_fn, path_to_prompt=args.prompt, logger=logger, enc=encoder)

    # Sample from Dataset if check dataset flag is on
    if args.check_dataset:
        check_dataset(input_fn, params, global_step=current_step)

    # Confirm deletion of checkpoint files if --new flag is set
    if args.new:
        if yes_or_no(f"Are you sure you want to remove '{params['model_path']}' to start afresh?"):
            remove_gs_or_filepath(params["model_path"])
        else:
            exit()

    # Save config to logdir for experiment management
    save_config(params, params["model_path"])

    # Add to params: auto_layout, auto_layout_and_mesh_shape, use_tpu, num_cores
    mesh_shape = mtf.convert_to_shape(params["mesh_shape"])
    params["num_cores"] = mesh_shape.size
    params["auto_layout"] = args.auto_layout
    params["auto_layout_and_mesh_shape"] = args.auto_layout_and_mesh_shape
    params["use_tpu"] = True if not args.tpu is None else False
    params["gpu_ids"] = args.gpu_ids
    params["steps_per_checkpoint"] = args.steps_per_checkpoint
    # Expand attention types param
    params["attention_types"] = expand_attention_types_params(params["attention_types"])
    assert len(params["attention_types"]) == params["n_layer"]  # Assert that the length of expanded list = num layers
    params["predict_batch_size"] = params.get("predict_batch_size", 1)  # Default to 1
    params["predict"] = args.predict
    params['model'] = params.get("model", "GPT")  # Default model selection to GPT since it's the only option for now
    params["export"] = args.export
    # Set sampling parameters
    params["sampling_use_entmax"] = args.entmax_sampling

    # Sample quality of MoE models suffers when using the faster sampling method, so default to slow_sampling if
    # moe layers are present
    params["slow_sampling"] = True if params["moe_layers"] is not None else False

    logger.info(f"params = {params}")

    # Get eval tasks from params
    eval_tasks = params.get("eval_tasks", [])
    has_predict_or_eval_steps_or_eval_tasks = params["predict_steps"] > 0 or params["eval_steps"] > 0 or len(
        eval_tasks) > 0

    for t in eval_tasks:
        assert t in task_descriptors, f"Eval task '{t}' is not known"
        task_descriptors[t]["init_fn"](params)

    # Set up TPUs and Estimator
    if args.tpu == "colab":
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver() if params["use_tpu"] else None
    else:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(args.tpu) if params["use_tpu"] else None

    config = tpu_config.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=params["model_path"],
        save_checkpoints_steps=None,  # Disable the default saver
        save_checkpoints_secs=None,  # Disable the default saver
        log_step_count_steps=params["iterations"],
        save_summary_steps=params["iterations"],
        tpu_config=tpu_config.TPUConfig(
            num_shards=mesh_shape.size,
            iterations_per_loop=params["iterations"],
            num_cores_per_replica=1,
            per_host_input_for_training=tpu_config.InputPipelineConfig.BROADCAST))

    estimator = tpu_estimator.TPUEstimator(
        use_tpu=params["use_tpu"],
        model_fn=model_fn,
        config=config,
        train_batch_size=params["train_batch_size"],
        eval_batch_size=params["train_batch_size"],
        predict_batch_size=params["predict_batch_size"],
        params=params)

    def _make_task_estimator(task):
        task_params = params.copy()
        task_params["eval_task"] = task
        return tpu_estimator.TPUEstimator(
            use_tpu=params["use_tpu"],
            model_fn=model_fn,
            config=config,
            train_batch_size=params["train_batch_size"],
            eval_batch_size=params["eval_batch_size"],
            predict_batch_size=params["predict_batch_size"],
            params=task_params)

    eval_task_estimators = {
        task: _make_task_estimator(task)
        for task in eval_tasks
    }

    if args.export:
        export_model(estimator, "export", params)
        return

    if args.predict:
        # Predict
        predictions = estimator.predict(input_fn=pred_input_fn)
        logger.info("Predictions generated")
        enc = fetch_encoder(params)
        handle_pred_output_fn(predictions, logger, enc, params, out_name=f"predictions_{args.sacred_id}_{current_step}")
        return

    def save_eval_results(task, eval_results):
        def as_python(x):
            if isinstance(x, numpy.generic):
                return x.item()
            return x
        eval_results = {k: as_python(v) for k, v in eval_results.items()}
        with open(f'eval_{args.sacred_id}.jsonl', 'a') as fh:
            json.dump({'task': task, 'current_step': current_step, **eval_results}, fh)
            fh.write('\n')

    def run_eval():
        logger.info("Running evaluation...")
        eval_results = estimator.evaluate(
            input_fn=partial(input_fn, eval=True),
            steps=params["eval_steps"])
        logger.info(f"Eval results: {eval_results}")
        save_eval_results('validation', eval_results)

    def run_eval_tasks():
        for task in eval_tasks:
            logger.info(f"Starting evaluation task '{task}'")
            task_info = task_descriptors[task]["get_task_info_fn"](params)
            task_estimator = eval_task_estimators[task]
            task_input_fn = task_descriptors[task]["input_fn"]
            eval_results = task_estimator.evaluate(
                input_fn=task_input_fn,
                steps=task_info["n_steps"],
                name=task)
            logger.info(f"Eval task '{task}' results: {eval_results}")
            save_eval_results(task, eval_results)

    if args.eval:
        run_eval_tasks()
        if params["eval_steps"] > 0:
            run_eval()
        return

    elif has_predict_or_eval_steps_or_eval_tasks:
        # Eval and train - stop and predict and/or eval every checkpoint
        while current_step < params["train_steps"]:
            next_checkpoint = min(current_step + args.steps_per_checkpoint,
                                  params["train_steps"])

            estimator.train(input_fn=partial(input_fn, global_step=current_step, eval=False), max_steps=next_checkpoint)
            current_step = next_checkpoint

            if params["predict_steps"] > 0:
                logger.info("Running prediction...")
                predictions = estimator.predict(input_fn=pred_input_fn)
                enc = fetch_encoder(params)
                handle_pred_output_fn(predictions, logger, enc, params, out_name=f"predictions_{args.sacred_id}_{current_step}")

            if params["eval_steps"] > 0:
                run_eval()

            if eval_tasks:
                run_eval_tasks()

        return
    else:
        # Else, just train
        while current_step < params["train_steps"]:
            # Else, don't stop and restart
            estimator.train(input_fn=partial(input_fn, global_step=current_step, eval=False), max_steps=params["train_steps"])


if __name__ == "__main__":
    tf.disable_v2_behavior()
    args = parse_args()
    main(args)
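main() above pulls most of its settings out of the JSON file passed with --model (via fetch_model_params). Purely as a sketch of the kind of dictionary it expects, built only from keys the code above accesses and with placeholder values rather than recommended settings:

params = {
    "model_path": "gs://your-bucket/your-run",   # placeholder, not a real path
    "mesh_shape": "x:4,y:2",
    "layout": "batch:x",
    "n_ctx": 1024,
    "n_layer": 12,
    "attention_types": [[["global"], 12]],
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "train_steps": 10000,
    "eval_steps": 0,
    "predict_steps": 0,
    "iterations": 500,
    "mlm_training": False,
    "moe_layers": None,
}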
model_fns.py
ADDED
@@ -0,0 +1,305 @@
import mesh_tensorflow as mtf
import tensorflow.compat.v1 as tf
from tensorflow.python.tpu import tpu_estimator
import mesh_tensorflow.transformer as mtf_transformer
from optimizers import get_optimizer
from utils import (create_host_call, get_graph_info, remove_batch_from_layout, simd_mesh_setup, add_mode_to_params,
                   get_batch_size, auto_layout, auto_layout_and_mesh_shape)
from models.utils import biasmask_attn_weights
from tensorflow.python.ops import resources
from sample import sample_autoregressive
from models.gpt2 import gpt2
import math


def model_fn(features, labels, mode, params):
    # Get global step
    global_step = tf.train.get_global_step()

    # Construct mtf graph + mesh from params
    graph = mtf.Graph()
    mesh_shape = mtf.convert_to_shape(params["mesh_shape"])
    layout_rules = mtf.convert_to_layout_rules(params["layout"])

    # Mesh setup
    if params["use_tpu"]:
        var_placer, mesh_impl = simd_mesh_setup(params, mesh_shape, layout_rules)
    else:
        var_placer = None
        gpu_ids = params["gpu_ids"]
        mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
            mesh_shape, layout_rules, gpu_ids)

    # Trainable variable precision
    # Store to checkpoints in master type, train in slice type, compute in activation type
    if params["precision"] == "bfloat16":
        variable_dtype = mtf.VariableDType(master_dtype=tf.bfloat16, slice_dtype=tf.float32,
                                           activation_dtype=tf.bfloat16)
    else:
        variable_dtype = mtf.VariableDType(master_dtype=tf.float32, slice_dtype=tf.float32, activation_dtype=tf.float32)

    # Build mtf mesh object
    mesh = mtf.Mesh(graph, "my_mesh", var_placer)

    # Build mtf_features & seq length dict for getting number of microbatches
    # We need to pack inputs into a dict to pass into serialize_training_step
    features_dict = {"inputs": features, "labels": labels}
    sequence_length_dict = {"inputs": params["n_ctx"], "labels": params["n_ctx"]}

    params = add_mode_to_params(params, mode)
    batch_size = get_batch_size(params)

    batch_dim = mtf.Dimension("batch", batch_size)
    batch_dims = [batch_dim]
    feature_length = sequence_length_dict["inputs"]
    length_dim = mtf.Dimension("sequence", feature_length)

    mtf_features = {}
    for key, x in features_dict.items():
        if x is not None:
            feature_shape = mtf.Shape(batch_dims + [length_dim])
            if type(features_dict[key]) == dict:
                features_dict[key] = features_dict[key]["feature"]
            x = tf.cast(features_dict[key], tf.int32)
            x = tf.reshape(x, feature_shape.to_integer_list)
            mtf_features[key] = mtf.import_fully_replicated(
                mesh, x, feature_shape, name=key)

    # Instantiate dict for dimensions, bias, etc that can be calculated here once then passed into model
    other_features = {}
    memory_length_dim = mtf.Dimension("memory_length", length_dim.size)

    attn_bias = biasmask_attn_weights(mesh, length_dim, memory_length_dim, variable_dtype) if params["causal"] else None

    # Add attn_bias into mtf_features
    other_features["attn_bias"] = attn_bias

    # Define other Dimensions that we'll need inside the model
    embd_dim = mtf.Dimension("embd", params["n_embd"])
    vocab_dim = mtf.Dimension("vocab", params["n_vocab"])
    # We need this because gathering when both the args have the same dimension in them breaks things
    # This dim is specifically for the weights
    # This prevents the "Einsum has lhs dimension without corresponding rhs or output dimension." error
    embed_sequence_dim = mtf.Dimension("embed_sequence", params["n_ctx"])

    other_features["embd_dim"] = embd_dim
    other_features["vocab_dim"] = vocab_dim
    other_features["embed_sequence_dim"] = embed_sequence_dim
    other_features["memory_length_dim"] = memory_length_dim

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Set up the model for prediction
        inputs = mtf_features["inputs"]
        if params["remove_partial_sequences"] is None:
            params["remove_partial_sequences"] = False

        export = params.get("export", False)

        if not export:
            mtf_samples = sample_autoregressive(
                inputs, other_features=other_features, params=params, variable_dtype=variable_dtype,
                remove_partial_sequences=params["remove_partial_sequences"], stop_at_token=params["eos_id"],
                sampling_use_entmax=params['sampling_use_entmax'], max_steps=params["predict_max_steps"])

        else:
            with mtf.utils.outside_all_rewrites():
                with tf.variable_scope('gpt2'):
                    mtf_samples, loss, loss_batch = gpt2.model(mtf_features, other_features, params, mesh,
                                                               variable_dtype=variable_dtype, context=None)

        mtf_samples = mtf.anonymize(mtf_samples)
        inputs = mtf.anonymize(inputs)
        lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=True)
        inputs = lowering.export_to_tf_tensor(inputs)
        outputs = lowering.export_to_tf_tensor(mtf_samples)
        predictions = {
            "inputs": inputs,
            "outputs": outputs}

        def scaffold_fn():
            return tf.train.Scaffold(
                local_init_op=tf.group(
                    tf.train.Scaffold.default_local_init_op(),
                    lowering.copy_masters_to_slices(),
                    name="mtf_local_init_op"),
                ready_op=tf.concat(
                    [tf.report_uninitialized_variables(),
                     resources.report_uninitialized_resources()],
                    axis=0,
                    name="mtf_ready_op"))

        return tpu_estimator.TPUEstimatorSpec(
            mode=tf.estimator.ModeKeys.PREDICT,
            predictions=predictions,
            scaffold_fn=scaffold_fn,
            prediction_hooks=[mtf.MtfRestoreHook(lowering)])

    # We're not predicting, so we better be training or evaluating
    assert (mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Gets number of microbatches per batch for serialized training
        # if param tokens_per_mb_per_replica = None, this defaults to 1 and no microbatching is performed
        num_microbatches = int(mtf_transformer.utils.serialize_num_microbatches(
            batch_dim=batch_dim,
            sequence_length=sequence_length_dict,
            mesh_shape=mesh_shape,
            layout_rules=layout_rules,
            tokens_per_microbatch_per_replica=params["tokens_per_mb_per_replica"]))
    else:
        num_microbatches = 1

    params["num_microbatches"] = num_microbatches  # Add num microbatches to params

    if num_microbatches > 1:

        # For serialize_training_step we need to modify the model to output results in a dict
        def serialized_fn(mtf_features):
            if params["model"] == "GPT":
                with tf.variable_scope('gpt2'):
                    logits, loss, loss_batch = gpt2.model(mtf_features, other_features, params, mesh,
                                                          variable_dtype=variable_dtype)
                return {"logits": logits, "loss": loss, "loss_batch": loss_batch}
            else:
                raise Exception(f"'{params['model']}' is not a valid model - please select from [GPT]")

        # Serialize the training step - Gradients are accumulated locally and reduced once.
        var_grads, output_dict = mtf.serialize_training_step(mtf_features, serialized_fn, batch_dim, num_microbatches)
        loss = output_dict["loss"]
        loss_batch = output_dict["loss_batch"]
        logits = output_dict["logits"]
    else:
        # If we're not splitting into microbatches, return logits & loss as is
        if params["model"] == "GPT":
            with mtf.utils.outside_all_rewrites():
                with tf.variable_scope('gpt2'):
                    logits, loss, loss_batch = gpt2.model(mtf_features, other_features, params, mesh,
                                                          variable_dtype=variable_dtype, context=None)
        else:
            raise Exception(f"'{params['model']}' is not a valid model - please select from [GPT]")

    # Auto layout generation
    if params["auto_layout"]:
        auto_layout(graph, mesh_shape, logits, loss)
    if params["auto_layout_and_mesh_shape"]:
        auto_layout_and_mesh_shape(graph, params["num_cores"], logits, loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # In TRAIN mode, get optimizer
        if params["num_microbatches"] > 1:
            # If we are splitting the batch into microbatches, var grads are created in the serialize_training_step fn
            # So we pass them in here
            _, update_ops, var_grads = get_optimizer(mesh, loss, params, variable_dtype=variable_dtype,
                                                     inp_var_grads=var_grads)
        else:
            # Otherwise, they are created in the get_optimizer fn, so we leave inp_var_grads blank
            _, update_ops, var_grads = get_optimizer(mesh, loss, params, variable_dtype=variable_dtype)
        # Log summaries to tensorboard
        mtf.scalar_summary("loss", loss)
        # Log gradients if in params
        if params["log_grads"] not in [None, False]:
            for g in var_grads:
                grad_norm = mtf.sqrt(mtf.reduce_sum(mtf.square(g)))
                mtf.scalar_summary("grads/norm" + g.name[:-2], grad_norm)
    else:
        # For now, we can only export fully-replicated tensors.
        # This has to be done before lowering or they will not be included in the graph
        mean_logits = mtf.reduce_mean(logits, reduced_dim=vocab_dim)
        max_logits = mtf.argmax(logits, vocab_dim)
        del logits
        fully_replicated_mean_logits = mtf.anonymize(mean_logits)
        fully_replicated_max_logits = mtf.anonymize(max_logits)
        fully_replicated_loss_batch = mtf.anonymize(loss_batch)

    # Gets & prints info about no. trainable vars in the model & dimension names
    get_graph_info(graph)

    # 'lowers' mtf tensors into a tf graph - this enables us to export results as tf tensors
    lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=True)
    tf_loss = lowering.export_to_tf_tensor(loss)
    tf_loss = tf.cast(tf_loss, tf.float32)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Use our patched version until mtf updates theirs
        host_call = create_host_call(params['model_path'])
        mtf.utils.remove_summaries()

        # Creates train_op
        tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
        tf_update_ops.append(tf.assign_add(global_step, 1))  # Need to manually increment global_step
        tf.logging.info(f"tf_update_ops: {tf_update_ops}")
        train_op = tf.group(tf_update_ops)
    else:
        tf_mean_logits = lowering.export_to_tf_tensor(fully_replicated_mean_logits)
        tf_max_logits = lowering.export_to_tf_tensor(fully_replicated_max_logits)
        tf_loss_batch = tf.to_float(lowering.export_to_tf_tensor(fully_replicated_loss_batch))

    with mtf.utils.outside_all_rewrites():
        # Copy master variables to slices. Must be called first.
        restore_hook = mtf.MtfRestoreHook(lowering)
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Set up the checkpoint server and return the TPUEstimatorSpec
            saver = tf.train.Saver(
                tf.global_variables(),
                sharded=True,
                max_to_keep=10,
                keep_checkpoint_every_n_hours=2,
                defer_build=False,
                save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            saver_listener = mtf.MtfCheckpointSaverListener(lowering)
            saver_hook = tf.train.CheckpointSaverHook(
                params["model_path"],
                save_steps=params["steps_per_checkpoint"],
                saver=saver,
                listeners=[saver_listener])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.TRAIN,
                loss=tf_loss,
                host_call=host_call,
                train_op=train_op,
                training_hooks=[restore_hook, saver_hook])

        elif mode == tf.estimator.ModeKeys.EVAL:
            # Evaluation metrics
            def _perplexity(loss):
                perplexity = tf.exp(loss)
                return tf.metrics.mean(perplexity)

            def _bits_per_byte(loss):
                bpb = loss * (0.29335 / math.log(2))
                return tf.metrics.mean(bpb)

            def _metric_fn(tf_mean_logits, tf_loss_batch):
                mean_logits = tf.metrics.mean(tf_mean_logits)
                loss = tf.reduce_mean(tf_loss_batch)
                perp = _perplexity(loss)
                bpb = _bits_per_byte(loss)
                return {"mean_logits": mean_logits, "perplexity": perp, "bits per byte": bpb}

            def _lambada_metric_fn(labels, tf_max_logits, tf_loss_batch):
                eos_token = params["eos_id"]
                answer_positions = tf.where(tf.math.not_equal(labels, eos_token))

                correct_answers = tf.gather_nd(tf.math.equal(tf_max_logits, labels), answer_positions)
                accuracy = tf.metrics.mean(tf.cast(correct_answers, tf.float32))

                # I guess tf_loss_batch has z_loss and maybe other stuff added to it
                # so maybe this should be calculated separately in the future
                answer_loss = tf.gather_nd(tf_loss_batch, answer_positions)
                log_perplexity = tf.metrics.mean(answer_loss)

                return {"lambada_acc": accuracy, "lambada_log_ppl": log_perplexity}

            eval_task = params["eval_task"]
            if eval_task == "lambada":
                eval_metrics = (_lambada_metric_fn, [labels, tf_max_logits, tf_loss_batch])
            else:
                eval_metrics = (_metric_fn, [tf_mean_logits, tf_loss_batch])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.EVAL,
                evaluation_hooks=[restore_hook],
                loss=tf_loss,
                eval_metrics=eval_metrics)
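The _bits_per_byte metric defined above converts the per-token cross-entropy (in nats) into bits per byte via the fixed 0.29335 factor. A quick arithmetic check of that conversion, using a made-up loss value:

import math

loss_nats_per_token = 2.5                                   # hypothetical eval loss
bits_per_byte = loss_nats_per_token * (0.29335 / math.log(2))
print(round(bits_per_byte, 3))                              # ~1.058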
models/activations.py
ADDED
@@ -0,0 +1,95 @@
import mesh_tensorflow as mtf
import tensorflow.compat.v1 as tf
import random

BASE_FNS = {'gelu': mtf.gelu,
            'relu': mtf.relu,
            'sigmoid': mtf.sigmoid,
            'tanh': mtf.tanh,
            'selu': mtf.selu,
            'elu': mtf.elu,
            'abs': mtf.abs,
            'sin': mtf.sin,
            'cos': mtf.cos,
            'sign': mtf.sign,
            'silu': mtf.swish,
            'softplus': mtf.softplus
            }


def _arcsinh(x):
    return mtf.log(x + mtf.sqrt(1 + x ** 2))


def _var(x, init):
    return mtf.get_variable(x.mesh, f"activation-{random.randint(0, 2 ** 32):x}", [],
                            initializer=tf.constant_initializer(init), dtype=x.dtype)


def _pos_var(x, val):
    return mtf.softplus(_var(x, 0)) + val


def _rrelu(x):
    negative_scale = random.random()
    return (negative_scale * mtf.abs(x) + x) / (1 + negative_scale)


def _elish(x):
    cond = mtf.cast(mtf.greater(x, 0), x.dtype)
    exp = mtf.exp(x)
    return cond * x / (1 + exp) + (1 - cond) * (exp - 1) / (1 / exp + 1)


CUSTOM_FNS = {'lrelu001': lambda x: mtf.leaky_relu(x, alpha=0.01),
              'lrelu020': lambda x: mtf.leaky_relu(x, alpha=0.20),
              'id': lambda x: x,
              'triangle_relax': lambda x: mtf.sin(x) - mtf.sin(3 * x) / 9 + mtf.sin(5 * x) / 25 - mtf.sin(7 * x) / 49,
              'square_relax': lambda x: mtf.cos(x) - mtf.cos(3 * x) / 3 + mtf.cos(5 * x) / 5 - mtf.cos(7 * x) / 7,
              'spike': lambda x: 1 / (1 + x ** 2),
              'spike2': lambda x: mtf.exp(-x ** 2),
              'tanhshrink': lambda x: x - mtf.tanh(x),
              'softsign': lambda x: x / (mtf.abs(x) + 1),
              'softmax': lambda x: mtf.softmax(x, x.shape[-1]),
              'logsoftmax': lambda x: mtf.log_softmax(x, x.shape[-1]),
              'bipolarsigmoid': lambda x: mtf.sigmoid(x) * 2 - 1,
              'rrelu': _rrelu,
              'elish': _elish,
              'arcsinh': _arcsinh,
              'aria': lambda x: x * (_var(x, 0) + _var(x, 1) / (
                      _pos_var(x, 0) + _var(x, 1) * mtf.exp(_var(x, -1) * x) ** (1 / _pos_var(x, 1)))),
              'prelu': lambda x: mtf.leaky_relu(x, alpha=_var(x, 0.2)),
              'parcsinh': lambda x: _var(x, 1) * _arcsinh(x * _pos_var(x, 1)),
              'psoftplus': lambda x: _var(x, 1) * mtf.softplus(x * _var(x, 1)) + _var(x, 0),
              'proottanh': lambda x: (x ** _pos_var(x, 2) + _pos_var(x, 1)) ** (1 / _pos_var(x, 3)) * mtf.tanh(x),
              'maxsig': lambda x: mtf.maximum(x, mtf.sigmoid(x)),
              'cosid': lambda x: mtf.cos(x) - x,
              'minsin': lambda x: mtf.minimum(x, mtf.sin(x)),
              'maxtanh': lambda x: mtf.maximum(x, mtf.tanh(x)),
              'mish': lambda x: x * mtf.tanh(mtf.softplus(x)),
              'tanhexp': lambda x: x * mtf.tanh(mtf.exp(x)),
              'lisht': lambda x: x * mtf.tanh(x),
              'seagull': lambda x: mtf.log(1 + x ** 2),
              'snake': lambda x: x + mtf.sin(x) ** 2,
              'roottanh': lambda x: (x ** 2 + 1) ** (1 / 3) * mtf.tanh(x),
              'softplusmone': lambda x: mtf.softplus(x) - 1
              }


def get_activation_fn(params):
    if "activation_fn" in params:
        activation_fn = params["activation_fn"]
    else:
        print("Defaulting to GELU activation (see here: https://arxiv.org/abs/1606.08415)")
        activation_fn = "gelu"

    if activation_fn in BASE_FNS:
        return BASE_FNS[activation_fn]

    if activation_fn in CUSTOM_FNS:
        return CUSTOM_FNS[activation_fn]

    raise ValueError('unknown activation function "activation_fn" in config')
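get_activation_fn above is keyed off an optional "activation_fn" entry in the model config. A minimal usage sketch (the params dict here is hypothetical):

from models.activations import get_activation_fn

params = {"activation_fn": "mish"}   # any key from BASE_FNS or CUSTOM_FNS
act = get_activation_fn(params)      # returns a callable that operates on mtf tensors
# act(h) can then be applied to a hidden-state tensor, e.g. inside the mlp layers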
models/gpt2/gpt2.py
ADDED
@@ -0,0 +1,217 @@
"""GPT-like model in Mesh-Tensorflow"""
import tensorflow.compat.v1 as tf
import mesh_tensorflow.transformer as mtf_transformer

from models.utils import parse_inputs, entmax_cross_entropy_with_logits
from models.layers import *


# --------------------------------------------------------------------------------
# TRANSFORMER BLOCK:

def block(params, scope, layer_num, bias, sequence_dim, memory_length_dim, pos_emb, variable_dtype, context=None):
    use_mlp_glu = params["mlp_glu"] == True
    use_scale_norm = params["scalenorm"] == True
    use_moe = exists(params["moe_layers"]) and (layer_num in params["moe_layers"])
    use_rezero = params["rezero"] == True
    macaron_attention = params["macaron"] == True

    def fn(x):
        with tf.variable_scope(scope):
            nx = x.shape[-1]  # Grab last dimension from input

            if use_rezero:
                prenorm = identity
            elif use_scale_norm:
                prenorm = scale_norm
            else:
                prenorm = layer_norm

            pre_residual_fn = rezero if use_rezero else identity

            attention_type = params["attention_types"][layer_num]

            if macaron_attention:
                mult = 0.5
                mlp_fn = mlp_glu if use_mlp_glu else mlp
                intermediate_size = nx.size * 4 * (1 if not use_mlp_glu else 2)
                # Define intermediate layer of mlp - to split
                dim_intermediate_expanded = mtf.Dimension("intermediate_expanded", intermediate_size)
                m = mlp_fn(x, "mlp_macaron", dim_intermediate_expanded, variable_dtype=variable_dtype, params=params)

                x = x + (m * mult)
            else:
                mult = 1

            if attention_type != "none":
                res_x = prenorm(x, "norm_1", variable_dtype=variable_dtype, params=params)
                a = attn(res_x, "attn", nx, attention_type=attention_type,
                         params=params, bias=bias, dim_seq=sequence_dim, memory_length_dim=memory_length_dim,
                         variable_dtype=variable_dtype, context=context, pos_emb=pos_emb)
            else:
                a = x

            x = x + pre_residual_fn(a, "norm_rezero_1", dtype=variable_dtype)

            res_x = prenorm(x, "norm_2", variable_dtype=variable_dtype, params=params)

            if use_moe:
                moe_params = mtf.transformer.moe.HParams()
                mtf.transformer.moe.set_default_moe_hparams(moe_params)
                moe_params.add_hparam("moe_min_expert_capacity", 1)
                moe_params.add_hparam("moe_use_experts_attention", False)

                # Override defaults
                for k, v in params["moe_params"].items():
                    moe_params.add_hparam(k, v)

                moe_train = params["mode"] == "train"

                m, aux_loss = mtf.transformer.moe.transformer_moe_layer_v1(res_x, x.shape[-1], moe_params,
                                                                           train=moe_train,
                                                                           mesh_shape=params["mesh_shape"],
                                                                           layout=params["layout"],
                                                                           activation=params.get("moe_activation", "relu"),
                                                                           variable_dtype=variable_dtype,
                                                                           num_microbatches=params["num_microbatches"])
                m = mtf.dropout(m, rate=params["res_dropout"], name="moe_dropout")
            else:

                mlp_fn = mlp_glu if use_mlp_glu else mlp
                intermediate_size = nx.size * 4 * (1 if not use_mlp_glu else 2)

                # Define intermediate layer of mlp - to split
                dim_intermediate_expanded = mtf.Dimension("intermediate_expanded", intermediate_size)

                m = mlp_fn(res_x, "mlp", dim_intermediate_expanded, variable_dtype=variable_dtype, params=params)
                aux_loss = mtf.zeros(x.mesh, mtf.Shape([]), dtype=variable_dtype.slice_dtype)

            x = x + pre_residual_fn((m * mult), "norm_rezero_2", variable_dtype)
            return x, aux_loss

    return fn


# --------------------------------------------------------------------------------
# GPT2 MODEL:

def model(mtf_features, other_features, params, mesh, variable_dtype, context=None):
    """A GPT style model implemented in mesh tensorflow."""

    x, batch_dim, sequence_dim, embd_dim, vocab_dim, embed_sequence_dim = parse_inputs(mtf_features, other_features)

    if is_incremental_inference(context):
        # reshape inputs if in inference mode
        x = mtf.gather(x, context.position - 1, sequence_dim)
        x = mtf.reshape(x, [batch_dim])

    use_axial_pos_emb = exists(params["axial_pos_emb"])
    use_rotary_emb = exists(params["rotary_emb"])

    # Text encoding
    wte = mtf.get_variable(mesh, "wte", mtf.Shape([vocab_dim, embd_dim]),
                           initializer=tf.random_normal_initializer(stddev=0.02),
                           master_dtype=variable_dtype.master_dtype,
                           slice_dtype=variable_dtype.slice_dtype,
                           activation_dtype=variable_dtype.activation_dtype)

    with tf.variable_scope("token_embd"):
        # Text embedding
        h = mtf.gather(wte, x, vocab_dim)
        if params["embed_dropout"] > 0 and params["mode"] == "train":
            h = mtf.dropout(h, rate=params["embed_dropout"], name="wte_dropout")

    # Position encoding

    if use_rotary_emb:
        wpe = None
        layer_pos_emb = rotary_positional_emb(mesh, sequence_dim, params, variable_dtype)
    elif use_axial_pos_emb:
        wpe = axial_positional_emb(embd_dim, mesh, params, variable_dtype)
        layer_pos_emb = None
    else:
        # Use standard position encoding
        wpe = mtf.get_variable(mesh, "wpe", mtf.Shape([embed_sequence_dim, embd_dim]),
                               initializer=tf.random_normal_initializer(stddev=0.01),
                               master_dtype=variable_dtype.master_dtype,
                               slice_dtype=variable_dtype.slice_dtype,
                               activation_dtype=variable_dtype.activation_dtype)
        layer_pos_emb = None

    if exists(wpe):
        with tf.variable_scope("pos_embd"):
            # Positional embedding
            position_indices = mtf.range(mesh, sequence_dim, tf.int64) if not is_incremental_inference(context) else (
                    context.position - 1)
            pos_emb = mtf.gather(wpe, position_indices, wpe.shape[0])
            if params["embed_dropout"] > 0 and params["mode"] == "train":
                pos_emb = mtf.dropout(pos_emb, rate=params["embed_dropout"], name="wte_dropout")
            h += pos_emb

    aux_losses = 0  # instantiate auxiliary losses (for MOE models)

    for layer in range(params["n_layer"]):
        # attn blocks
        share_parameters = exists(params["share_parameters"]) and params["share_parameters"] == True
        block_scope = f"h{layer}" if not share_parameters else ""

        block_fn = block(params=params, scope=block_scope, layer_num=layer,
                         bias=other_features["attn_bias"],
                         sequence_dim=sequence_dim,
                         memory_length_dim=other_features["memory_length_dim"],
                         pos_emb=layer_pos_emb,
                         variable_dtype=variable_dtype,
                         context=context)

        # If true and in train mode, enable gradient checkpointing
        recompute_grad = params["recompute_grad"] and (params["mode"] == "train") == True
        h, loss = block_fn(h) if not recompute_grad else mtf.recompute_grad(block_fn, [h])
        aux_losses += loss

    no_weight_tie_emb = params["no_weight_tie"] == True
    if no_weight_tie_emb:
        with tf.variable_scope("wte_final_linear"):
            logits = linear(h, "linear_out", vocab_dim, variable_dtype=variable_dtype, params=params)
    else:
        # Layer normalize & affine transform
        h = layer_norm(h, "ln_f", variable_dtype=variable_dtype)
        seq_dim = sequence_dim if not is_incremental_inference(context) else mtf.Dimension("sequence", 1)
        with tf.variable_scope("wte_final_einsum"):
            # Equivalent to tf.matmul
            logits = mtf.einsum([h, wte], output_shape=[batch_dim, seq_dim, vocab_dim])

    if params["mode"] in ["train", "eval"]:
        labels = mtf_features["labels"]
        z_loss = params.get("z_loss", 1e-4)  # an auxiliary loss used to stabilize mtf xentropy

        # Go to full precision for the logits
        logits = mtf.cast(logits, tf.float32)

        use_entmax_loss = params.get("entmax_loss", False)
        loss_fn = mtf.layers.softmax_cross_entropy_with_logits if not use_entmax_loss else entmax_cross_entropy_with_logits

        with tf.variable_scope("xentropy_final"):
            loss_batch = loss_fn(logits=logits, targets=labels,
                                 vocab_dim=logits.shape[-1], z_loss=z_loss)

        # For non-autoregressive models (masked language modeling training)
        # Make sure labels with padding tokens are not counted in the loss
        if not params["causal"]:
            padding_id = params.get("padding_id", 0)
            loss_batch = mtf.where(mtf.not_equal(labels, padding_id), loss_batch, mtf.zeros_like(loss_batch))

        with tf.variable_scope("reduce_mean_final"):
            loss = mtf.reduce_mean(loss_batch)

        loss += aux_losses  # Add on auxiliary losses (currently only used for MoE)
        loss /= params["num_microbatches"]
        # Convert to train dtype
        loss = mtf.cast(loss, variable_dtype.slice_dtype)
    else:
        loss = None
        loss_batch = None

    # Cast back to checkpoint dtype
    logits = mtf.cast(logits, variable_dtype.master_dtype)
    return logits, loss, loss_batch
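When no_weight_tie is left off, the final projection in gpt2.model reuses the token embedding matrix wte, so the logits are the hidden states contracted against the embedding table. A small NumPy analogue of that einsum, with illustrative shapes only:

import numpy as np

batch, seq, n_embd, n_vocab = 2, 8, 16, 100
h = np.random.randn(batch, seq, n_embd)       # final hidden states
wte = np.random.randn(n_vocab, n_embd)        # token embedding table

logits = np.einsum("bse,ve->bsv", h, wte)     # same contraction as the mtf.einsum([h, wte], ...) above
assert logits.shape == (batch, seq, n_vocab)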
models/layers.py
ADDED
@@ -0,0 +1,357 @@
import mesh_tensorflow as mtf
import tensorflow.compat.v1 as tf
import math
import mesh_tensorflow.transformer as mtf_transformer

from models.activations import get_activation_fn


# --------------------------------------------------------------------------------
# LAYERS:

sentinel = object()


def exists(x):
    return x is not None


def identity(x, *args, **kwargs):
    return x


def is_incremental_inference(context):
    return exists(context) and context.mode == "incremental"


def norm(x, axis, epsilon=1e-8):
    x -= mtf.reduce_mean(x, reduced_dim=axis, name="norm_reduce_mean_u")
    s = mtf.reduce_mean(mtf.square(x), reduced_dim=axis, name="norm_reduce_mean_s")
    return x * mtf.rsqrt(s + epsilon)


def rezero(x, scope, dtype):
    with tf.variable_scope(scope):
        g = mtf.get_variable(x.mesh, "g", [], initializer=tf.constant_initializer(0), dtype=dtype)
        return x * g


def scale_norm(x, scope, *, variable_dtype, axis=sentinel, epsilon=1e-5, params=None):
    if axis is sentinel:
        axis = x.shape[-1]

    with tf.variable_scope(scope):
        g = mtf.get_variable(x.mesh, "g", [], initializer=tf.constant_initializer(1),
                             master_dtype=variable_dtype.master_dtype,
                             slice_dtype=variable_dtype.slice_dtype,
                             activation_dtype=variable_dtype.activation_dtype)

        x = norm(x, axis, epsilon)
        x = x * g
        return x


def layer_norm(x, scope, *, variable_dtype, axis=sentinel, epsilon=1e-5, params=None):
    """Normalize to mean = 0, std = 1, then do a diagonal affine transform."""
    if axis is sentinel:
        axis = x.shape[-1]

    with tf.variable_scope(scope):
        n_state = x.shape[-1]

        g = mtf.get_variable(x.mesh, "g", [n_state], initializer=tf.constant_initializer(1),
                             master_dtype=variable_dtype.master_dtype,
                             slice_dtype=variable_dtype.slice_dtype,
                             activation_dtype=variable_dtype.activation_dtype)
        b = mtf.get_variable(x.mesh, "b", [n_state], initializer=tf.constant_initializer(0),
                             master_dtype=variable_dtype.master_dtype,
                             slice_dtype=variable_dtype.slice_dtype,
                             activation_dtype=variable_dtype.activation_dtype)

        x = norm(x, axis, epsilon)
        x = x * g + b
        return x


def linear_attention(q, k, v):
    batch_dim, seq_dim, head_dim, dim_out = (v.shape[0], v.shape[1], v.shape[2], v.shape[3])
    q = mtf.rename_dimension(q, "features_per_head", "features_per_head_in")
    k = mtf.rename_dimension(k, "features_per_head", "features_per_head_in")

    dim_in = k.shape[-1]

    q = mtf.softmax(q, dim_in)
    k = mtf.softmax(k, seq_dim)

    context = mtf.einsum([k, v], output_shape=[batch_dim, head_dim, dim_in, dim_out])
    attn = mtf.einsum([q, context], output_shape=[batch_dim, seq_dim, head_dim, dim_out])
    return attn


def causal_linear_attention(q, k, v, eps=1e-6):
    batch_dim, seq_dim, head_dim, dim_out = (v.shape[0], v.shape[1], v.shape[2], v.shape[3])
    q = mtf.rename_dimension(q, "features_per_head", "features_per_head_in")
    k = mtf.rename_dimension(k, "features_per_head", "features_per_head_in")

    dim_in = k.shape[-1]

    q = mtf.softmax(q, dim_in)
    k = mtf.exp(k)

    cumulative_k = mtf.cumsum(k, seq_dim) + eps
    D_inv = 1. / mtf.einsum([q, cumulative_k], output_shape=[batch_dim, seq_dim, head_dim])

    context = mtf.einsum([k, v], output_shape=[batch_dim, seq_dim, head_dim, dim_in, dim_out])
    cumulative_context = mtf.cumsum(context, seq_dim)

    attn = mtf.einsum([q, cumulative_context, D_inv], output_shape=[batch_dim, seq_dim, head_dim, dim_out])
    return attn


def linear(x, scope, nf, *, w_init_stdev=0.02, variable_dtype, params=None, scale=False):
    # nf = number of features
    if params["scale_by_depth"] and scale:
        # Scale by sqrt(num_layers), only happens at the final projection before a res block output
        w_init_stdev = w_init_stdev * (1. / math.sqrt(params["n_layer"]))
    if params["scale_by_in"]:  # Scale by sqrt(num_input_features)
        w_init_stdev = w_init_stdev * (1. / math.sqrt(x.shape[-1].size))  # Dimension is a namedtuple of (name, size)
    # Not in the variable_scope because mtf already has a variable_scope in it
    with tf.variable_scope("conv1d_main"):
        c = mtf.layers.dense(x, new_dims=[nf], reduced_dims=[x.shape[-1]], name=scope, use_bias=True,
                             kernel_initializer=tf.random_normal_initializer(stddev=w_init_stdev),
                             variable_dtype=variable_dtype,
                             )
        return c


def memory_key_values(k, v, num_mem_kv, dim_batch, dim_heads, variable_dtype, mesh):
    """memory / key values from all attention paper"""

    dim_mem_kv = mtf.Dimension("mem_kv_sequence", num_mem_kv)
    emb_dim = k.shape[-1]
    mem_std = 1 / math.sqrt(emb_dim.size)

    mem_k = mtf.get_variable(mesh, "mem_k", mtf.Shape([dim_mem_kv, dim_heads, emb_dim]),
                             initializer=tf.random_normal_initializer(stddev=mem_std),
                             master_dtype=variable_dtype.master_dtype,
                             slice_dtype=variable_dtype.slice_dtype,
                             activation_dtype=variable_dtype.activation_dtype,
                             )
    mem_v = mtf.get_variable(mesh, "mem_v", mtf.Shape([dim_mem_kv, dim_heads, emb_dim]),
                             initializer=tf.random_normal_initializer(stddev=mem_std),
                             master_dtype=variable_dtype.master_dtype,
                             slice_dtype=variable_dtype.slice_dtype,
                             activation_dtype=variable_dtype.activation_dtype)

    mem_k, mem_v = map(lambda t: mtf.broadcast(t, [dim_batch, dim_mem_kv, dim_heads, emb_dim]),
                       (mem_k, mem_v))
    mem_k, mem_v = map(lambda t: mtf.rename_dimension(t, "mem_kv_sequence", "sequence"),
                       (mem_k, mem_v))

    k = mtf.concat([mem_k, k], "sequence")
    v = mtf.concat([mem_v, v], "sequence")
    return k, v


def attn(x, scope, n_state, *, attention_type, params, bias, dim_seq, memory_length_dim, variable_dtype, context=None, pos_emb=None):
    # x :: [batch, seq, n_embd]
    x_shape, dim_batch, *_, dim_embd, mesh = x.shape, *x.shape, x.mesh

    # n_state is the same as config["n_embd"], which is also the same as dim_embd.
    assert n_state.size % params["n_head"] == 0

    dim_heads = mtf.Dimension("heads", params["n_head"])

    num_mem_kv = params.get("num_mem_kv", 0)
    use_num_mem_kv = num_mem_kv > 0

    with tf.variable_scope(scope):
        # Compute attention inputs
        dim_kv = mtf.Dimension("features_per_head", params["n_embd"] // params["n_head"])
        mtfparams = mtf.transformer.attention.attention_params_simple(
            x.mesh,
            io_dim=dim_embd,
            kv_dim=dim_kv,
            heads_dim=dim_heads,
            variable_dtype=variable_dtype
        )
        q = mtfparams.compute_q(x)
        k = mtfparams.compute_k(x)
        v = mtfparams.compute_v(x)

        if is_incremental_inference(context):
|
183 |
+
one_hot = mtf.one_hot(context.position - 1, dim_seq, dtype=variable_dtype.master_dtype)
|
184 |
+
inv_one_hot = 1.0 - one_hot
|
185 |
+
old_k, old_v = context.get_states(2)
|
186 |
+
k = old_k * inv_one_hot + k * one_hot
|
187 |
+
v = old_v * inv_one_hot + v * one_hot
|
188 |
+
|
189 |
+
if exists(context):
|
190 |
+
context.record_new_states([k, v])
|
191 |
+
|
192 |
+
if exists(pos_emb):
|
193 |
+
cos, sin = pos_emb
|
194 |
+
k = apply_rotary_emb(k, cos, sin)
|
195 |
+
|
196 |
+
if is_incremental_inference(context):
|
197 |
+
seq_dim = cos.shape.get_dim_by_name('sequence')
|
198 |
+
cos = mtf.gather(cos, context.position - 1, seq_dim)
|
199 |
+
sin = mtf.gather(sin, context.position - 1, seq_dim)
|
200 |
+
|
201 |
+
q = apply_rotary_emb(q, cos, sin)
|
202 |
+
|
203 |
+
with tf.variable_scope("attention"):
|
204 |
+
if attention_type == "local":
|
205 |
+
# `local_attention_1d` has built in autoregressive masking, so we don't need mask_attn_weights.
|
206 |
+
radius = params.get("local_attention_radius", 256)
|
207 |
+
|
208 |
+
if is_incremental_inference(context):
|
209 |
+
q *= one_hot
|
210 |
+
|
211 |
+
a = mtf_transformer.attention.local_attention_1d(
|
212 |
+
q, k, v,
|
213 |
+
length_dim=k.shape[1],
|
214 |
+
key_dim=dim_kv,
|
215 |
+
value_dim=dim_kv,
|
216 |
+
radius=radius,
|
217 |
+
length_dim_num_splits=1,
|
218 |
+
fully_autoregressive=params["causal"],
|
219 |
+
attention_kwargs={},
|
220 |
+
)
|
221 |
+
|
222 |
+
if is_incremental_inference(context):
|
223 |
+
a = mtf.gather(a, context.position - 1, dim_seq)
|
224 |
+
|
225 |
+
elif attention_type == "global":
|
226 |
+
|
227 |
+
# TODO: pass in fake context
|
228 |
+
# Broadcast mask bias across batch and heads
|
229 |
+
if exists(bias):
|
230 |
+
if not is_incremental_inference(context):
|
231 |
+
broadcasted_bias = mtf.broadcast(bias, [dim_batch, dim_heads, bias.shape[-2], bias.shape[-1]])
|
232 |
+
else:
|
233 |
+
# In the incremental case, a custom mask needs to be built that masks out all key/values that are greater than the current position
|
234 |
+
bias = mtf.gather(bias, context.position - 1, dim_seq)
|
235 |
+
broadcasted_bias = mtf.broadcast(bias, [dim_batch, dim_heads, bias.shape[-1]])
|
236 |
+
|
237 |
+
# memory key / values, from all-attention paper
|
238 |
+
if use_num_mem_kv:
|
239 |
+
k, v = memory_key_values(k, v, num_mem_kv, dim_batch, dim_heads, variable_dtype, mesh)
|
240 |
+
|
241 |
+
k = mtf.replace_dimensions(k, k.shape[1], memory_length_dim)
|
242 |
+
v = mtf.replace_dimensions(v, v.shape[1], memory_length_dim)
|
243 |
+
|
244 |
+
attn_dropout_rate = params["attn_dropout"] if params["mode"] == "train" else 0
|
245 |
+
|
246 |
+
a = mtf_transformer.attention.attention(
|
247 |
+
q, k, v,
|
248 |
+
memory_length_dim=memory_length_dim,
|
249 |
+
key_dim=dim_kv,
|
250 |
+
value_dim=dim_kv,
|
251 |
+
bias=broadcasted_bias,
|
252 |
+
dropout_rate=attn_dropout_rate
|
253 |
+
)
|
254 |
+
|
255 |
+
elif attention_type == "linear":
|
256 |
+
linear_attn_fn = causal_linear_attention if params["causal"] else linear_attention
|
257 |
+
a = linear_attn_fn(q, k, v)
|
258 |
+
|
259 |
+
else:
|
260 |
+
raise NotImplementedError("Unknown attention type {}!".format(attention_type))
|
261 |
+
|
262 |
+
with tf.variable_scope("compute_output"):
|
263 |
+
a = mtfparams.compute_output(a, x_shape)
|
264 |
+
|
265 |
+
with tf.variable_scope("compute_output_bias"):
|
266 |
+
b = mtf.get_variable(x.mesh, "o_b", [dim_embd], initializer=tf.constant_initializer(0),
|
267 |
+
master_dtype=variable_dtype.master_dtype,
|
268 |
+
slice_dtype=variable_dtype.slice_dtype,
|
269 |
+
activation_dtype=variable_dtype.activation_dtype)
|
270 |
+
a += b
|
271 |
+
|
272 |
+
if params["mode"] == "train" and params["res_dropout"] > 0:
|
273 |
+
a = mtf.dropout(a, rate=params["res_dropout"], name="res_dropout")
|
274 |
+
return a
|
275 |
+
|
276 |
+
|
277 |
+
def mlp(x, scope, n_state, *, variable_dtype, params):
|
278 |
+
activation_fn = get_activation_fn(params)
|
279 |
+
with tf.variable_scope(scope):
|
280 |
+
nx = x.shape[-1]
|
281 |
+
h = activation_fn(linear(x, "c_fc", n_state, variable_dtype=variable_dtype, params=params))
|
282 |
+
h2 = linear(h, "c_proj", nx, variable_dtype=variable_dtype, params=params, scale=True)
|
283 |
+
if params["mode"] == "train" and params["res_dropout"] > 0:
|
284 |
+
h2 = mtf.dropout(h2, rate=params["res_dropout"], name="mlp_dropout")
|
285 |
+
return h2
|
286 |
+
|
287 |
+
|
288 |
+
def mlp_glu(x, scope, n_state, *, variable_dtype, params):
|
289 |
+
activation_fn = get_activation_fn(params)
|
290 |
+
with tf.variable_scope(scope):
|
291 |
+
nx = x.shape[-1]
|
292 |
+
h = linear(x, "c_fc", n_state, params=params)
|
293 |
+
|
294 |
+
h, gate = mtf.split(h, h.shape[-1], 2)
|
295 |
+
h *= activation_fn(gate)
|
296 |
+
|
297 |
+
h2 = linear(h, "c_proj", nx, variable_dtype=variable_dtype, params=params, scale=True)
|
298 |
+
if params["mode"] == "train" and params["res_dropout"] > 0:
|
299 |
+
h2 = mtf.dropout(h2, rate=params["res_dropout"], name="mlp_dropout")
|
300 |
+
return h2
|
301 |
+
|
302 |
+
|
303 |
+
def axial_positional_emb(embd_dim, mesh, params, variable_dtype):
|
304 |
+
# Use axial position encoding
|
305 |
+
axial_dim_1, axial_dim_2 = params["axial_pos_emb"]
|
306 |
+
|
307 |
+
axial_dim = mtf.Dimension("axial_dim", axial_dim_1 * axial_dim_2)
|
308 |
+
dim_axials = [mtf.Dimension(f"axial_dim_{i}", t) for i, t in enumerate((axial_dim_1, axial_dim_2))]
|
309 |
+
|
310 |
+
axial_wpe_1 = mtf.get_variable(mesh, "axial_wpe_1", mtf.Shape([dim_axials[0], embd_dim]),
|
311 |
+
initializer=tf.random_normal_initializer(stddev=0.01),
|
312 |
+
master_dtype=variable_dtype.master_dtype,
|
313 |
+
slice_dtype=variable_dtype.slice_dtype,
|
314 |
+
activation_dtype=variable_dtype.activation_dtype)
|
315 |
+
|
316 |
+
axial_wpe_2 = mtf.get_variable(mesh, "axial_wpe_2", mtf.Shape([dim_axials[1], embd_dim]),
|
317 |
+
initializer=tf.random_normal_initializer(stddev=0.01),
|
318 |
+
master_dtype=variable_dtype.master_dtype,
|
319 |
+
slice_dtype=variable_dtype.slice_dtype,
|
320 |
+
activation_dtype=variable_dtype.activation_dtype)
|
321 |
+
|
322 |
+
axial_wpe_1, axial_wpe_2 = map(lambda t: mtf.broadcast(t, [dim_axials[0], dim_axials[1], embd_dim]),
|
323 |
+
(axial_wpe_1, axial_wpe_2))
|
324 |
+
wpe = (axial_wpe_1 + axial_wpe_2) / 2
|
325 |
+
|
326 |
+
wpe = mtf.reshape(wpe, [axial_dim, embd_dim])
|
327 |
+
|
328 |
+
return wpe
|
329 |
+
|
330 |
+
def rotary_positional_emb(mesh, sequence_dim, params, variable_dtype):
|
331 |
+
dtype = variable_dtype.master_dtype
|
332 |
+
dim_head = params["n_embd"] // params["n_head"]
|
333 |
+
|
334 |
+
dim_head = mtf.Dimension("features_per_head", dim_head)
|
335 |
+
half_dim_head = mtf.Dimension("half_features_per_head", dim_head.size // 2)
|
336 |
+
|
337 |
+
dim_range = mtf.range(mesh, half_dim_head, dtype) * 2 / dim_head.size
|
338 |
+
half_freqs = 1. / mtf.pow(mtf.constant(mesh, 10000, dtype = dtype), dim_range)
|
339 |
+
|
340 |
+
seq = mtf.range(mesh, sequence_dim, dtype)
|
341 |
+
half_freqs = mtf.einsum([half_freqs, seq], [sequence_dim, half_dim_head])
|
342 |
+
|
343 |
+
freqs = mtf.concat((half_freqs, half_freqs), half_dim_head.name)
|
344 |
+
freqs = mtf.rename_dimension(freqs, half_dim_head.name, dim_head.name)
|
345 |
+
return mtf.cos(freqs), mtf.sin(freqs)
|
346 |
+
|
347 |
+
def rotate_half(x):
|
348 |
+
dim_head_name = "features_per_head"
|
349 |
+
dim_head = x.shape.get_dim_by_name(dim_head_name)
|
350 |
+
half_dim_head_size = dim_head.size // 2
|
351 |
+
x1 = mtf.slice(x, 0, half_dim_head_size, dim_head_name)
|
352 |
+
x2 = mtf.slice(x, half_dim_head_size, half_dim_head_size, dim_head_name)
|
353 |
+
return mtf.concat((-x2, x1), dim_head.name)
|
354 |
+
|
355 |
+
def apply_rotary_emb(x, cos, sin):
|
356 |
+
rotated_x = rotate_half(x)
|
357 |
+
return x * cos + rotated_x * sin
|
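For reference, the rotation applied by rotate_half / apply_rotary_emb above can be checked outside Mesh TensorFlow with a small NumPy sketch (the *_np names and toy sizes below are illustrative, not part of this repo):

import numpy as np

def rotate_half_np(x):
    # split the head dimension in half and rotate the pair: (x1, x2) -> (-x2, x1)
    x1, x2 = np.split(x, 2, axis=-1)
    return np.concatenate((-x2, x1), axis=-1)

def apply_rotary_emb_np(x, cos, sin):
    # same formula as apply_rotary_emb: x * cos + rotate_half(x) * sin
    return x * cos + rotate_half_np(x) * sin

# toy sizes: dim_head=4, seq_len=3; frequencies built as in rotary_positional_emb
dim_head, seq_len = 4, 3
inv_freq = 1.0 / (10000 ** (np.arange(0, dim_head, 2) / dim_head))
freqs = np.einsum("i,j->ij", np.arange(seq_len), inv_freq)
freqs = np.concatenate((freqs, freqs), axis=-1)
x = np.random.randn(seq_len, dim_head)
out = apply_rotary_emb_np(x, np.cos(freqs), np.sin(freqs))
assert out.shape == x.shape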
models/utils.py
ADDED
@@ -0,0 +1,124 @@
import tensorflow as tf
import mesh_tensorflow as mtf
from functools import partial


def entmax_backward(explicit_inputs, all_inputs, forward_operations, outputs, output_grads, alpha=1.3, dim=None,
                    n_iter=50):
    x, = explicit_inputs
    y, = outputs
    dY, = output_grads

    gppr = mtf.where(mtf.greater(y, 0), mtf.pow(y, (2 - alpha)), mtf.zeros_like(y))
    dX = dY * gppr

    q = mtf.reduce_sum(dX, reduced_dim=dim) / mtf.reduce_sum(gppr, reduced_dim=dim)
    dX = dX - q * gppr

    return dX,


def entmax_forward(x, alpha=1.3, dim=None, n_iter=50):
    assert alpha > 1 and alpha < 2, 'alpha must be between 1 and 2'

    _gp = lambda x, alpha: x ** (alpha - 1)
    _gp_inv = lambda x, alpha: mtf.pow(x, (1 / (alpha - 1)))
    _p = lambda x, alpha: _gp_inv(mtf.relu(x), alpha)

    dim = x.shape[-1] if dim is None else dim
    d = dim.size

    x = x * (alpha - 1)

    max_val = mtf.reduce_max(x, reduced_dim=dim)

    tau_lo = max_val - _gp(1, alpha)
    tau_hi = max_val - _gp(1 / d, alpha)

    f_lo = mtf.reduce_sum(_p(x - tau_lo, alpha), reduced_dim=dim) - 1

    dm = tau_hi - tau_lo

    for _ in range(n_iter):
        dm = dm / 2
        tau_m = tau_lo + dm
        p_m = _p(x - tau_m, alpha)
        f_m = mtf.reduce_sum(p_m, reduced_dim=dim) - 1

        mask = mtf.greater_equal((f_m * f_lo), 0)
        tau_lo = mtf.where(mask, tau_m, tau_lo)

    p_m = p_m / mtf.reduce_sum(p_m, reduced_dim=dim)
    return p_m


def entmax(x, alpha=1.3, dim=None, n_iter=50):
    kwargs = dict(alpha=alpha, dim=dim, n_iter=n_iter)

    return mtf.custom_gradient(
        partial(entmax_forward, **kwargs),
        partial(entmax_backward, **kwargs),
        [x]
    )


def entmax_cross_entropy_with_logits(logits, targets, vocab_dim, z_loss=0.0):
    if targets.dtype.is_integer:
        # hard targets
        if (set(targets.shape.dims) != set(logits.shape.dims).difference([vocab_dim])):
            raise ValueError(
                "softmax_cross_entropy_with_logits with hard targets "
                "dims in targets=%s should be dims in logits=%s other than "
                "vocab_dim=%s" % (targets, logits, vocab_dim))
        targets = mtf.one_hot(targets, vocab_dim, dtype=logits.dtype)
    elif set(targets.shape.dims) != set(logits.shape.dims):
        raise ValueError(
            "softmax_cross_entropy_with_logits with soft targets "
            "dims in targets=%s should be dims in logits=%s" % (targets, logits))

    if vocab_dim not in logits.shape.dims:
        raise ValueError("vocab_dim must be in logits.shape.dims")

    log_entmax = mtf.log(entmax(logits, dim=vocab_dim))

    loss = mtf.negative(
        mtf.reduce_sum(log_entmax * targets, reduced_dim=vocab_dim))

    return loss


def sample_categorical(x, dim=None):
    dim = x.shape[-1] if dim is None else dim

    cdf = mtf.cumsum(x, dim)
    rand_uniform = mtf.random_uniform(x.mesh, x.shape - dim, minval=0, maxval=1)
    mask = mtf.cast(mtf.greater(cdf, rand_uniform), tf.int32)
    return mtf.argmax(mask, dim)


def biasmask_attn_weights(mesh, nd, ns, variable_dtype):
    # The old mask_attn_weights applied directly to the QK;
    # this returns a bias that the attention code from mtf adds to the attention matrix.
    # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
    # n_src and n_dest are both the same, i.e equal to sequence length
    # We rename ns because we want bias to have shape [batch, heads, memory_length, sequence] to match up with QK^T
    # Information flows from k and v (memory_length) to q (sequence)
    i = mtf.range(mesh, nd, tf.int32) + ns.size - nd.size
    j = mtf.range(mesh, ns, tf.int32)
    i, j = map(lambda t: mtf.broadcast(t, [nd, ns]), (i, j))
    dtype = variable_dtype.activation_dtype
    return mtf.cast(mtf.less(i, j), dtype) * -1e10


def parse_inputs(mtf_features, other_features):
    # Parse inputs and labels from the mtf_features / other_features input dicts
    # All dimensions are defined inside model_fn for efficiency
    x = mtf_features["inputs"]

    batch_dim = x.shape[0]
    sequence_dim = x.shape[1]
    embd_dim = other_features["embd_dim"]
    vocab_dim = other_features["vocab_dim"]
    embed_sequence_dim = other_features["embed_sequence_dim"]

    return x, batch_dim, sequence_dim, embd_dim, vocab_dim, embed_sequence_dim
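The bisection in entmax_forward above can be mirrored in plain NumPy to see what it converges to; this is a minimal sketch under the same alpha/n_iter defaults, with an entmax_np name invented for illustration:

import numpy as np

def entmax_np(x, alpha=1.3, n_iter=50):
    # bisection on the threshold tau, following entmax_forward step by step
    d = x.shape[-1]
    x = x * (alpha - 1)
    max_val = x.max(axis=-1, keepdims=True)
    tau_lo = max_val - 1.0                       # _gp(1, alpha) == 1
    tau_hi = max_val - (1.0 / d) ** (alpha - 1)  # _gp(1 / d, alpha)
    p = lambda tau: np.maximum(x - tau, 0) ** (1 / (alpha - 1))
    f_lo = p(tau_lo).sum(axis=-1, keepdims=True) - 1
    dm = tau_hi - tau_lo
    for _ in range(n_iter):
        dm = dm / 2
        tau_m = tau_lo + dm
        p_m = p(tau_m)
        f_m = p_m.sum(axis=-1, keepdims=True) - 1
        tau_lo = np.where(f_m * f_lo >= 0, tau_m, tau_lo)
    return p_m / p_m.sum(axis=-1, keepdims=True)

probs = entmax_np(np.array([1.0, 2.0, 3.0, 4.0]))
assert abs(probs.sum() - 1.0) < 1e-6  # the output is a probability distribution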
optimizers.py
ADDED
@@ -0,0 +1,176 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re
import mesh_tensorflow as mtf
import tensorflow.compat.v1 as tf


def clip_by_global_norm(grads, clip_norm):
    """Clip the grads by global norm."""
    global_norm = mtf.sqrt(mtf.add_n([mtf.reduce_sum(mtf.square(t)) for t in grads if t is not None]))
    multiplier = clip_norm / mtf.maximum(global_norm, clip_norm)
    clipped_grads = [None if t is None else t * multiplier for t in grads]
    return clipped_grads, global_norm


def get_optimizer(mesh, loss, params, variable_dtype, inp_var_grads=None):
    """Creates and returns an optimizer training op."""
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=params["lr"], shape=[], dtype=variable_dtype.slice_dtype)
    clip_value = mtf.constant(mesh, params["gradient_clipping"], dtype=variable_dtype.slice_dtype)

    if inp_var_grads is None:
        var_grads = mtf.gradients([loss], [v.outputs[0] for v in mesh.graph.trainable_variables])
    else:
        var_grads = inp_var_grads

    # Cast to full precision
    var_grads_fp = [mtf.cast(v, variable_dtype.slice_dtype) for v in var_grads]

    # decrease LR to final lr (lr*0.1) by this step - defaults to train_steps
    end_step = params.get("lr_decay_end", params["train_steps"])

    if params["lr_decay"] == "linear":
        learning_rate = tf.train.polynomial_decay(
            learning_rate,
            global_step,
            end_step,
            end_learning_rate=params["lr"] * 0.1,  # Decrease to 10% of initial LR according to GPT-3 paper
            power=1.0,
            cycle=False)
    elif params["lr_decay"] == "cosine":
        learning_rate = tf.train.cosine_decay(
            learning_rate,
            global_step,
            end_step,
            alpha=0.1  # Alpha is min lr value as a fraction of init lr.
        )

    if params["warmup_steps"] > 0:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(params["warmup_steps"], dtype=tf.int32)

        dtype = variable_dtype.slice_dtype

        global_steps_float = tf.cast(global_steps_int, dtype)
        warmup_steps_float = tf.cast(warmup_steps_int, dtype)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = learning_rate * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, dtype)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)

    learning_rate = mtf.import_fully_replicated(mesh, learning_rate, mtf.Shape([]), name="learning_rate")
    mtf.scalar_summary("lr", learning_rate)

    if params["opt_name"].lower() == "adam":
        optimizer = AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=params["weight_decay"],
            beta_1=params["beta1"],
            beta_2=params["beta2"],
            epsilon=params["epsilon"],
            exclude_from_weight_decay=["norm", "bias"],
            variable_dtype=variable_dtype
        )
    else:
        optimizer = mtf.optimize.AdafactorOptimizer(
            learning_rate=params["lr"],
            decay_rate=params["weight_decay"],
            beta1=params["beta1"],
            epsilon1=params["ada_epsilon1"],
            epsilon2=params["ada_epsilon2"]
        )

    if params["gradient_clipping"] is not None:
        (var_grads_fp, _) = clip_by_global_norm(var_grads_fp, clip_norm=clip_value)

    update_ops = optimizer.apply_grads(var_grads_fp, mesh.graph.trainable_variables)
    return learning_rate, update_ops, var_grads_fp


class AdamWeightDecayOptimizer(mtf.optimize.Optimizer):
    """A basic Adam optimizer that includes "correct" L2 weight decay."""

    def __init__(self,
                 learning_rate,
                 weight_decay_rate=0.0,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-6,
                 exclude_from_weight_decay=None,
                 variable_dtype=None):
        """Constructs a AdamWeightDecayOptimizer."""

        self.learning_rate = learning_rate
        self.weight_decay_rate = weight_decay_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.exclude_from_weight_decay = exclude_from_weight_decay
        self.variable_dtype = variable_dtype

    def apply_grad(self, grad, var):
        """See base class."""
        if grad is None:
            tf.logging.warning("Gradient is None for variable %s" % var.name)
            return []

        grad = mtf.to_float(grad)

        assignments = []

        m = mtf.get_variable(
            var.mesh, var.name + "/adam_m", var.shape,
            initializer=tf.zeros_initializer(),
            # master_dtype=self.variable_dtype.master_dtype,
            # slice_dtype=self.variable_dtype.slice_dtype,
            # activation_dtype=self.variable_dtype.activation_dtype,
            trainable=False)

        v = mtf.get_variable(
            var.mesh, var.name + "/adam_v", var.shape,
            initializer=tf.zeros_initializer(),
            # master_dtype=self.variable_dtype.master_dtype,
            # slice_dtype=self.variable_dtype.slice_dtype,
            # activation_dtype=self.variable_dtype.activation_dtype,
            trainable=False)

        # Standard Adam update.
        next_m = self.beta_1 * m + (1.0 - self.beta_1) * grad
        next_v = self.beta_2 * v + (1.0 - self.beta_2) * mtf.square(grad)

        update = next_m / (mtf.sqrt(next_v) + self.epsilon)

        # Just adding the square of the weights to the loss function is *not*
        # the correct way of using L2 regularization/weight decay with Adam,
        # since that will interact with the m and v parameters in strange ways.
        #
        # Instead we want to decay the weights in a manner that doesn't interact
        # with the m/v parameters. This is equivalent to adding the square
        # of the weights to the loss with plain (non-momentum) SGD.
        if self._do_use_weight_decay(var.name):
            update += mtf.to_float(var.value) * self.weight_decay_rate

        update_with_lr = self.learning_rate * update

        var_update = mtf.assign_sub(var, update_with_lr)

        assignments.extend(
            [var_update,
             mtf.assign(m, next_m),
             mtf.assign(v, next_v)])
        return assignments

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if not self.weight_decay_rate:
            return False
        if self.exclude_from_weight_decay:
            for r in self.exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True
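The schedule logic in get_optimizer (linear decay to 10% of the initial LR, with linear warmup applied on top of the decayed value) can be summarized in a few lines of plain Python; the numbers below are illustrative, not defaults from any shipped config:

def lr_at_step(step, lr=6e-5, warmup_steps=3000, total_steps=400000):
    # linear decay from lr to 0.1 * lr over total_steps (polynomial_decay with power=1.0)
    decayed = lr - (lr - lr * 0.1) * min(step, total_steps) / total_steps
    # linear warmup scales the decayed value, as in the warmup branch above
    if step < warmup_steps:
        return decayed * step / warmup_steps
    return decayed

print(lr_at_step(0), lr_at_step(3000), lr_at_step(400000))  # 0.0, ~6e-5, 6e-6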
requirements.txt
ADDED
@@ -0,0 +1,18 @@
google-api-python-client
jsonlines
lm_dataformat
mesh-tensorflow==0.1.18
numpy
oauth2client
ortools
pytest
sacred
tensorflow==2.5.0
tensorflow-datasets==3.2.1
tokenizers==0.9.4
transformers==4.1.1
tpunicorn
absl-py
ftfy
sacred
pymongo
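Assuming a fresh virtual environment, the pinned dependencies above can be installed with:

pip install -r requirements.txt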
run_experiment.py
ADDED
@@ -0,0 +1,265 @@
import atexit
import sacred
import argparse
import time
import math
import subprocess
import shutil
import os
import json
import threading
import requests
import glob
from configs import fetch_model_params
import socket
import queue
import sys
import signal


parser = argparse.ArgumentParser()
parser.add_argument('--tpu', type=str, required=True)  # Name of TPU to train on, if any
parser.add_argument('--model', type=str, required=True)  # JSON file that contains model parameters
parser.add_argument('--experiment_name', type=str, required=True)  # name of experiment (will show up in omniboard)
parser.add_argument('--steps_per_checkpoint', type=int, default=5000)
parser.add_argument('--autostack', action="store_false")
parser.add_argument('--auto_layout', action="store_true")
parser.add_argument('--auto_layout_and_mesh_shape', action="store_true")
parser.add_argument('--new', action='store_true')
parser.add_argument('--test', action='store_true')
parser.add_argument('--eval', action='store_true')
parser.add_argument('--predict', action='store_true')
parser.add_argument('--no_delete_tpu', action='store_true')
parser.add_argument('--initial_heartbeat_timeout', type=int, default=7200)
parser.add_argument('--heartbeat_timeout', type=int, default=1800)  # kill and restart if nothing logged to tensorboard in this many seconds
args = parser.parse_args()

params = fetch_model_params(args.model)

ex = sacred.Experiment(args.experiment_name)
ex.observers.append(sacred.observers.QueuedMongoObserver(url='127.0.0.1:27017', db_name='db', username='user', password='password'))


def get_open_port(lo=8000, hi=8100):
    for i in range(lo, hi):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex(('localhost', i)) != 0:
                return i


def train_thread(args, tpu, id, q):
    print('starting training on', tpu)

    # pass binary flags through
    opts = ''
    for flag in ['auto_layout', 'auto_layout_and_mesh_shape', 'new', 'test', 'predict', 'eval', ]:
        if args.__getattribute__(flag):
            opts += ' --' + flag

    for flag in ['autostack', ]:
        if not args.__getattribute__(flag):
            opts += ' --' + flag

    cmd = "python3 main.py --tpu {tpu} --model run_configs/config_{id}.json --steps_per_checkpoint {steps_per_checkpoint} {opts} --sacred_id {run_id}".format(tpu=tpu, id=id, steps_per_checkpoint=args.steps_per_checkpoint, opts=opts, run_id=id)
    print('Running:', cmd)
    proc = subprocess.Popen(cmd, shell=True)

    # poll until it's exited
    while proc.poll() is None:
        time.sleep(60)
        try:
            nq, *nargs = q.get_nowait()
            if nq == 'kill':
                print('train thread received kill signal from logging thread')
                # first send SIGTERM
                proc.terminate()

                time.sleep(60)

                # if it still hasn't exited, we send SIGKILL
                if proc.poll() is None:
                    print('SIGTERM not successful, sending SIGKILL')
                    proc.kill()

        except queue.Empty:
            pass

    print('exited training!')
    if proc.returncode == 0:
        print('exited gracefully')
        os.kill(os.getpid(), signal.SIGINT)
        return

    if args.no_delete_tpu:
        print('recreate done, exiting train_thread - not killing tpu!')
        return
    print("Recreating {} in 60sec...".format(tpu))
    time.sleep(60)
    os.system("pu recreate {} --yes --retry 3600 --retry-randomness 1.5".format(tpu))
    print('recreate done, exiting train_thread')

    # clear out queue
    while True:
        try:
            q.get_nowait()
            print('dropped request in queue after pu recreate')
        except queue.Empty:
            break


def get_json(uri, params=None, timeout=15):
    resp = requests.get(uri, params=params, timeout=timeout)
    resp.raise_for_status()
    return resp.json()


def get_tag_sets(base_uri):
    j = get_json(f'{base_uri}/data/plugin/scalars/tags', {'experiment': ''})
    assert isinstance(j, dict)
    return {
        run: j[run].keys()
        for run in j.keys()
    }


def get_scalar_data(base_uri, run, tag):
    j = get_json(f'{base_uri}/data/plugin/scalars/scalars', {'experiment': '', 'run': run, 'tag': tag})
    assert isinstance(j, list)
    return j


def get_run_data(port):
    base_uri = f'http://localhost:{port}/'
    r = {}
    try:
        tag_sets = get_tag_sets(base_uri)
        runs = tag_sets.keys()
        if '.' in runs:
            if 'loss' in tag_sets['.']:
                r['loss'] = get_scalar_data(base_uri, '.', 'loss')
        if 'eval' in runs:
            if 'loss' in tag_sets['eval']:
                r['val_loss'] = get_scalar_data(base_uri, 'eval', 'loss')
        if 'eval_lambada' in runs:
            if 'lambada_acc' in tag_sets['eval_lambada']:
                r['lambada_acc'] = get_scalar_data(base_uri, 'eval_lambada', 'lambada_acc')
            if 'lambada_log_ppl' in tag_sets['eval_lambada']:
                r['lambada_ppl'] = [
                    [t, s, math.exp(lp)]
                    for [t, s, lp] in get_scalar_data(base_uri, 'eval_lambada', 'lambada_log_ppl')
                ]
    except:
        import traceback
        traceback.print_exc()
    return r


@ex.main
def main(_run):
    print('Starting run', _run._id)
    print('experiment main invoked with argv:', " ".join(sys.argv))
    print('WARNING: please remember to remove old metric log files from the model directory.')

    os.makedirs('run_configs', exist_ok=True)
    shutil.copy(args.model if args.model.endswith('.json') else 'configs/{}.json'.format(args.model), 'run_configs/config_{}.json'.format(_run._id))

    tensorboard_port = get_open_port()
    print('Tensorboard at port:', tensorboard_port)
    print('Tensorboard url: ', 'http://eleutherai.bmk.sh:' + str(tensorboard_port))
    os.system("screen -S tensorboard_{} -d -m bash -c 'tensorboard --logdir {} --port {} --bind_all --reload_multifile=true || tensorboard --logdir {} --port {} --reload_multifile=true'".format(_run._id, params["model_path"], tensorboard_port, params["model_path"], tensorboard_port,))
    atexit.register(goodbye, _run._id)

    curr_step = {}
    seen_predictions = set()

    heartbeat_timeout = args.initial_heartbeat_timeout * 2
    while True:
        last_tb_log_time = time.time()
        start_time = time.time()
        q = queue.Queue()
        trainthd = threading.Thread(target=train_thread, args=(args, args.tpu, _run._id, q))
        trainthd.start()

        while trainthd.is_alive():
            time.sleep(60)

            if start_time + args.initial_heartbeat_timeout < time.time():
                # after initial args.initial_heartbeat_timeout grace period, now we want to set the timeout threshold much lower
                heartbeat_timeout = args.heartbeat_timeout

            print('Polling tensorboard for metrics...')
            data = get_run_data(tensorboard_port)
            for k in data.keys():
                for ts, step, val in data[k]:
                    if step <= curr_step.get(k, -1):
                        continue
                    _run.log_scalar(k, val, step)
                    if k == 'loss':
                        _run.log_scalar('tb_ts', ts, step)
                        print('Logged to sacred: step={},loss={},tb_ts={}'.format(step, val, ts))

                    # found something new, so logging!
                    last_tb_log_time = time.time()

                    curr_step[k] = step

            for f in glob.glob('predictions_{}_*'.format(_run._id)):
                if f in seen_predictions:
                    continue
                print('collecting prediction file', f)
                ex.add_artifact(f)

                seen_predictions.add(f)

            # collect eval metrics from jsonl
            if os.path.exists(f'eval_{_run._id}.jsonl'):
                with open(f'eval_{_run._id}.jsonl') as fh:
                    for line in fh:
                        ob = json.loads(line)
                        val_step = ob['global_step']
                        val_task = ob['task']
                        for metr in ob.keys():
                            k = 'fs.' + val_task + '.' + metr
                            if metr in ['task', 'global_step']: continue
                            if val_step <= curr_step.get(k, -1): continue
                            _run.log_scalar(k, ob[metr], val_step)
                            curr_step[k] = val_step

            if time.time() - last_tb_log_time > heartbeat_timeout:
                # the run hasn't logged in a while, so we restart it
                q.put(('kill',))

                # give training thread some time to do its thing and recreate tpu
                while trainthd.is_alive():
                    print('logging thread waiting for killing stalled run and for tpu recreate to finish')
                    time.sleep(60)

                # reset heartbeat timeout to initial
                heartbeat_timeout = args.initial_heartbeat_timeout
                last_tb_log_time = time.time()

        if args.no_delete_tpu:
            break


def goodbye(id):
    print("You are now leaving the Python sector.")
    print("Sie verlassen den pythonischen Sektor.")

    os.system("screen -S tensorboard_{} -X quit".format(id))


if __name__ == '__main__':
    for file in glob.glob("**/*", recursive=True):
        if file.split('.')[-1] in ['py']:
            print('Adding', file, 'to sacred')
            ex.add_source_file(file)

    ex.add_config({
        'tpu_name': args.tpu,
        **params
    })

    ex.run()
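run_experiment.py wraps main.py with sacred logging, a tensorboard scraper, and automatic TPU recreation on stalled runs. A typical invocation, with an illustrative TPU name and one of the configs shipped in configs/, might look like the line below; it also assumes a MongoDB instance is reachable at 127.0.0.1:27017 for the QueuedMongoObserver:

python3 run_experiment.py --tpu my-tpu-v3-8 --model gpt3_small_256 --experiment_name gpt3-small-test --steps_per_checkpoint 5000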
sample.py
ADDED
@@ -0,0 +1,218 @@
import mesh_tensorflow as mtf
import tensorflow.compat.v1 as tf
import mesh_tensorflow.transformer as mtf_transformer

from models.utils import entmax, sample_categorical
from models.gpt2 import gpt2


def sample_autoregressive(partial_sequences,
                          other_features,
                          params,
                          stop_at_token=50256,
                          max_steps=None,
                          temperature=0.9,
                          variable_dtype=mtf.VariableDType(tf.float32),
                          encoder_output=None,
                          encoder_sequence_id=None,
                          encoder_inputs=None,
                          shared_params=None,
                          has_partial_sequences=True,
                          encoder_layer_outputs=None,
                          never_end=False,
                          remove_partial_sequences=False,
                          sampling_keep_top_k=-1,
                          sampling_use_entmax=False,
                          bos_id=50256,
                          ):
    """Sample randomly one token at a time.

    The partial_sequences represent partial sequences to be continued. The
    first tokens of each sequence are nonzero representing the given partial
    sequences and the last tokens of each sequence are zeros, representing what
    needs to be filled in.

    If there are no partial sequences (you want to sample from the beginning),
    then pass partial_sequences=mtf.zeros(mesh, shape, dtype=tf.int32) and
    has_partial_sequences=False (so we can skip computation).

    Args:
      partial_sequences: an int32 Tensor with shape [<batch_dims>, length_dim]
      stop_at_token: an optional integer eos id. Stop when we produce it.
      max_steps: an optional integer, the max number of steps to decode.
      temperature: an optional floating point value between 0.0 and 1.0 0.0
        means argmax, 1.0 means sample according to predicted distribution.
      variable_dtype: a mtf.VariableDType
      encoder_output: an optional Tensor
      encoder_sequence_id: an optional Tensor
      encoder_inputs: an optional Tensor
      shared_params: an optional dictionary
      has_partial_sequences: a boolean
      encoder_layer_outputs: optional - readonly list of tensor activations when
        decoding, one per each input layer + the embedding layer
      never_end: a boolean - if set, then avoid generating stop_at_token
      remove_partial_sequences: a boolean - whether to remove the partial
        sequences from the output
      sampling_keep_top_k: an integer - if not -1, only sample from the top k
        logits.
      bos_id: beginning of sequence id

    Returns:
      a Tensor with shape [<batch_dims>, length_dim]
    """

    inputs = partial_sequences  # Partial sequences to fill in
    batch_dims = inputs.shape.dims[:-1]
    length_dim = inputs.shape.dims[-1]
    padding_id = params.get("padding_id", 0)
    slow_sampling = params.get("slow_sampling", False)

    initial_position = mtf.reduce_sum(
        mtf.to_int32(mtf.not_equal(inputs, padding_id)), reduced_dim=length_dim)  # Gets position where zero padding starts

    length_range = mtf.range(inputs.mesh, length_dim, tf.int32)
    input_full_attention = True  # for now hardcode this to true bc lazy
    if input_full_attention:
        # Vanilla autoregressive model - each position can see previous positions.
        # Think this feeds in to the loop fn and tells each position where it can attend to?
        read_priority = write_priority = length_range * mtf.to_int32(
            mtf.greater(length_range, initial_position))
    else:
        read_priority = write_priority = length_range

    # Builds context to pass around internally
    # The 'first part' context records initial states of k / v / x

    if not slow_sampling:
        context_first_part = mtf_transformer.transformer.Context(
            model=None,
            mesh=inputs.mesh,
            batch_dims=batch_dims,
            length_dim=length_dim,
            variable_dtype=variable_dtype,
            mode="first_part",
            position=length_range,
            position_is_default=True,
            new_states=[],
            initial_position=initial_position,
            sequence_id=None,
            encoder_output=encoder_output,
            encoder_sequence_id=encoder_sequence_id,
            constant_states=[],
            shared_params=shared_params,
            encoder_layer_outputs=encoder_layer_outputs,
            write_priority=write_priority,
            read_priority=read_priority,
            inputs=inputs,
            encoder_inputs=encoder_inputs)

        with tf.variable_scope("gpt2"):
            logits, _, _ = gpt2.model({"inputs": inputs}, other_features, params, inputs.mesh, variable_dtype=variable_dtype, context=context_first_part)

        if not has_partial_sequences:
            initial_states = [mtf.zeros_like(t) for t in context_first_part.new_states]
        else:
            initial_states = context_first_part.new_states
    else:
        initial_states = []

    if not has_partial_sequences:
        partial_sequences_eos_count = 0

    if stop_at_token is not None:
        partial_sequences_eos_count = mtf.reduce_sum(
            mtf.to_int32(mtf.equal(partial_sequences, stop_at_token)),
            reduced_dim=length_dim)

    def cond_fn(position, ids, *unused_states):
        """Should we run another loop iteration?"""
        past_end = mtf.greater_equal(position, length_dim.size)
        if max_steps:
            past_end = mtf.logical_or(
                past_end, mtf.greater_equal(position - initial_position, max_steps))

        is_done = past_end
        if stop_at_token is not None:
            eos_count = mtf.reduce_sum(
                mtf.to_int32(mtf.equal(ids, stop_at_token)),
                reduced_dim=length_dim)
            has_additional_eos = mtf.greater(eos_count, partial_sequences_eos_count)
            is_done = mtf.logical_or(is_done, has_additional_eos)
        all_done = mtf.reduce_all(is_done)
        return mtf.logical_not(all_done)

    def body_fn(position, ids, *states):
        """One step in the decode loop."""
        nonlocal sampling_keep_top_k

        context = mtf_transformer.transformer.Context(
            model=None,
            mesh=inputs.mesh,
            batch_dims=batch_dims,
            length_dim=length_dim,
            variable_dtype=variable_dtype,
            mode="incremental",
            position=position,
            position_is_default=True,
            states=states,
            new_states=[],
            initial_position=position,
            sequence_id=None,
            encoder_output=encoder_output,
            encoder_sequence_id=encoder_sequence_id,
            shared_params=shared_params,
            encoder_layer_outputs=encoder_layer_outputs,
            write_priority=write_priority,
            read_priority=read_priority,
            inputs=ids,
            encoder_inputs=encoder_inputs) if not slow_sampling else None

        with tf.variable_scope("gpt2", reuse=tf.AUTO_REUSE):
            logits, _, _ = gpt2.model({"inputs": ids}, other_features, params, inputs.mesh, variable_dtype=variable_dtype, context=context)

        if not sampling_use_entmax:
            # By default, do top_k sampling of 0.9
            if sampling_keep_top_k == -2:
                sampling_keep_top_k = int(logits.shape[-1].size * 0.1)

            if sampling_keep_top_k != -1:
                if sampling_keep_top_k <= 0:
                    raise ValueError("sampling_keep_top_k must either be -1 or positive.")
                k_largest = mtf.nth_largest_element(
                    logits, n=sampling_keep_top_k,
                    reduced_dim=other_features["vocab_dim"])
                logits = mtf.where(mtf.less_equal(logits, k_largest),
                                   mtf.ones_like(logits) * -1e6, logits)

            ids_this_step = mtf.sample_with_temperature(
                logits, other_features["vocab_dim"], temperature)
        else:
            ids_this_step = sample_categorical(entmax(logits))

        if slow_sampling:
            ids_this_step = mtf.shift(ids_this_step, offset=1, dim=length_dim, wrap=False)
        else:
            ids_this_step = mtf.reshape(ids_this_step, (batch_dims))

        one_hot = mtf.one_hot(position, length_dim, dtype=tf.int32)
        one_new_id = ids_this_step * one_hot
        new_ids = (1 - one_hot) * ids + one_new_id
        new_position = position + 1

        ret = [new_position, new_ids]
        if context is not None:
            ret += context.new_states
        return ret

    while_loop_inputs = [initial_position, inputs] + initial_states
    final_position, outputs = mtf.while_loop(
        cond_fn, body_fn, while_loop_inputs)[:2]
    del final_position
    if has_partial_sequences and remove_partial_sequences:
        # Remove partial sequences from outputs
        partial_length = mtf.reduce_sum(
            mtf.to_int32(mtf.not_equal(partial_sequences, padding_id)),
            reduced_dim=length_dim)
        outputs = mtf.dynamic_shift(
            outputs, -partial_length, length_dim, wrap=False)
    return outputs
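The in-place token update in body_fn (write the sampled id at the current position and keep everything else) is just a one-hot blend; a toy NumPy version of that single step, with made-up ids:

import numpy as np

ids = np.array([12, 7, 0, 0])   # partially filled sequence, 0 is padding
position, sampled_id = 2, 99
one_hot = (np.arange(len(ids)) == position).astype(ids.dtype)
ids = (1 - one_hot) * ids + sampled_id * one_hot
assert list(ids) == [12, 7, 99, 0]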
tasks.py
ADDED
@@ -0,0 +1,116 @@
import os.path
import json
import requests
import numpy as np
import ftfy
from data.encoders import fetch_encoder, encode
import tensorflow as tf
import re
from functools import partial

lambada_src_uri = 'http://eaidata.bmk.sh/data/lambada_test.jsonl'
normalization = 'NFKC'


# Note: this task is called "lambada" but it really refers to OpenAI's version
# of the task, which actually differs in some ways from the task described in
# the original paper. So, strictly speaking, accuracy values from this task
# should not be compared to accuracy values from the original lambada task.
# For more information, see
# https://github.com/openai/gpt-2/issues/131

def lambada_create_tokens_data(params, path):
    with open(path, 'w') as f:
        req = requests.get(lambada_src_uri)
        req.raise_for_status()
        jsons = [json.loads(l) for l in req.iter_lines()]
        texts = [ftfy.fix_text(j['text'], normalization=normalization) for j in jsons]
        enc = fetch_encoder(params)
        arrays = [encode(enc, t) for t in texts]
        json.dump(arrays, f)
        return arrays


def lambada_read_or_create_tokens_data(params, path):
    # if you tell me where the file should go, i will helpfully create it for you
    if not os.path.exists(path):
        return lambada_create_tokens_data(params, path)
    with open(path) as f:
        return json.load(f)


def bin_pack(params, tokens_data):
    eos_token = params['eos_id']
    n_ctx = params['n_ctx']
    dummy_token = 1
    pad_batch_size = params['eval_batch_size']
    bins = []
    for a in tokens_data:
        if len(bins) == 0 or len(bins[-1]) + len(a) + 1 > n_ctx:
            bins.append([])
        bins[-1] += a
        bins[-1].append(eos_token)
    while len(bins) % pad_batch_size != 0:
        bins.append([])
    bins_array = np.full((len(bins), n_ctx), dummy_token, dtype=np.uint16)
    for i, b in enumerate(bins):
        bins_array[i, 0:len(b)] = b
    return bins_array


def lambada_init(params):
    ds_configs = params['dataset_configs']
    l = [
        ds_configs[ds_id].get('lambada_tokens_path', "./lambada.json")
        for ds_id, _, _, _ in params['datasets']
    ]
    assert len(l) > 0, 'lambada_tokens_path not found in the dataset config'
    lt_path = l[0]
    assert lt_path.endswith('.json'), 'lambada_tokens_path must have extension json'

    tokens_data = lambada_read_or_create_tokens_data(params, lt_path)
    bins_array = bin_pack(params, tokens_data)
    params['lambada_tokens_path'] = lt_path
    params['lambada_n_steps'] = len(bins_array) // params['eval_batch_size']


def lambada_get_task_info(params):
    return {
        'n_steps': params['lambada_n_steps'],
    }


# The LAMBADA evaluation code looks at the logits of each position just before an eos_token
def lambada_input(params):
    eos_token = 50256 if params['n_vocab'] >= 50257 else 0
    n_ctx = params['n_ctx']
    lt_path = params['lambada_tokens_path']
    tokens_data = lambada_read_or_create_tokens_data(params, lt_path)
    bins_array = bin_pack(params, tokens_data)
    dataset = tf.data.Dataset.from_tensor_slices(bins_array)

    def _get_output(bin):
        bin = tf.cast(bin, dtype=tf.int32)
        indexes = tf.range(n_ctx)
        results = tf.gather(bin, (indexes + 1) % n_ctx)
        eos_next_positions = tf.math.equal(tf.gather(bin, (indexes + 2) % n_ctx), eos_token)
        output = tf.where(eos_next_positions, results, tf.constant(eos_token, shape=[n_ctx]))
        bin = tf.reshape(bin, [n_ctx])
        bin = tf.cast(bin, dtype=tf.int32)
        output = tf.reshape(output, [n_ctx])
        output = tf.cast(output, dtype=tf.int32)
        return bin, output

    dataset = dataset.map(_get_output)
    dataset = dataset.batch(params['eval_batch_size'], drop_remainder=True)
    dataset = dataset.repeat()
    return dataset


task_descriptors = {
    'lambada': {
        'init_fn': lambada_init,
        'get_task_info_fn': lambada_get_task_info,
        'input_fn': lambada_input,
    }
}
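bin_pack greedily concatenates tokenized examples (each followed by eos_id) into rows of length n_ctx, pads the row count up to a multiple of eval_batch_size, and fills unused positions with the dummy token 1. A toy illustration, assuming bin_pack is imported from this module and using made-up params:

params = {'eos_id': 0, 'n_ctx': 8, 'eval_batch_size': 2}
tokens_data = [[5, 6], [7, 8, 9], [10, 11, 12, 13]]
print(bin_pack(params, tokens_data))
# [[ 5  6  0  7  8  9  0  1]
#  [10 11 12 13  0  1  1  1]]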
test_models.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
import traceback
|
3 |
+
import logging
|
4 |
+
from collections import defaultdict
|
5 |
+
from contextlib import contextmanager
|
6 |
+
|
7 |
+
import tensorflow as tf
|
8 |
+
tf.compat.v1.enable_eager_execution()
|
9 |
+
import mesh_tensorflow as mtf
|
10 |
+
from mesh_tensorflow import placement_mesh_impl
|
11 |
+
|
12 |
+
from inputs import mlm_sample_text
|
13 |
+
from models.gpt2 import gpt2
|
14 |
+
from models.utils import biasmask_attn_weights, entmax, sample_categorical
|
15 |
+
|
16 |
+
from sample import sample_autoregressive
|
17 |
+
|
18 |
+
# helper functions
|
19 |
+
|
20 |
+
@contextmanager
|
21 |
+
def not_raises(exception):
|
22 |
+
try:
|
23 |
+
yield
|
24 |
+
except exception:
|
25 |
+
logging.error(traceback.format_exc())
|
26 |
+
raise pytest.fail("DID RAISE {0}".format(exception))
|
27 |
+
|
28 |
+
# fixtures
|
29 |
+
|
30 |
+
params = defaultdict(lambda: None, {
|
31 |
+
"n_head": 1,
|
32 |
+
"n_ctx": 4,
|
33 |
+
"n_embd": 2,
|
34 |
+
"n_vocab": 256,
|
35 |
+
"embed_dropout": 0.,
|
36 |
+
"n_layer": 2,
|
37 |
+
"num_microbatches": 1,
|
38 |
+
"train_batch_size": 1,
|
39 |
+
"causal": True,
|
40 |
+
"attention_types": ['global', 'local'],
|
41 |
+
"res_dropout": 0.1,
|
42 |
+
"rotary_emb": True,
|
43 |
+
"activation_function": "gelu",
|
44 |
+
"moe_layers": (1,),
|
45 |
+
"num_mem_kv": 16,
|
46 |
+
"no_weight_tie": True,
|
47 |
+
"moe_params": {
|
48 |
+
'moe_dropout_rate': 0.0
|
49 |
+
},
|
50 |
+
"mesh_shape": [],
|
51 |
+
"layout": {},
|
52 |
+
"local_attention_radius": 128,
|
53 |
+
"share_parameters": True,
|
54 |
+
"rezero": True
|
55 |
+
})
|
56 |
+
|
57 |
+
# tests
|
58 |
+
|
59 |
+
def test_model():
|
60 |
+
graph = mtf.Graph()
|
61 |
+
mesh = mtf.Mesh(graph, "my_mesh")
|
62 |
+
|
63 |
+
seq_len = params["n_ctx"]
|
64 |
+
|
65 |
+
batch_dim = mtf.Dimension("batch", 1)
|
66 |
+
sequence_dim = mtf.Dimension("sequence", seq_len)
|
67 |
+
|
68 |
+
features = {
|
69 |
+
'inputs': mtf.ones(mesh, mtf.Shape((batch_dim, sequence_dim)), tf.int32),
|
70 |
+
'labels': mtf.ones(mesh, mtf.Shape((batch_dim, sequence_dim)), tf.int32)
|
71 |
+
}
|
72 |
+
|
73 |
+
# create mask
|
74 |
+
|
75 |
+
num_mem_kv = params.get('num_mem_kv', 0)
|
76 |
+
length_dim = mtf.Dimension('sequence', seq_len)
|
77 |
+
memory_length_dim = mtf.Dimension('memory_length', seq_len + num_mem_kv)
|
78 |
+
embed_sequence_dim = mtf.Dimension('embed_sequence', seq_len)
|
79 |
+
embd_dim = mtf.Dimension("embd", params["n_embd"])
|
80 |
+
vocab_dim = mtf.Dimension("vocab", params["n_vocab"])
|
81 |
+
|
82 |
+
other_features = {}
|
83 |
+
variable_dtype = mtf.VariableDType(tf.float32, tf.float32, tf.float32)
|
84 |
+
|
85 |
+
other_features["attn_bias"] = biasmask_attn_weights(mesh, length_dim, memory_length_dim, variable_dtype)
|
86 |
+
other_features["embd_dim"] = embd_dim
|
87 |
+
other_features["vocab_dim"] = vocab_dim
|
88 |
+
other_features["embed_sequence_dim"] = embed_sequence_dim
|
89 |
+
other_features["memory_length_dim"] = memory_length_dim
|
90 |
+
|
91 |
+
with not_raises(Exception):
|
92 |
+
logits, _, _ = gpt2.model(features, other_features, params, mesh, variable_dtype=variable_dtype)
|
93 |
+
|
94 |
+
mesh_impl = placement_mesh_impl.PlacementMeshImpl(shape=[], layout={}, devices=[""])
|
95 |
+
lowering = mtf.Lowering(graph, {mesh: mesh_impl})
|
96 |
+
logits = lowering.export_to_tf_tensor(logits)


def test_sampling():
    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, "my_mesh")

    batch_dim = mtf.Dimension("batch", 1)
    sequence_dim = mtf.Dimension("sequence", 1)

    inputs = mtf.ones(mesh, mtf.Shape((batch_dim, sequence_dim)), tf.int32)
    inputs = mtf.pad(inputs, [0, 3], sequence_dim.name)

    # create mask

    seq_len = params["n_ctx"]
    num_mem_kv = params.get('num_mem_kv', 0)
    length_dim = mtf.Dimension('sequence', seq_len)
    memory_length_dim = mtf.Dimension('memory_length', seq_len + num_mem_kv)
    embed_sequence_dim = mtf.Dimension('embed_sequence', seq_len)
    embd_dim = mtf.Dimension("embd", params["n_embd"])
    vocab_dim = mtf.Dimension("vocab", params["n_vocab"])

    other_features = {}

    other_features["attn_bias"] = biasmask_attn_weights(mesh, length_dim, memory_length_dim, mtf.VariableDType(tf.float32))
    other_features["embd_dim"] = embd_dim
    other_features["vocab_dim"] = vocab_dim
    other_features["embed_sequence_dim"] = embed_sequence_dim
    other_features["memory_length_dim"] = memory_length_dim

    params["mode"] = "predict"

    with not_raises(Exception):
        samples = sample_autoregressive(
            inputs, other_features=other_features, params=params, variable_dtype=mtf.VariableDType(),
            remove_partial_sequences=params["remove_partial_sequences"], stop_at_token=params["eos_id"], sampling_use_entmax=True)

        mesh_impl = placement_mesh_impl.PlacementMeshImpl(shape=[], layout={}, devices=[""])
        lowering = mtf.Lowering(graph, {mesh: mesh_impl})
        samples = lowering.export_to_tf_tensor(samples)

# mlm

mlm_params = defaultdict(lambda: None, {
    "n_head": 1,
    "n_ctx": 4,
    "n_embd": 1,
    "n_vocab": 256,
    "embed_dropout": 0.,
    "n_layer": 2,
    "num_microbatches": 1,
    "train_batch_size": 1,
    "attention_types": ['global', 'local'],
    "res_dropout": 0.1,
    "mesh_shape": [],
    "layout": {},
    "share_parameters": True,
    "mlm_training": True,
    "mlm_mask_id": 3,
    "mlm_cls_token_id": 4,
    "mlm_random_token_prob": 0.1
})

def test_mlm_sample_text():
    document = tf.random.normal((16,))
    with not_raises(Exception):
        features, labels = mlm_sample_text(mlm_params, document, random_documents = True)
        assert features.shape == (mlm_params['n_ctx'],)

# entmax

def test_entmax():
    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, "my_mesh")
    length = mtf.Dimension("tensor_length", 8)
    tensor = mtf.range(mesh, length, tf.float32)
    output = entmax(tensor)
    grad = mtf.gradients([output], [tensor])[0]
    sample = sample_categorical(output, length)

    mesh_impl = placement_mesh_impl.PlacementMeshImpl(shape=[], layout={}, devices=[""])
    lowering = mtf.Lowering(graph, {mesh: mesh_impl})
    sample = lowering.export_to_tf_tensor(sample)
    grad = lowering.export_to_tf_tensor(grad)
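    # Exporting both the sample and the gradient checks that entmax not only
    # builds in the forward direction but also has a gradient that can be
    # constructed and lowered.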
utils.py
ADDED
@@ -0,0 +1,291 @@
import re
from urllib.parse import urlparse
from shutil import rmtree
import logging
import os
from pathlib import Path
import sys
import tensorflow.compat.v1 as tf
import tensorflow.compat.v2 as tf2
import mesh_tensorflow as mtf
from data.encoders import fetch_encoder


def setup_logging(args):
    Path("logs").mkdir(exist_ok=True)
    tf.logging.set_verbosity(logging.INFO)
    tf.get_logger().propagate = False  # Remove double log on console
    name = os.path.splitext(os.path.basename(args.model))[0]
    handlers = [
        logging.FileHandler(f"logs/{name}.log"),
        logging.StreamHandler(sys.stdout)
    ]
    logger = logging.getLogger("tensorflow")
    logger.handlers = handlers
    return logger


def get_batch_size(params):
    return params[f"{params['mode']}_batch_size"]
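# Illustrative example (hypothetical params): with
# params = {"mode": "train", "train_batch_size": 8} this returns 8 -- the batch
# size is looked up for whichever mode is currently active.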


def add_mode_to_params(params, mode):
    if mode == tf.estimator.ModeKeys.PREDICT:
        params["mode"] = "predict"
    elif mode == tf.estimator.ModeKeys.EVAL:
        params["mode"] = "eval"
    elif mode == tf.estimator.ModeKeys.TRAIN:
        params["mode"] = "train"
    else:
        raise ValueError(f"Invalid mode {mode}")
    return params


def simd_mesh_setup(params, mesh_shape, layout_rules):
    """Constructs SimdMesh function - instructions on how to evenly split tensors across all TPU cores"""

    num_hosts = params["context"].num_hosts
    host_placement_fn = params["context"].tpu_host_placement_function
    device_list = [host_placement_fn(host_id=i) for i in range(num_hosts)]
    tf.logging.info(f"device_list = {device_list}")

    # TODO: Better estimation of replica cache size?
    replica_cache_size = 300 * 1000000  # 300M per replica

    # Worker 0 caches all the TPU binaries
    worker0_mem = replica_cache_size * params["context"].num_replicas
    devices_memory_usage = [worker0_mem] + [0] * (num_hosts - 1)
    var_placer = mtf.utils.BalancedVariablePlacer(device_list, devices_memory_usage)
    mesh_devices = [""] * mesh_shape.size
    mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
        mesh_shape, layout_rules, mesh_devices, params["context"].device_assignment)

    return var_placer, mesh_impl


def remove_batch_from_layout(layout):
    """
    The tf-mesh layout splits across batch size, remove it.
    Useful for prediction steps, when you no longer want large batches.

    :param layout: string describing tf-mesh layout
    :return: layout minus batch dimension
    """
    layout = layout.split(',')
    ret_layout = ""
    for i in layout:
        if "batch" in i:
            pass
        else:
            ret_layout += f"{i},"
    return ret_layout[:-1]
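# Illustrative example (made-up layout string):
#   remove_batch_from_layout("batch:x,heads:y") -> "heads:y"
# Every comma-separated entry mentioning "batch" is dropped and the rest rejoined.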


def yes_or_no(question):
    while True:
        reply = str(input(question+' (y/n): ')).lower().strip()
        if reply[:1] == 'y':
            return True
        if reply[:1] == 'n':
            return False


def remove_gs_or_filepath(path):
    parsed_url = urlparse(path)
    if parsed_url.scheme == "gs":
        os.system(f"gsutil rm -rf {path}")
        return
    rmtree(path)


def save_config(params_dict, logdir):
    print(f"Saving config to {logdir}")
    text = "{\n\n"
    total_params = len(params_dict)
    for count, key in enumerate(params_dict):
        config_value = str(params_dict[key])
        if re.search('[a-zA-Z]', config_value):
            if config_value.lower() != 'true':
                if config_value.lower() != 'false':
                    if config_value[0] != '[':
                        # TODO: Making a manual exception for parsing epsilon right now since it's the only number in
                        # scientific notation. Should fix this.
                        if key != "epsilon":
                            config_value = f'"{config_value}"'
        if count == total_params - 1:
            text += f'"{str(key)}"' + ' : ' + config_value + '\n\n'
        else:
            text += f'"{str(key)}"' + ' : ' + config_value + ',\n\n'
    text += '\n\n}'
    sess = tf.InteractiveSession()
    summary_op = tf.summary.text("run_config", tf.convert_to_tensor(text))
    summary_writer = tf.summary.FileWriter(f"{logdir}/config", sess.graph)
    text = sess.run(summary_op)
    summary_writer.add_summary(text, 0)
    summary_writer.flush()
    summary_writer.close()
    tf.reset_default_graph()
    print('Done!')
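# save_config serializes the params dict into JSON-ish text and writes it as a
# TensorBoard text summary under {logdir}/config, so a run's configuration can be
# inspected next to its training curves.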


def expand_attention_types_params(params_list):
    newlist = []
    for item in params_list:
        for _ in range(item[1]):
            newlist.extend(item[0])
    return newlist
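# Illustrative example: [[["global", "local"], 2]] expands to
# ["global", "local", "global", "local"] -- each [pattern, n] pair repeats its
# pattern n times, yielding one attention type per layer.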


def get_n_trainable_vars(graph):
    """
    Gets number of trainable vars in an MTF model.

    :param graph: Mesh-Tensorflow graph
    :return: None
    """
    total_parameters = 0
    for variable in graph.trainable_variables:
        shape = variable.shape.dims
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.size
        total_parameters += variable_parameters
    print(f"\n\nN TRAINABLE VARS:\n{total_parameters:,}\n\n")


def print_dim_names(graph):
    """
    Print names of all Dimensions
    :param graph: Mesh-Tensorflow graph
    :return: None
    """
    all_dim_names = []
    for variable in graph.all_variables:
        names = variable.shape.dimension_names
        all_dim_names.append(names)

    # Print all unique dim names in the graph
    all_dim_names = [item for sublist in all_dim_names for item in sublist]  # Flatten all dims
    unique_dims = list(set(all_dim_names))
    print("ALL DIM NAMES:")
    for dim_name in unique_dims:
        print(dim_name)
    print('\n')


def get_graph_info(graph):
    """
    Wrapper fn that calculates the number of trainable vars in an MTF graph & prints all dim names
    TODO: how to get un-trainable dim-names too, batch etc.

    :param graph: Mesh-Tensorflow graph
    :return: None
    """
    get_n_trainable_vars(graph)
    print_dim_names(graph)


def loss_denominator(targets, num_microbatches):
    """Denominator applied to losses.

    This is usually the size of the targets tensor (omitting ensemble
    dimensions). Alternatively, it is an override value passed to the
    class constructor.

    Args:
        targets: a mtf.Tensor
        num_microbatches: an integer - greater than one if the step has been
            serialized into multiple microbatches to save memory.
    Returns:
        a float
    """
    ret = float(targets.shape.size) * num_microbatches
    return float(ret)
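# Illustrative example: for a targets tensor of shape [batch=2, sequence=1024] and
# num_microbatches=2 the denominator is 2 * 1024 * 2 = 4096.0, since
# targets.shape.size is the total number of elements in the targets tensor.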


def check_dataset(input_fn, params, global_step=None):
    tf.enable_eager_execution()
    if global_step is not None:
        dataset = input_fn(params, global_step=global_step)
    else:
        dataset = input_fn(params)
    dataset_iter = dataset.make_one_shot_iterator()
    tensor, _ = next(dataset_iter)
    enc = fetch_encoder(params)

    for p in tensor[:1]:
        txt = enc.decode(p)

    print('-' * 50)
    print(txt[:500], '\n\n...\n\n', txt[-500:])
    print('-' * 50)
    exit()
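# check_dataset is a manual debugging helper: it eagerly pulls one batch from
# input_fn, decodes the first example back to text with the configured encoder,
# prints its head and tail, and then terminates the process.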


def auto_layout(graph, mesh_shape, logits, loss):
    layout_rules = mtf.auto_mtf.layout(graph, mesh_shape, [logits, loss])
    print(f"Auto-selected layout:\n{layout_rules}\nRe-initialize graph with selected layout")
    quit()


def auto_layout_and_mesh_shape(graph, num_cores, logits, loss):
    layout_rules, mesh_shape = mtf.auto_mtf.layout_and_mesh_shape(graph, num_cores,
                                                                  [logits, loss], max_mesh_shape_dimensions=4)
    print(f"Num cores:\n{num_cores}\nAuto-selected layout:\n{layout_rules}\nAuto-selected mesh shape:\n{mesh_shape}" \
          f"\nRe-initialize graph with selected layout & mesh shape")
    quit()


def create_host_call(model_dir):
    """Construct a host_call writing scalar summaries.

    Borrowed from t2t.

    Args:
        model_dir: String containing path to the model directory
    Returns:
        (fn, args) Pair to be called by TPUEstimator as the host_call.
    """

    graph = tf.get_default_graph()
    # A list of (name, lowered tensor) tuples
    summaries = graph.get_collection(mtf.utils.SCALAR_SUMMARIES_COLLECTION_KEY)

    def maybe_cast(tensor):
        assert tensor.shape.is_compatible_with([]), tensor.name
        if tensor.dtype == tf.int64:
            return tf.to_int32(tensor)
        if tensor.dtype == tf.bfloat16:
            return tf.cast(tensor, tf.float32)
        return tensor

    reshaped_tensors = [tf.reshape(maybe_cast(t), [1]) for _, t in summaries]

    # When no supported summaries are found, don't create host_call. Otherwise,
    # TPU outfeed queue would enqueue global_step while host_call doesn't dequeue
    # it, eventually causing hang.
    if not reshaped_tensors:
        return None

    def host_call_fn(global_step, *args):
        """Training host call. Creates scalar summaries for training metrics."""
        # This function is executed on the CPU and should not directly reference
        # any Tensors in the rest of the `model_fn`. To pass Tensors from the
        # model to the `model_fn`, provide as part of the `host_call`.
        global_step = tf.cast(global_step[0], tf.int64)
        with tf2.summary.create_file_writer(model_dir).as_default():
            # We cannot directly use any tensor from summaries, because each
            # tensor here must be a concat of multiple tensors from all shards.
            # Therefore, we rely on the assumption that args will have the same
            # length as summaries, and all tensors in args will have the same
            # order as summaries.
            assert len(args) == len(summaries)
            for i, tensor in enumerate(args):
                name = summaries[i][0]
                tf2.summary.scalar(name, tf.reduce_mean(tensor), step=global_step)
        return tf.summary.all_v2_summary_ops()

    global_step_t = tf.reshape(tf.to_int32(tf.train.get_global_step()), [1])
    return host_call_fn, [global_step_t] + reshaped_tensors


def natural_sort(l):
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
    return sorted(l, key = alphanum_key)
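# Illustrative example (hypothetical checkpoint names):
#   natural_sort(["ckpt-10", "ckpt-2", "ckpt-9"]) -> ["ckpt-2", "ckpt-9", "ckpt-10"]
# whereas plain sorted() would order "ckpt-10" before "ckpt-2" because it compares
# the digits as characters.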