Dan Fu committed
Commit 5059123
1 Parent(s): d305250

32k retrieval model

- README.md +12 -0
- config.json +4 -0
- config.yaml +38 -0
- model.bin +3 -0
- version.txt +1 -0
README.md
CHANGED
@@ -1,3 +1,15 @@
 ---
 license: apache-2.0
+language:
+- en
+pipeline_tag: text-classification
 ---
+
+# Monarch Mixer-BERT
+
+The 80M checkpoint for M2-BERT-base from the paper [Monarch Mixer: A Simple Sub-Quadratic GEMM-Based Architecture](https://arxiv.org/abs/2310.12109).
+This model has been pretrained with sequence length 32768, and it has been fine-tuned for retrieval.
+
+This model was trained by Dan Fu, Jon Saad-Falcon, and Simran Arora.
+
+Check out our [GitHub](https://github.com/HazyResearch/m2/tree/main) for instructions on how to download and fine-tune it!
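As a usage sketch (not part of this commit): because config.json below declares the custom `m2_bert` model type, loading the checkpoint through Hugging Face `transformers` would typically require `trust_remote_code=True`. The repo id below is hypothetical; the GitHub link above has the authoritative instructions.

```python
# Minimal sketch, not from this commit: load the checkpoint and embed a query.
# The repo id is hypothetical, and trust_remote_code=True is assumed because
# "m2_bert" is a custom model type not built into transformers.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained(
    "hazyresearch/m2-bert-80M-32k-retrieval",  # hypothetical repo id
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # per config.yaml

inputs = tokenizer("What is Monarch Mixer?", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)  # the repo's custom code defines the retrieval-embedding output
```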
config.json
ADDED
@@ -0,0 +1,4 @@
+{
+    "model_type": "m2_bert"
+}
+
config.yaml
ADDED
@@ -0,0 +1,38 @@
+# Note that some of the fields in this template haven't been filled in yet.
+# Please resolve any `null` fields before launching!
+
+precision: amp_bf16
+max_seq_len: 8192
+
+# Tokenizer for dataset creation
+tokenizer_name: bert-base-uncased
+
+# Base model config
+model:
+  name: bert
+  pretrained_model_name: ${tokenizer_name}
+  tokenizer_name: ${tokenizer_name}
+  model_config:
+    num_attention_heads: 12
+    num_hidden_layers: 12
+    attention_probs_dropout_prob: 0.0
+    max_position_embeddings: 8192
+
+    monarch_mixer_sequence_mixing: True
+    long_conv_l_max: 8192
+    long_conv_kernel_learning_rate: 1e-3
+    hyena_lr_pos_emb: 1e-5
+    hyena_w: 10
+    hyena_wd: 0.1
+    hyena_emb_dim: 5
+    hyena_filter_order: 128
+    hyena_training_additions: False
+
+    bidirectional: true
+    residual_long_conv: true
+
+    use_glu_mlp: True
+    use_monarch_mlp: True
+    monarch_mlp_nblocks: 4
+    use_positional_encodings: True
+
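One detail worth noting in this config: the `${tokenizer_name}` values are variable interpolations resolved against the top-level `tokenizer_name` key. A minimal sketch of resolving them, assuming the OmegaConf library (whose interpolation syntax this matches):

```python
# Minimal sketch, assuming OmegaConf: load config.yaml and resolve the
# ${tokenizer_name} interpolations against the top-level key.
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")
resolved = OmegaConf.to_container(cfg, resolve=True)
print(resolved["model"]["pretrained_model_name"])  # -> bert-base-uncased
print(resolved["model"]["model_config"]["long_conv_l_max"])  # -> 8192
```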
model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b767627648f4f8b55274530cac6971aa5056f59f4197c35e7b3729bd7fab7073
+size 440881303
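model.bin is checked in as a Git LFS pointer rather than raw weights: `oid` is the SHA-256 of the actual file and `size` its byte count. A minimal sketch of verifying a downloaded copy against this pointer (the local path is assumed):

```python
# Minimal sketch: verify a local model.bin against the LFS pointer above.
import hashlib
import os

path = "model.bin"  # assumed local path after fetching the real file (e.g. git lfs pull)

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

assert os.path.getsize(path) == 440881303
assert sha.hexdigest() == "b767627648f4f8b55274530cac6971aa5056f59f4197c35e7b3729bd7fab7073"
```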
version.txt
ADDED
@@ -0,0 +1 @@
+1