// File size: 13,424 Bytes
//
// 484d56b

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <map>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include <torch/torch.h>

// Define device
torch::Device device(torch::kCUDA);

// Define constants
const int batch_size = 8;
const int block_size = 32;
const int max_iters = 1000;
const int eval_interval = 50;
const int eval_iters = 5;
const int d_model = 256;
const int n_layer = 16;
const int n_head = 12;
const float dropout = 0.2;
const float norm_eps = 1e-5;
const int vocab_size = 5;

// sample data
torch::Tensor train_data = torch::rand({1000, block_size});
torch::Tensor val_data = torch::rand({500, block_size});

// Data loading function
std::pair<torch::Tensor, torch::Tensor> get_batch(const std::string& split) {
    torch::Tensor data = (split == "train") ? train_data : val_data;
    torch::Tensor ix = torch::randint(data.size(0) - block_size, {batch_size});
    torch::Tensor x = torch::empty({batch_size, block_size});
    torch::Tensor y = torch::empty({batch_size, block_size});
    for (int i = 0; i < batch_size; ++i) {
        x[i] = data.index({ix[i], ix[i] + block_size});
        y[i] = data.index({ix[i] + 1, ix[i] + block_size + 1});
    }
    return std::make_pair(x.to(device), y.to(device));
}

// Custom classes and functions
// Parameter-free activation module that mixes relu(x) and x using a
// sigmoid gate computed from x itself.
class SWiGLU : public torch::nn::Module {
public:
    SWiGLU() = default;

    // Element-wise: sigmoid(x) * relu(x) + (1 - sigmoid(x)) * x.
    torch::Tensor forward(torch::Tensor x) {
        const torch::Tensor gate = torch::sigmoid(x);
        const torch::Tensor rectified = torch::relu(x);
        return gate * rectified + (1 - gate) * x;
    }
};

// One self-attention head without a mask: every position of the sequence
// may attend to every other position.
class UnMaskedHeadImpl : public torch::nn::Module {
public:
    // d_model   - size of the last dimension of the tensor given to forward()
    // head_size - output width of the key / query / value projections
    // dropout   - drop probability applied to the attention weights
    UnMaskedHeadImpl(int d_model, int head_size, float dropout)
        : key(register_module("key", torch::nn::Linear(d_model, head_size))),
          query(register_module("query", torch::nn::Linear(d_model, head_size))),
          value(register_module("value", torch::nn::Linear(d_model, head_size))),
          // Registered in the initializer list on purpose: inside the
          // constructor body the bare name `dropout` is the float parameter,
          // which shadows the member module.
          dropout(register_module("dropout", torch::nn::Dropout(dropout))) {}

    // Maps x (..., T, d_model) to the attended values (..., T, head_size).
    torch::Tensor forward(torch::Tensor x) {
        const torch::Tensor key_out = key->forward(x);
        const torch::Tensor query_out = query->forward(x);

        // Scaled dot-product attention divides the raw scores by
        // sqrt(head_size); multiplying instead would blow the scores up and
        // saturate the softmax.
        const double scale = 1.0 / std::sqrt(static_cast<double>(key_out.size(-1)));
        torch::Tensor weights = query_out.matmul(key_out.transpose(-2, -1)) * scale;
        weights = torch::softmax(weights, -1);
        weights = dropout->forward(weights);

        const torch::Tensor value_out = value->forward(x);
        return weights.matmul(value_out);
    }

private:
    torch::nn::Linear key, query, value;
    torch::nn::Dropout dropout;
};

TORCH_MODULE(UnMaskedHead);

class MaskedHeadImpl : public torch::nn::Module {
public:
    MaskedHeadImpl(int head_size, float dropout, int d_model)
        : key(register_module("key", torch::nn::Linear(d_model, head_size))),
          query(register_module("query", torch::nn::Linear(d_model, head_size))),
          value(register_module("value", torch::nn::Linear(d_model, head_size))),
          dropout(torch::nn::Dropout(dropout)) {
        register_buffer("tril", torch::tril(torch::ones(block_size, block_size)));
    }

    torch::Tensor forward(torch::Tensor x) {
        torch::Tensor key_out = key->forward(x);
        torch::Tensor query_out = query->forward(x);
        
        torch::Tensor weights = query_out.matmul(key_out.transpose(-2, -1)) * std::sqrt(key_out.size(-1));
        weights = weights.masked_fill(tril[:x.size(1), :x.size(1)] == 0, std::numeric_limits<float>::lowest());
        weights = torch::softmax(weights, -1);
        weights = dropout(weights);

        torch::Tensor value_out = value->forward(x);
        torch::Tensor out = weights.matmul(value_out);
        return out;
    }

private:
    torch::nn::Linear key, query, value;
    torch::nn::Dropout dropout;
    torch::Tensor tril;
};

TORCH_MODULE(MaskedHead);

// Multi-head attention built from n_head independent UnMaskedHead modules
// whose outputs are concatenated, passed through dropout and projected back
// to d_model features.
class MultiUnMaskedImpl : public torch::nn::Module {
public:
    // d_model - feature width of the input and of the output
    // n_head  - number of heads; each head is d_model / n_head wide
    //           (integer division)
    // dropout - drop probability, used both inside each head and on the
    //           concatenated head outputs
    MultiUnMaskedImpl(int d_model, int n_head, float dropout)
        : proj(register_module("proj", torch::nn::Linear(n_head * (d_model / n_head), d_model))),
          // Registered so train()/eval() reach it; an unregistered Dropout
          // would stay in training mode forever.
          dropout(register_module("dropout", torch::nn::Dropout(dropout))) {
        const int head_size = d_model / n_head;
        heads.reserve(n_head);
        for (int i = 0; i < n_head; ++i) {
            heads.push_back(register_module("head" + std::to_string(i),
                                            UnMaskedHead(d_model, head_size, dropout)));
        }
    }

    // Runs every head on x and returns the projected concatenation.
    torch::Tensor forward(torch::Tensor x) {
        std::vector<torch::Tensor> head_outputs;
        head_outputs.reserve(heads.size());
        for (auto& head : heads) {
            head_outputs.push_back(head->forward(x));
        }
        torch::Tensor out = torch::cat(head_outputs, -1);
        out = dropout->forward(out);
        return proj->forward(out);
    }

private:
    torch::nn::Linear proj;
    torch::nn::Dropout dropout;
    std::vector<UnMaskedHead> heads;
};

TORCH_MODULE(MultiUnMasked);

// Multi-head attention built from n_head independent MaskedHead modules
// whose outputs are concatenated, passed through dropout and projected back
// to d_model features.
class MultiMaskedImpl : public torch::nn::Module {
public:
    // d_model - feature width of the input and of the output
    // n_head  - number of heads; each head is d_model / n_head wide
    //           (integer division)
    // dropout - drop probability, used both inside each head and on the
    //           concatenated head outputs
    MultiMaskedImpl(int d_model, int n_head, float dropout)
        : proj(register_module("proj", torch::nn::Linear(n_head * (d_model / n_head), d_model))),
          // Registered so train()/eval() reach it; an unregistered Dropout
          // would stay in training mode forever.
          dropout(register_module("dropout", torch::nn::Dropout(dropout))) {
        const int head_size = d_model / n_head;
        heads.reserve(n_head);
        for (int i = 0; i < n_head; ++i) {
            // MaskedHeadImpl's constructor takes (head_size, dropout, d_model),
            // which is NOT the order used by UnMaskedHeadImpl.
            heads.push_back(register_module("head" + std::to_string(i),
                                            MaskedHead(head_size, dropout, d_model)));
        }
    }

    // Runs every head on x and returns the projected concatenation.
    torch::Tensor forward(torch::Tensor x) {
        std::vector<torch::Tensor> head_outputs;
        head_outputs.reserve(heads.size());
        for (auto& head : heads) {
            head_outputs.push_back(head->forward(x));
        }
        torch::Tensor out = torch::cat(head_outputs, -1);
        out = dropout->forward(out);
        return proj->forward(out);
    }

private:
    torch::nn::Linear proj;
    torch::nn::Dropout dropout;
    std::vector<MaskedHead> heads;
};

TORCH_MODULE(MultiMasked);

// Position-wise feed-forward network: Linear -> GELU -> Linear -> Dropout,
// with a hidden layer four times as wide as the input.
class FeedForwardImpl : public torch::nn::Module {
public:
    // d_model - input and output feature width
    // dropout - drop probability of the trailing Dropout layer
    FeedForwardImpl(int d_model, float dropout) {
        net = register_module("net", torch::nn::Sequential(
            torch::nn::Linear(d_model, 4 * d_model),
            torch::nn::GELU(),
            torch::nn::Linear(4 * d_model, d_model),
            torch::nn::Dropout(dropout)
        ));
    }

    // Applies the four layers to x in order.
    torch::Tensor forward(torch::Tensor x) {
        torch::Tensor result = net->forward(x);
        return result;
    }

private:
    // Starts empty; the constructor installs the registered layer stack.
    torch::nn::Sequential net{nullptr};
};

TORCH_MODULE(FeedForward);

// One transformer block: an unmasked attention + feed-forward pass followed
// by a masked attention + feed-forward pass, each with pre-LayerNorm and a
// residual connection. The two passes share norm1, norm2 and ffwd.
class BlockImpl : public torch::nn::Module {
public:
    // d_model  - feature width of the input and output
    // n_head   - number of heads in both attention modules
    // norm_eps - epsilon of both LayerNorm modules
    // dropout  - drop probability forwarded to attention and feed-forward
    //
    // Every sub-module is registered; without registration its parameters
    // would be missing from parameters(), would not follow to(device) and
    // would never be updated by the optimizer.
    BlockImpl(int d_model, int n_head, float norm_eps, float dropout)
        : sa_masked(register_module("sa_masked", MultiMasked(d_model, n_head, dropout))),
          sa_unmasked(register_module("sa_unmasked", MultiUnMasked(d_model, n_head, dropout))),
          ffwd(register_module("ffwd", FeedForward(d_model, dropout))),
          norm1(register_module("norm1", torch::nn::LayerNorm(torch::nn::LayerNormOptions({d_model}).eps(norm_eps)))),
          norm2(register_module("norm2", torch::nn::LayerNorm(torch::nn::LayerNormOptions({d_model}).eps(norm_eps)))) {}

    // Returns a tensor with the same shape as x.
    torch::Tensor forward(torch::Tensor x) {
        // Pass 1: unmasked attention, then feed-forward.
        torch::Tensor x2 = x + sa_unmasked->forward(norm1->forward(x));
        x = x2 + ffwd->forward(norm2->forward(x2));

        // Pass 2: masked attention, then feed-forward.
        x2 = x + sa_masked->forward(norm1->forward(x));
        x = x2 + ffwd->forward(norm2->forward(x2));

        return x;
    }

private:
    MultiMasked sa_masked;
    MultiUnMasked sa_unmasked;
    FeedForward ffwd;
    torch::nn::LayerNorm norm1, norm2;
};

TORCH_MODULE(Block);

// Token-level transformer: token + position embeddings, n_layer Block
// modules, a final LayerNorm and a linear projection to vocabulary logits.
class EnigmaImpl : public torch::nn::Module {
public:
    // vocab_size - number of distinct token ids (embedding rows, logit width)
    // block_size - longest sequence forward() accepts (position embedding rows)
    // d_model    - embedding / feature width
    // n_layer    - number of Block modules
    // n_head     - attention heads per Block
    // dropout    - drop probability forwarded to every Block
    // norm_eps   - epsilon of every LayerNorm
    EnigmaImpl(int vocab_size, int block_size, int d_model, int n_layer, int n_head, float dropout, float norm_eps)
        : toked_model(register_module("toked_model", torch::nn::Embedding(vocab_size, d_model))),
          pos_encod(register_module("pos_encod", torch::nn::Embedding(block_size, d_model))),
          norm_final(register_module("norm_final", torch::nn::LayerNorm(torch::nn::LayerNormOptions({d_model}).eps(norm_eps)))),
          linear_final(register_module("linear_final", torch::nn::Linear(d_model, vocab_size))),
          block_size(block_size) {
        block_layers.reserve(n_layer);
        for (int i = 0; i < n_layer; ++i) {
            block_layers.push_back(register_module("block" + std::to_string(i), Block(d_model, n_head, norm_eps, dropout)));
        }
        // Kept so the context length remains part of the module's buffers.
        register_buffer("block_size", torch::tensor(block_size));
        _init_weights(this);
    }

    // Re-initialises every Linear and Embedding below `module`: weights are
    // drawn from N(0, 0.02) and Linear biases are zeroed. Other module types
    // (notably LayerNorm, whose scale must start at 1) keep their defaults.
    void _init_weights(torch::nn::Module* module) {
        module->apply([](torch::nn::Module& m) {
            if (auto* linear = m.as<torch::nn::Linear>()) {
                torch::nn::init::normal_(linear->weight, 0.0, 0.02);
                if (linear->bias.defined()) {
                    torch::nn::init::zeros_(linear->bias);
                }
            } else if (auto* embedding = m.as<torch::nn::Embedding>()) {
                torch::nn::init::normal_(embedding->weight, 0.0, 0.02);
            }
        });
    }

    // idx     - integer token ids of shape (B, T) with T <= block_size
    // targets - optional integer token ids of shape (B, T)
    //
    // Without targets: returns {logits of shape (B, T, vocab), undefined tensor}.
    // With targets:    returns {logits flattened to (B*T, vocab), scalar
    //                  cross-entropy loss}.
    std::pair<torch::Tensor, torch::Tensor> forward(torch::Tensor idx, torch::Tensor targets=torch::Tensor()) {
        const int64_t seq_len = idx.size(1);
        TORCH_CHECK(seq_len <= block_size, "sequence length ", seq_len,
                    " exceeds block_size ", block_size);

        // Position ids must live on the same device as idx, otherwise the
        // embedding lookup fails once the model has been moved to the GPU.
        const torch::Tensor positions = torch::arange(
            seq_len, torch::TensorOptions().dtype(torch::kLong).device(idx.device()));

        // (B, T, C) + (T, C): the position embedding broadcasts over the batch.
        torch::Tensor x = toked_model->forward(idx) + pos_encod->forward(positions);

        for (auto& block : block_layers) {
            x = block->forward(x);
        }

        torch::Tensor logits = linear_final->forward(norm_final->forward(x));

        if (!targets.defined() || targets.numel() == 0) {
            return {logits, torch::Tensor()};
        }

        logits = logits.reshape({-1, logits.size(-1)});
        const torch::Tensor flat_targets = targets.reshape({-1});
        const torch::Tensor loss = torch::nn::functional::cross_entropy(logits, flat_targets);
        return {logits, loss};
    }

    // Stochastic beam search.
    //
    // Starting from idx (B, T), performs max_new_tokens steps. In each step
    // every sequence in the beam is extended by beam_width tokens sampled
    // (with replacement) from the temperature-scaled, top-k-filtered
    // next-token distribution; the beam_width highest-scoring extensions
    // survive. A candidate's score is the accumulated log-probability of its
    // sampled tokens, summed over the batch dimension so that it is a single
    // number (for B == 1 it is the sequence log-probability).
    //
    // Returns a vector holding one beam: pairs of {token ids of shape
    // (B, T + max_new_tokens), score}, best first. top_k <= 0 disables the
    // top-k filter.
    std::vector<std::vector<std::pair<torch::Tensor, float>>> complex_generate(torch::Tensor idx, int max_new_tokens, float temperature=1.0, int top_k=3, int beam_width=5) {
        using Candidate = std::pair<torch::Tensor, float>;

        TORCH_CHECK(beam_width > 0, "beam_width must be positive, got ", beam_width);
        TORCH_CHECK(temperature > 0.0f, "temperature must be positive, got ", temperature);

        // Generation only produces token ids and float scores, so no
        // autograd graph is needed.
        torch::NoGradGuard no_grad;

        std::vector<Candidate> beam = {Candidate(idx.clone(), 0.0f)};

        for (int step = 0; step < max_new_tokens; ++step) {
            std::vector<Candidate> new_beam;
            new_beam.reserve(beam.size() * static_cast<std::size_t>(beam_width));

            for (const Candidate& item : beam) {
                const torch::Tensor& sequence = item.first;

                // The model cannot see more than block_size tokens; feed it
                // only the most recent ones.
                const int64_t length = sequence.size(1);
                const torch::Tensor context = length > block_size
                    ? sequence.narrow(1, length - block_size, block_size)
                    : sequence;

                // Logits of the last position only: (B, vocab).
                torch::Tensor logits = forward(context).first.select(1, -1) / temperature;

                // Filter BEFORE the softmax: the filter writes -inf, which is
                // valid for logits but not for probabilities.
                if (top_k > 0) {
                    logits = top_k_filtering(logits, top_k);
                }
                const torch::Tensor probs = torch::softmax(logits, -1);

                // (B, beam_width) sampled token ids and their log-probabilities.
                const torch::Tensor sampled = torch::multinomial(probs, beam_width, /*replacement=*/true);
                const torch::Tensor sampled_log_probs = probs.gather(-1, sampled).log();

                for (int j = 0; j < beam_width; ++j) {
                    // Keep the column dimension so the token can be appended
                    // along dim 1.
                    const torch::Tensor token = sampled.narrow(1, j, 1);
                    const float step_score = sampled_log_probs.narrow(1, j, 1).sum().item<float>();
                    new_beam.emplace_back(torch::cat({sequence, token}, 1), item.second + step_score);
                }
            }

            // Sort new beam by log probabilities, best first.
            std::sort(new_beam.begin(), new_beam.end(), [](const Candidate& a, const Candidate& b) {
                return a.second > b.second;
            });

            // Only keep top beams.
            const std::size_t keep = std::min(new_beam.size(), static_cast<std::size_t>(beam_width));
            new_beam.erase(new_beam.begin() + static_cast<std::ptrdiff_t>(keep), new_beam.end());
            beam = std::move(new_beam);
        }

        std::vector<std::vector<Candidate>> completed_beams;
        completed_beams.push_back(std::move(beam));
        return completed_beams;
    }

    // Returns a copy of `logits` in which every entry smaller than the
    // top_k-th largest value along the last dimension is replaced by -inf.
    // top_k is clamped to the size of the last dimension; for top_k <= 0 the
    // input is returned unchanged.
    torch::Tensor top_k_filtering(torch::Tensor logits, int top_k) {
        const int64_t k = std::min<int64_t>(top_k, logits.size(-1));
        if (k <= 0) {
            return logits;
        }
        const torch::Tensor top_values = std::get<0>(torch::topk(logits, k, -1));
        // Smallest of the k kept values, one per row, shape (..., 1).
        const torch::Tensor threshold = top_values.narrow(-1, k - 1, 1);
        return logits.masked_fill(logits < threshold, -std::numeric_limits<float>::infinity());
    }

private:
    torch::nn::Embedding toked_model, pos_encod;
    std::vector<Block> block_layers;
    torch::nn::LayerNorm norm_final;
    torch::nn::Linear linear_final;
    int block_size;
};

TORCH_MODULE(Enigma);

int main() {
    // Set seed
    torch::manual_seed(1400);

    // Create model
    Enigma model(vocab_size, block_size, d_model, n_layer, n_head, dropout, norm_eps);
    model->to(device);

    // Define optimizer
    torch::optim::AdamW optimizer(model->parameters(), torch::optim::AdamWOptions(learning_rate));

    // Training loop
    std::vector<float> train_losses, val_losses;
    for (int iter = 0; iter < max_iters; ++iter) {
        if (iter % eval_interval == 0 || iter == max_iters - 1) {
            // Evaluate and print losses
            auto losses = estimate_loss();
            std::cout << "step " << iter << ": train loss " << losses["train"] << ", val loss " << losses["val"] << std::endl;
            
            // Save losses for plotting
            train_losses.push_back(losses["train"]);
            val_losses.push_back(losses["val"]);
        }

        // Get batch, forward pass, loss calculation, backward pass, optimizer step
        auto [xb, yb] = get_batch("train");
        torch::Tensor logits, loss;
        std::tie(logits, loss) = model->forward(xb, yb);

        optimizer.zero_grad();
        loss.backward();
        optimizer.step();
    }

    return 0;
}