From b04652661fa85918f9a64c97ae4664014331eaae Mon Sep 17 00:00:00 2001
From: ChinmayK0607 <114411195+ChinmayK0607@users.noreply.github.com>
Date: Tue, 8 Jul 2025 20:07:12 +0530
Subject: [PATCH 1/5] Create train.cpp

---
 GPT2/train.cpp | 153 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 GPT2/train.cpp

diff --git a/GPT2/train.cpp b/GPT2/train.cpp
new file mode 100644
index 0000000..67e8c42
--- /dev/null
+++ b/GPT2/train.cpp
@@ -0,0 +1,153 @@
+#include <torch/torch.h>
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+struct Config {
+    int64_t block_size = 256;
+    int64_t vocab_size = 50257;
+    int64_t n_layer = 6;
+    int64_t n_head = 6;
+    int64_t n_embed = 384;
+};
+
+struct TextLoader {
+    TextLoader(int64_t B, int64_t T) : B(B), T(T), cur_pos(0) {
+        // expects whitespace-separated integer token ids (see README)
+        std::ifstream in("/teamspace/studios/this_studio/input.txt");
+        int64_t tok;
+        while (in >> tok) tokens.push_back(tok);
+        size_t N = tokens.size();
+        std::cout << "loaded " << N << " tokens\n";
+        std::cout << "processing " << (N / (B * T)) << " batches per epoch\n";
+    }
+    std::pair<torch::Tensor, torch::Tensor> next_batch() {
+        int64_t span = B * T + 1;
+        if (cur_pos + span > tokens.size()) cur_pos = 0;
+        auto start = tokens.begin() + cur_pos;
+        std::vector<int64_t> buf(start, start + span);
+        cur_pos += B * T;
+        auto data = torch::from_blob(buf.data(), {span}, torch::kInt64).clone();
+        auto x = data.narrow(0, 0, B * T).view({B, T});
+        auto y = data.narrow(0, 1, B * T).view({B, T});
+        return {x, y};
+    }
+    int64_t B, T, cur_pos;
+    std::vector<int64_t> tokens;
+};
+
+struct CausalSelfAttentionImpl : torch::nn::Module {
+    CausalSelfAttentionImpl(const Config& cfg) {
+        n_embed = cfg.n_embed;
+        n_head = cfg.n_head;
+        head_dim = n_embed / n_head;
+        qkv = register_module("qkv", torch::nn::Linear(n_embed, 3 * n_embed));
+        proj = register_module("proj", torch::nn::Linear(n_embed, n_embed));
+        auto m = torch::ones({cfg.block_size, cfg.block_size}, torch::kBool).tril().view({1, 1, cfg.block_size, cfg.block_size});
+        mask = register_buffer("mask", m);
+    }
+    torch::Tensor forward(const torch::Tensor& x) {
+        auto B = x.size(0), T = x.size(1);
+        auto qkv_out = qkv->forward(x).view({B, T, 3, n_head, head_dim});
+        auto q = qkv_out.select(2, 0).permute({0, 2, 1, 3});
+        auto k = qkv_out.select(2, 1).permute({0, 2, 1, 3});
+        auto v = qkv_out.select(2, 2).permute({0, 2, 1, 3});
+        auto y = at::scaled_dot_product_attention(q, k, v, mask, 0.0, false);
+        auto out = y.permute({0, 2, 1, 3}).contiguous().view({B, T, n_embed});
+        return proj->forward(out);
+    }
+    int64_t n_embed, n_head, head_dim;
+    torch::nn::Linear qkv{nullptr}, proj{nullptr};
+    torch::Tensor mask;
+};
+TORCH_MODULE(CausalSelfAttention);
+
+struct MLPImpl : torch::nn::Module {
+    MLPImpl(const Config& cfg) {
+        fc = register_module("fc", torch::nn::Linear(cfg.n_embed, 4 * cfg.n_embed));
+        act = torch::nn::GELU();
+        proj = register_module("proj", torch::nn::Linear(4 * cfg.n_embed, cfg.n_embed));
+    }
+    torch::Tensor forward(torch::Tensor x) {
+        x = fc->forward(x);
+        x = act(x);
+        return proj->forward(x);
+    }
+    torch::nn::Linear fc{nullptr}, proj{nullptr};
+    torch::nn::GELU act;
+};
+TORCH_MODULE(MLP);
+
+struct BlockImpl : torch::nn::Module {
+    BlockImpl(const Config& cfg) {
+        ln1 = register_module("ln1", torch::nn::LayerNorm(torch::nn::LayerNormOptions({cfg.n_embed})));
+        attn = register_module("attn", CausalSelfAttention(cfg));
+        ln2 = register_module("ln2", torch::nn::LayerNorm(torch::nn::LayerNormOptions({cfg.n_embed})));
+        mlp = register_module("mlp", MLP(cfg));
+    }
+    torch::Tensor forward(torch::Tensor x) {
+        x = x + attn->forward(ln1->forward(x));
+        x = x + mlp->forward(ln2->forward(x));
+        return x;
+    }
+    torch::nn::LayerNorm ln1{nullptr}, ln2{nullptr};
+    CausalSelfAttention attn{nullptr};
+    MLP mlp{nullptr};
+};
+TORCH_MODULE(Block);
+
+struct GPTImpl : torch::nn::Module {
+    GPTImpl(const Config& cfg) {
+        wte = register_module("wte", torch::nn::Embedding(cfg.vocab_size, cfg.n_embed));
+        wpe = register_module("wpe", torch::nn::Embedding(cfg.block_size, cfg.n_embed));
+        for (int i = 0; i < cfg.n_layer; ++i) blocks->push_back(Block(cfg));
+        register_module("blocks", blocks);
+        ln_f = register_module("ln_f", torch::nn::LayerNorm(torch::nn::LayerNormOptions({cfg.n_embed})));
+        lm_head = register_module("lm_head", torch::nn::Linear(torch::nn::LinearOptions(cfg.n_embed, cfg.vocab_size).bias(false)));
+        lm_head->weight = wte->weight;
+        apply([](torch::nn::Module& m) {
+            if (auto* L = m.as<torch::nn::Linear>())
+                torch::nn::init::normal_(L->weight, 0.0, 0.02);
+        });
+    }
+    std::pair<torch::Tensor, torch::Tensor> forward(torch::Tensor idx, torch::Tensor targets) {
+        auto B = idx.size(0), T = idx.size(1);
+        auto pos = torch::arange(0, T, torch::kLong).to(idx.device());
+        auto x = wte->forward(idx) + wpe->forward(pos);
+        for (auto& m : *blocks) x = m->as<Block>()->forward(x);
+        x = ln_f->forward(x);
+        auto logits = lm_head->forward(x);
+        torch::Tensor loss;
+        if (targets.defined()) {
+            loss = torch::nn::functional::cross_entropy(logits.view({-1, logits.size(-1)}), targets.view(-1));
+        }
+        return {logits, loss};
+    }
+    torch::nn::Embedding wte{nullptr}, wpe{nullptr};
+    torch::nn::ModuleList blocks;
+    torch::nn::LayerNorm ln_f{nullptr};
+    torch::nn::Linear lm_head{nullptr};
+};
+TORCH_MODULE(GPT);
+
+int main() {
+    Config cfg;
+    int64_t B = 64, T = cfg.block_size;
+    TextLoader loader(B, T);
+    auto device = torch::cuda::is_available() ? torch::kCUDA : torch::kCPU;
+    GPT model(cfg);
+    model->to(device);
+    torch::optim::AdamW opt(model->parameters(), 3e-4);
+    for (int i = 0; i < 50; ++i) {
+        auto [x, y] = loader.next_batch();
+        x = x.to(device);
+        y = y.to(device);
+        opt.zero_grad();
+        auto [logits, loss] = model->forward(x, y);
+        loss.backward();
+        opt.step();
+        std::cout << "step " << i << " loss " << loss.item<float>() << '\n';
+    }
+    return 0;
+}
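Note on feeding data to `train.cpp`: the `TextLoader` above parses whitespace-separated integer token ids with `in >> tok` from a hard-coded path, so the raw text has to be tokenized before the C++ trainer can read it. A minimal sketch of that step, assuming the GPT-2 `tiktoken` encoding and an illustrative output filename (`input_tokens.txt` is not part of the patch; the ids would need to end up at the path `train.cpp` opens):

```python
# Sketch: turn raw text into the whitespace-separated token-id file that
# train.cpp's TextLoader parses with `in >> tok`.
import tiktoken

enc = tiktoken.get_encoding("gpt2")

with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

ids = enc.encode_ordinary(text)  # GPT-2 BPE token ids

# write the ids as plain integers separated by spaces; point train.cpp's
# hard-coded input path at this file (or overwrite input.txt with it)
with open("input_tokens.txt", "w") as f:
    f.write(" ".join(str(i) for i in ids))
```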
From ccef61f8392533932d0c50c452cbabb9014c28c4 Mon Sep 17 00:00:00 2001
From: ChinmayK0607 <114411195+ChinmayK0607@users.noreply.github.com>
Date: Tue, 8 Jul 2025 20:07:33 +0530
Subject: [PATCH 2/5] Create train.py

---
 GPT2/train.py | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 GPT2/train.py

diff --git a/GPT2/train.py b/GPT2/train.py
new file mode 100644
index 0000000..1b70899
--- /dev/null
+++ b/GPT2/train.py
@@ -0,0 +1,201 @@
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+import tiktoken
+
+batch_size = 64
+block_size = 256
+max_iters = 5000
+eval_interval = 500
+learning_rate = 3e-4
+device = "cuda" if torch.cuda.is_available() else "cpu"
+eval_iters = 200
+n_embed = 384
+n_head = 6
+n_layer = 6
+dropout = 0.2
+
+
+def load_data_fromfile(filename):
+    with open(filename, 'r') as file:
+        text = file.read()
+    return text
+
+
+def tokenize(data):
+    encoder = tiktoken.get_encoding('gpt2')
+    tokens = encoder.encode(data)
+    return tokens
+
+
+class TextLoader:
+    def __init__(self, B, T):
+        self.B = B
+        self.T = T
+
+        data = load_data_fromfile('input.txt')
+        tokens = tokenize(data)
+        self.tokens = torch.tensor(tokens)
+        print(f"loaded {len(self.tokens)} tokens")
+        print(f"processing {len(self.tokens) // (B*T)} batches per epoch")
+        self.cur_pos = 0
+
+    def next_batch(self):
+        B, T = self.B, self.T
+        buffer = self.tokens[self.cur_pos : self.cur_pos + B*T + 1]
+        token_tensor = buffer[:-1].view(B, T)       # inputs
+        next_token_tensor = buffer[1:].view(B, T)   # targets, shifted one position right
+
+        # move to the next batch, wrapping around at the end of the data
+        self.cur_pos += B*T
+        if self.cur_pos + B*T + 1 > len(self.tokens):
+            self.cur_pos = 0
+        return token_tensor, next_token_tensor
+
+
+@dataclass
+class config:
+    block_size: int = 1024   # max sequence length
+    vocab_size: int = 50257  # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+    n_layer: int = 12        # number of layers
+    n_head: int = 12         # number of heads
+    n_embed: int = 768       # embedding dimension
+
+
+class CausalSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embed % config.n_head == 0  # embedding dim must split evenly across heads
+        self.n_embed = config.n_embed
+        self.n_head = config.n_head
+
+        # K, Q, V projections for all heads in a single linear layer
+        self.c_attention = nn.Linear(config.n_embed, 3 * config.n_embed)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embed, config.n_embed)
+        self.c_proj.NANOGPT_SCALE_INIT = 1
+        # causal mask buffer (not used in forward below: scaled_dot_product_attention handles causality via is_causal=True)
+        self.register_buffer('bias', torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
+
+    def forward(self, x):
+        '''
+        nh -> number of heads
+        hs -> head size
+        C  -> number of channels = nh * hs
+        '''
+        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality
+        qkv = self.c_attention(x)
+        query, key, value = qkv.split(self.n_embed, dim=2)
+        key = key.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+        query = query.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+        value = value.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+        y = F.scaled_dot_product_attention(query, key, value, is_causal=True)  # flash attention
+        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
+        # output projection
+        y = self.c_proj(y)
+        return y
+
+
+class MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        # fully connected layer; the 4x expansion gives the MLP capacity to model more complex patterns
+        self.c_fc = nn.Linear(config.n_embed, 4 * config.n_embed)
+        self.gelu = nn.GELU(approximate='tanh')  # tanh approximation matches GPT-2's original GELU
+        self.c_proj = nn.Linear(4 * config.n_embed, config.n_embed)
+        self.c_proj.NANOGPT_SCALE_INIT = 1
+
+    def forward(self, x):
+        x = self.c_fc(x)
+        # activation
+        x = self.gelu(x)
+        x = self.c_proj(x)
+        return x
+
+
+class Block(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.layer_norm_1 = nn.LayerNorm(config.n_embed)
+        self.attention = CausalSelfAttention(config)
+        self.layer_norm_2 = nn.LayerNorm(config.n_embed)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.attention(self.layer_norm_1(x))
+        x = x + self.mlp(self.layer_norm_2(x))
+        return x
+
+
+class GPT(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.transformer = nn.ModuleDict(dict(
+            # token embedding weights
+            wte = nn.Embedding(config.vocab_size, config.n_embed),
+            # positional embedding weights
+            wpe = nn.Embedding(config.block_size, config.n_embed),
+            # hidden transformer blocks
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            # final layer norm
+            layernorm_f = nn.LayerNorm(config.n_embed),
+        ))
+        self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)
+        # weight sharing between wte and lm_head
+        self.transformer.wte.weight = self.lm_head.weight
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+        std = 0.02
+        if isinstance(module, nn.Linear):
+            if hasattr(module, 'NANOGPT_SCALE_INIT'):
+                std *= (2 * self.config.n_layer) ** -0.5  # 2x because every block adds two residual branches: attention and mlp
+            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
+
+    def forward(self, idx, targets=None):
+        B, T = idx.size()
+
+        assert T <= self.config.block_size
+        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
+        pos_emb = self.transformer.wpe(pos)  # (T, n_embed); broadcast over the batch dimension
+        tok_emb = self.transformer.wte(idx)  # (B, T, n_embed)
+        x = tok_emb + pos_emb
+
+        for block in self.transformer.h:
+            x = block(x)
+
+        x = self.transformer.layernorm_f(x)
+        logits = self.lm_head(x)  # (B, T, vocab_size): project back into vocabulary space
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))  # reshape (B, T, vocab_size) -> (B*T, vocab_size)
+
+        return logits, loss
+
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+model = GPT(config())
+model.to(device)
+
+
+B, T = 4, 32
+trainLoader = TextLoader(B, T)
+
+optimiser = torch.optim.AdamW(model.parameters(), lr=3e-4)
+
+for i in range(50):
+    token_tensor, next_token_tensor = trainLoader.next_batch()
+    token_tensor, next_token_tensor = token_tensor.to(device), next_token_tensor.to(device)
+    optimiser.zero_grad()  # always start with zero gradients
+    logits, loss = model(token_tensor, next_token_tensor)
+    loss.backward()
+    optimiser.step()
+    print(f"step : {i} , loss : {loss.item()}")
+
+print(loss)
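The batching logic in `TextLoader.next_batch` is easiest to see on a toy example: each batch slices `B*T + 1` consecutive tokens, and the targets are the inputs shifted right by one position. A small illustration with made-up token ids (`B=1`, `T=4`; purely for clarity, not part of the patch):

```python
# Toy illustration of the input/target alignment used by TextLoader.next_batch.
import torch

buffer = torch.tensor([10, 11, 12, 13, 14])  # B*T + 1 consecutive tokens
B, T = 1, 4
x = buffer[:-1].view(B, T)  # inputs:  [[10, 11, 12, 13]]
y = buffer[1:].view(B, T)   # targets: [[11, 12, 13, 14]]
# the model is trained to predict y[b, t] (the next token) from x[b, :t+1]
```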
From d6973aebd7b7ecfa3deebdab57d51bb14caa921a Mon Sep 17 00:00:00 2001
From: ChinmayK0607 <114411195+ChinmayK0607@users.noreply.github.com>
Date: Tue, 8 Jul 2025 20:08:06 +0530
Subject: [PATCH 3/5] Create dataloader.py

---
 GPT2/dataloader.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 GPT2/dataloader.py

diff --git a/GPT2/dataloader.py b/GPT2/dataloader.py
new file mode 100644
index 0000000..bda25b1
--- /dev/null
+++ b/GPT2/dataloader.py
@@ -0,0 +1,33 @@
+import os
+import requests
+import tiktoken
+import numpy as np
+
+# download the tiny shakespeare dataset
+input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
+if not os.path.exists(input_file_path):
+    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+    with open(input_file_path, 'w', encoding='utf-8') as f:
+        f.write(requests.get(data_url).text)
+
+with open(input_file_path, 'r', encoding='utf-8') as f:
+    data = f.read()
+n = len(data)
+train_data = data[:int(n*0.9)]
+val_data = data[int(n*0.9):]
+
+# encode with tiktoken gpt2 bpe
+enc = tiktoken.get_encoding("gpt2")
+train_ids = enc.encode_ordinary(train_data)
+val_ids = enc.encode_ordinary(val_data)
+print(f"train has {len(train_ids):,} tokens")
+print(f"val has {len(val_ids):,} tokens")
+
+# export to bin files
+train_ids = np.array(train_ids, dtype=np.uint16)
+val_ids = np.array(val_ids, dtype=np.uint16)
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
+
+# train.bin has 301,966 tokens
+# val.bin has 36,059 tokens
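`dataloader.py` writes `train.bin` and `val.bin` as raw `uint16` arrays, while `train.py` currently re-tokenizes `input.txt` on the fly. If the bins are to be reused (an assumption; neither training script reads them yet), they can be loaded back like this, where `load_bin_tokens` is a hypothetical helper:

```python
# Sketch: read the uint16 token ids written by dataloader.py back into a
# torch tensor; ids are widened to int64 because nn.Embedding expects
# long indices.
import numpy as np
import torch

def load_bin_tokens(path):  # hypothetical helper, not part of the patches
    ids = np.fromfile(path, dtype=np.uint16).astype(np.int64)
    return torch.from_numpy(ids)

train_tokens = load_bin_tokens("train.bin")
val_tokens = load_bin_tokens("val.bin")
# e.g. TextLoader could be handed these tensors instead of calling tokenize()
```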
From 33954a9891bce5a13a0ae18df8a3b812200f10bc Mon Sep 17 00:00:00 2001
From: ChinmayK0607 <114411195+ChinmayK0607@users.noreply.github.com>
Date: Tue, 8 Jul 2025 20:09:35 +0530
Subject: [PATCH 4/5] Update README.md

---
 GPT2/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/GPT2/README.md b/GPT2/README.md
index 5852f44..5a99e21 100644
--- a/GPT2/README.md
+++ b/GPT2/README.md
@@ -1 +1,5 @@
-Initial commit
+# Experiments with the GPT-2 architecture
+
+1. For `train.cpp`, tokenize the data with tiktoken first (or reuse bins of pre-tokenized data); the C++ loader expects integer token ids, not raw text.
+2. For `train.py`, run `dataloader.py` in the same directory first (it downloads `input.txt`), then run `train.py`.
+3. Use the `uv` package manager for dependency handling.

From ecc028453ab83fe4206b1bf3d59663f29c7c9b4d Mon Sep 17 00:00:00 2001
From: ChinmayK0607 <114411195+ChinmayK0607@users.noreply.github.com>
Date: Tue, 8 Jul 2025 20:13:20 +0530
Subject: [PATCH 5/5] Create requirements.txt

---
 GPT2/requirements.txt | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 GPT2/requirements.txt

diff --git a/GPT2/requirements.txt b/GPT2/requirements.txt
new file mode 100644
index 0000000..fce789a
--- /dev/null
+++ b/GPT2/requirements.txt
@@ -0,0 +1,7 @@
+uv
+torch
+pandas
+tiktoken
+mlflow
+wandb
+tqdm