From b04652661fa85918f9a64c97ae4664014331eaae Mon Sep 17 00:00:00 2001
From: ChinmayK0607 <114411195+ChinmayK0607@users.noreply.github.com>
Date: Tue, 8 Jul 2025 20:07:12 +0530
Subject: [PATCH 1/5] Create train.cpp

---
 GPT2/train.cpp | 153 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 GPT2/train.cpp

diff --git a/GPT2/train.cpp b/GPT2/train.cpp
new file mode 100644
index 0000000..67e8c42
--- /dev/null
+++ b/GPT2/train.cpp
@@ -0,0 +1,153 @@
+#include <torch/torch.h>
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+struct Config {
+    int64_t block_size = 256;
+    int64_t vocab_size = 50257;
+    int64_t n_layer = 6;
+    int64_t n_head = 6;
+    int64_t n_embed = 384;
+};
+
+struct TextLoader {
+    TextLoader(int64_t B, int64_t T) : B(B), T(T), cur_pos(0) {
+        // expects whitespace-separated integer token ids (see README)
+        std::ifstream in("/teamspace/studios/this_studio/input.txt");
+        int64_t tok;
+        while (in >> tok) tokens.push_back(tok);
+        size_t N = tokens.size();
+        std::cout << "loaded " << N << " tokens\n";
+        std::cout << "processing " << (N / (B * T)) << " batches per epoch\n";
+    }
+    std::pair<torch::Tensor, torch::Tensor> next_batch() {
+        int64_t span = B * T + 1;
+        if (cur_pos + span > tokens.size()) cur_pos = 0;
+        auto start = tokens.begin() + cur_pos;
+        std::vector<int64_t> buf(start, start + span);
+        cur_pos += B * T;
+        auto data = torch::from_blob(buf.data(), {span}, torch::kInt64).clone();
+        auto x = data.narrow(0, 0, B * T).view({B, T});
+        auto y = data.narrow(0, 1, B * T).view({B, T});
+        return {x, y};
+    }
+    int64_t B, T, cur_pos;
+    std::vector<int64_t> tokens;
+};
+
+struct CausalSelfAttentionImpl : torch::nn::Module {
+    CausalSelfAttentionImpl(const Config& cfg) {
+        n_embed = cfg.n_embed;
+        n_head = cfg.n_head;
+        head_dim = n_embed / n_head;
+        qkv = register_module("qkv", torch::nn::Linear(n_embed, 3 * n_embed));
+        proj = register_module("proj", torch::nn::Linear(n_embed, n_embed));
+        auto m = torch::ones({cfg.block_size, cfg.block_size}, torch::kBool).tril().view({1, 1, cfg.block_size, cfg.block_size});
+        mask = register_buffer("mask", m);
+    }
+    torch::Tensor forward(const torch::Tensor& x) {
+        auto B = x.size(0), T = x.size(1);
+        auto qkv_out = qkv->forward(x).view({B, T, 3, n_head, head_dim});
+        auto q = qkv_out.select(2, 0).permute({0, 2, 1, 3});
+        auto k = qkv_out.select(2, 1).permute({0, 2, 1, 3});
+        auto v = qkv_out.select(2, 2).permute({0, 2, 1, 3});
+        auto y = at::scaled_dot_product_attention(q, k, v, mask, 0.0, false);
+        auto out = y.permute({0, 2, 1, 3}).contiguous().view({B, T, n_embed});
+        return proj->forward(out);
+    }
+    int64_t n_embed, n_head, head_dim;
+    torch::nn::Linear qkv{nullptr}, proj{nullptr};
+    torch::Tensor mask;
+};
+TORCH_MODULE(CausalSelfAttention);
+
+struct MLPImpl : torch::nn::Module {
+    MLPImpl(const Config& cfg) {
+        fc = register_module("fc", torch::nn::Linear(cfg.n_embed, 4 * cfg.n_embed));
+        act = torch::nn::GELU();
+        proj = register_module("proj", torch::nn::Linear(4 * cfg.n_embed, cfg.n_embed));
+    }
+    torch::Tensor forward(torch::Tensor x) {
+        x = fc->forward(x);
+        x = act(x);
+        return proj->forward(x);
+    }
+    torch::nn::Linear fc{nullptr}, proj{nullptr};
+    torch::nn::GELU act;
+};
+TORCH_MODULE(MLP);
+
+struct BlockImpl : torch::nn::Module {
+    BlockImpl(const Config& cfg) {
+        ln1 = register_module("ln1", torch::nn::LayerNorm(torch::nn::LayerNormOptions({cfg.n_embed})));
+        attn = register_module("attn", CausalSelfAttention(cfg));
+        ln2 = register_module("ln2", torch::nn::LayerNorm(torch::nn::LayerNormOptions({cfg.n_embed})));
+        mlp = register_module("mlp", MLP(cfg));
+    }
+    torch::Tensor forward(torch::Tensor x) {
+        x = x + attn->forward(ln1->forward(x));
+        x = x + mlp->forward(ln2->forward(x));
+        return x;
+    }
+    torch::nn::LayerNorm ln1{nullptr}, ln2{nullptr};
+    CausalSelfAttention attn{nullptr};
+    MLP mlp{nullptr};
+};
+TORCH_MODULE(Block);
+
+struct GPTImpl : torch::nn::Module {
+    GPTImpl(const Config& cfg) {
+        wte = register_module("wte", torch::nn::Embedding(cfg.vocab_size, cfg.n_embed));
+        wpe = register_module("wpe", torch::nn::Embedding(cfg.block_size, cfg.n_embed));
+        for (int i = 0; i < cfg.n_layer; ++i) blocks->push_back(Block(cfg));
+        register_module("blocks", blocks);
+        ln_f = register_module("ln_f", torch::nn::LayerNorm(torch::nn::LayerNormOptions({cfg.n_embed})));
+        lm_head = register_module("lm_head", torch::nn::Linear(torch::nn::LinearOptions(cfg.n_embed, cfg.vocab_size).bias(false)));
+        lm_head->weight = wte->weight;
+        apply([](torch::nn::Module& m) {
+            if (auto* L = m.as<torch::nn::Linear>())
+                torch::nn::init::normal_(L->weight, 0.0, 0.02);
+        });
+    }
+    std::pair<torch::Tensor, torch::Tensor> forward(torch::Tensor idx, torch::Tensor targets) {
+        auto B = idx.size(0), T = idx.size(1);
+        auto pos = torch::arange(0, T, torch::kLong).to(idx.device());
+        auto x = wte->forward(idx) + wpe->forward(pos);
+        for (auto& m : *blocks) x = m->as<Block>()->forward(x);
+        x = ln_f->forward(x);
+        auto logits = lm_head->forward(x);
+        torch::Tensor loss;
+        if (targets.defined()) {
+            loss = torch::nn::functional::cross_entropy(logits.view({-1, logits.size(-1)}), targets.view(-1));
+        }
+        return {logits, loss};
+    }
+    torch::nn::Embedding wte{nullptr}, wpe{nullptr};
+    torch::nn::ModuleList blocks;
+    torch::nn::LayerNorm ln_f{nullptr};
+    torch::nn::Linear lm_head{nullptr};
+};
+TORCH_MODULE(GPT);
+
+int main() {
+    Config cfg;
+    int64_t B = 64, T = cfg.block_size;
+    TextLoader loader(B, T);
+    auto device = torch::cuda::is_available() ? torch::kCUDA : torch::kCPU;
+    GPT model(cfg);
+    model->to(device);
+    torch::optim::AdamW opt(model->parameters(), 3e-4);
+    for (int i = 0; i < 50; ++i) {
+        auto [x, y] = loader.next_batch();
+        x = x.to(device);
+        y = y.to(device);
+        opt.zero_grad();
+        auto [logits, loss] = model->forward(x, y);
+        loss.backward();
+        opt.step();
+        std::cout << "step " << i << " loss " << loss.item<float>() << '\n';
+    }
+    return 0;
+}
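Note on feeding data to `train.cpp`: the `TextLoader` above parses whitespace-separated integer token ids with `in >> tok` from a hard-coded path, so the raw text has to be tokenized before the C++ trainer can read it. A minimal sketch of that step, assuming the GPT-2 `tiktoken` encoding and an illustrative output filename (`input_tokens.txt` is not part of the patch; the ids would need to end up at the path `train.cpp` opens):

```python
# Sketch: turn raw text into the whitespace-separated token-id file that
# train.cpp's TextLoader parses with `in >> tok`.
import tiktoken

enc = tiktoken.get_encoding("gpt2")

with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

ids = enc.encode_ordinary(text)  # GPT-2 BPE token ids

# write the ids as plain integers separated by spaces; point train.cpp's
# hard-coded input path at this file (or overwrite input.txt with it)
with open("input_tokens.txt", "w") as f:
    f.write(" ".join(str(i) for i in ids))
```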
From ccef61f8392533932d0c50c452cbabb9014c28c4 Mon Sep 17 00:00:00 2001
From: ChinmayK0607 <114411195+ChinmayK0607@users.noreply.github.com>
Date: Tue, 8 Jul 2025 20:07:33 +0530
Subject: [PATCH 2/5] Create train.py

---
 GPT2/train.py | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 GPT2/train.py

diff --git a/GPT2/train.py b/GPT2/train.py
new file mode 100644
index 0000000..1b70899
--- /dev/null
+++ b/GPT2/train.py
@@ -0,0 +1,201 @@
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+import tiktoken
+
+batch_size = 64
+block_size = 256
+max_iters = 5000
+eval_interval = 500
+learning_rate = 3e-4
+device = "cuda" if torch.cuda.is_available() else "cpu"
+eval_iters = 200
+n_embed = 384
+n_head = 6
+n_layer = 6
+dropout = 0.2
+
+
+def load_data_fromfile(filename):
+    with open(filename, 'r') as file:
+        text = file.read()
+    return text
+
+
+def tokenize(data):
+    encoder = tiktoken.get_encoding('gpt2')
+    tokens = encoder.encode(data)
+    return tokens
+
+
+class TextLoader:
+    def __init__(self, B, T):
+        self.B = B
+        self.T = T
+
+        data = load_data_fromfile('input.txt')
+        tokens = tokenize(data)
+        self.tokens = torch.tensor(tokens)
+        print(f"loaded {len(self.tokens)} tokens")
+        print(f"processing {len(self.tokens) // (B*T)} batches per epoch")
+        self.cur_pos = 0
+
+    def next_batch(self):
+        B, T = self.B, self.T
+        buffer = self.tokens[self.cur_pos : self.cur_pos + B*T + 1]
+        token_tensor = buffer[:-1].view(B, T)       # inputs
+        next_token_tensor = buffer[1:].view(B, T)   # targets, shifted one position right
+
+        # move to the next batch, wrapping around at the end of the data
+        self.cur_pos += B*T
+        if self.cur_pos + B*T + 1 > len(self.tokens):
+            self.cur_pos = 0
+        return token_tensor, next_token_tensor
+
+
+@dataclass
+class config:
+    block_size: int = 1024   # max sequence length
+    vocab_size: int = 50257  # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+    n_layer: int = 12        # number of layers
+    n_head: int = 12         # number of heads
+    n_embed: int = 768       # embedding dimension
+
+
+class CausalSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embed % config.n_head == 0  # embedding dim must split evenly across heads
+        self.n_embed = config.n_embed
+        self.n_head = config.n_head
+
+        # K, Q, V projections for all heads in a single linear layer
+        self.c_attention = nn.Linear(config.n_embed, 3 * config.n_embed)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embed, config.n_embed)
+        self.c_proj.NANOGPT_SCALE_INIT = 1
+        # causal mask buffer (not used in forward below: scaled_dot_product_attention handles causality via is_causal=True)
+        self.register_buffer('bias', torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
+
+    def forward(self, x):
+        '''
+        nh -> number of heads
+        hs -> head size
+        C  -> number of channels = nh * hs
+        '''
+        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality
+        qkv = self.c_attention(x)
+        query, key, value = qkv.split(self.n_embed, dim=2)
+        key = key.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+        query = query.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+        value = value.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+        y = F.scaled_dot_product_attention(query, key, value, is_causal=True)  # flash attention
+        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
+        # output projection
+        y = self.c_proj(y)
+        return y
+
+
+class MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        # fully connected layer; the 4x expansion gives the MLP capacity to model more complex patterns
+        self.c_fc = nn.Linear(config.n_embed, 4 * config.n_embed)
+        self.gelu = nn.GELU(approximate='tanh')  # tanh approximation matches GPT-2's original GELU
+        self.c_proj = nn.Linear(4 * config.n_embed, config.n_embed)
+        self.c_proj.NANOGPT_SCALE_INIT = 1
+
+    def forward(self, x):
+        x = self.c_fc(x)
+        # activation
+        x = self.gelu(x)
+        x = self.c_proj(x)
+        return x
+
+
+class Block(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.layer_norm_1 = nn.LayerNorm(config.n_embed)
+        self.attention = CausalSelfAttention(config)
+        self.layer_norm_2 = nn.LayerNorm(config.n_embed)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.attention(self.layer_norm_1(x))
+        x = x + self.mlp(self.layer_norm_2(x))
+        return x
+
+
+class GPT(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.transformer = nn.ModuleDict(dict(
+            # token embedding weights
+            wte = nn.Embedding(config.vocab_size, config.n_embed),
+            # positional embedding weights
+            wpe = nn.Embedding(config.block_size, config.n_embed),
+            # hidden transformer blocks
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            # final layer norm
+            layernorm_f = nn.LayerNorm(config.n_embed),
+        ))
+        self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)
+        # weight sharing between wte and lm_head
+        self.transformer.wte.weight = self.lm_head.weight
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+        std = 0.02
+        if isinstance(module, nn.Linear):
+            if hasattr(module, 'NANOGPT_SCALE_INIT'):
+                std *= (2 * self.config.n_layer) ** -0.5  # 2x because every block adds two residual branches: attention and mlp
+            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
+
+    def forward(self, idx, targets=None):
+        B, T = idx.size()
+
+        assert T <= self.config.block_size
+        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
+        pos_emb = self.transformer.wpe(pos)  # (T, n_embed); broadcast over the batch dimension
+        tok_emb = self.transformer.wte(idx)  # (B, T, n_embed)
+        x = tok_emb + pos_emb
+
+        for block in self.transformer.h:
+            x = block(x)
+
+        x = self.transformer.layernorm_f(x)
+        logits = self.lm_head(x)  # (B, T, vocab_size): project back into vocabulary space
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))  # reshape (B, T, vocab_size) -> (B*T, vocab_size)
+
+        return logits, loss
+
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+model = GPT(config())
+model.to(device)
+
+
+B, T = 4, 32
+trainLoader = TextLoader(B, T)
+
+optimiser = torch.optim.AdamW(model.parameters(), lr=3e-4)
+
+for i in range(50):
+    token_tensor, next_token_tensor = trainLoader.next_batch()
+    token_tensor, next_token_tensor = token_tensor.to(device), next_token_tensor.to(device)
+    optimiser.zero_grad()  # always start with zero gradients
+    logits, loss = model(token_tensor, next_token_tensor)
+    loss.backward()
+    optimiser.step()
+    print(f"step : {i} , loss : {loss.item()}")
+
+print(loss)
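The batching logic in `TextLoader.next_batch` is easiest to see on a toy example: each batch slices `B*T + 1` consecutive tokens, and the targets are the inputs shifted right by one position. A small illustration with made-up token ids (`B=1`, `T=4`; purely for clarity, not part of the patch):

```python
# Toy illustration of the input/target alignment used by TextLoader.next_batch.
import torch

buffer = torch.tensor([10, 11, 12, 13, 14])  # B*T + 1 consecutive tokens
B, T = 1, 4
x = buffer[:-1].view(B, T)  # inputs:  [[10, 11, 12, 13]]
y = buffer[1:].view(B, T)   # targets: [[11, 12, 13, 14]]
# the model is trained to predict y[b, t] (the next token) from x[b, :t+1]
```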
From d6973aebd7b7ecfa3deebdab57d51bb14caa921a Mon Sep 17 00:00:00 2001
From: ChinmayK0607 <114411195+ChinmayK0607@users.noreply.github.com>
Date: Tue, 8 Jul 2025 20:08:06 +0530
Subject: [PATCH 3/5] Create dataloader.py

---
 GPT2/dataloader.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 GPT2/dataloader.py

diff --git a/GPT2/dataloader.py b/GPT2/dataloader.py
new file mode 100644
index 0000000..bda25b1
--- /dev/null
+++ b/GPT2/dataloader.py
@@ -0,0 +1,33 @@
+import os
+import requests
+import tiktoken
+import numpy as np
+
+# download the tiny shakespeare dataset
+input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
+if not os.path.exists(input_file_path):
+    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+    with open(input_file_path, 'w', encoding='utf-8') as f:
+        f.write(requests.get(data_url).text)
+
+with open(input_file_path, 'r', encoding='utf-8') as f:
+    data = f.read()
+n = len(data)
+train_data = data[:int(n*0.9)]
+val_data = data[int(n*0.9):]
+
+# encode with tiktoken gpt2 bpe
+enc = tiktoken.get_encoding("gpt2")
+train_ids = enc.encode_ordinary(train_data)
+val_ids = enc.encode_ordinary(val_data)
+print(f"train has {len(train_ids):,} tokens")
+print(f"val has {len(val_ids):,} tokens")
+
+# export to bin files
+train_ids = np.array(train_ids, dtype=np.uint16)
+val_ids = np.array(val_ids, dtype=np.uint16)
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
+
+# train.bin has 301,966 tokens
+# val.bin has 36,059 tokens
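`dataloader.py` writes `train.bin` and `val.bin` as raw `uint16` arrays, while `train.py` currently re-tokenizes `input.txt` on the fly. If the bins are to be reused (an assumption; neither training script reads them yet), they can be loaded back like this, where `load_bin_tokens` is a hypothetical helper:

```python
# Sketch: read the uint16 token ids written by dataloader.py back into a
# torch tensor; ids are widened to int64 because nn.Embedding expects
# long indices.
import numpy as np
import torch

def load_bin_tokens(path):  # hypothetical helper, not part of the patches
    ids = np.fromfile(path, dtype=np.uint16).astype(np.int64)
    return torch.from_numpy(ids)

train_tokens = load_bin_tokens("train.bin")
val_tokens = load_bin_tokens("val.bin")
# e.g. TextLoader could be handed these tensors instead of calling tokenize()
```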
From 33954a9891bce5a13a0ae18df8a3b812200f10bc Mon Sep 17 00:00:00 2001
From: ChinmayK0607 <114411195+ChinmayK0607@users.noreply.github.com>
Date: Tue, 8 Jul 2025 20:09:35 +0530
Subject: [PATCH 4/5] Update README.md

---
 GPT2/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/GPT2/README.md b/GPT2/README.md
index 5852f44..5a99e21 100644
--- a/GPT2/README.md
+++ b/GPT2/README.md
@@ -1 +1,5 @@
-Initial commit
+# Experiments with the GPT-2 architecture
+
+1. For `train.cpp`, tokenize the data with tiktoken first (or reuse bins of pre-tokenized data); the C++ loader expects integer token ids, not raw text.
+2. For `train.py`, run `dataloader.py` in the same directory first (it downloads `input.txt`), then run `train.py`.
+3. Use the `uv` package manager for dependency handling.

From ecc028453ab83fe4206b1bf3d59663f29c7c9b4d Mon Sep 17 00:00:00 2001
From: ChinmayK0607 <114411195+ChinmayK0607@users.noreply.github.com>
Date: Tue, 8 Jul 2025 20:13:20 +0530
Subject: [PATCH 5/5] Create requirements.txt

---
 GPT2/requirements.txt | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 GPT2/requirements.txt

diff --git a/GPT2/requirements.txt b/GPT2/requirements.txt
new file mode 100644
index 0000000..fce789a
--- /dev/null
+++ b/GPT2/requirements.txt
@@ -0,0 +1,7 @@
+uv
+torch
+pandas
+tiktoken
+mlflow
+wandb
+tqdm