diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index d1b8cef11277d..7c8bad15fe976 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -56,7 +56,7 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) - self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_transformer_layers"]) @property @abstractmethod @@ -2903,6 +2903,104 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) +@Model.register("OpenELMForCausalLM") +class OpenELM(Model): + model_arch = gguf.MODEL_ARCH.OPENELM + def set_gguf_parameters(self): + self.gguf_writer.add_name("OpenElm") + self.block_count = self.find_hparam(["num_transformer_layers"]) + self.gguf_writer.add_layer_norm_eps(1e-5) + self.gguf_writer.add_layer_norm_rms_eps(1e-6) # https://github.com/apple/corenet/blob/0333b1fbb29c31809663c4e6de2654b9ff2d27de/mlx_examples/open_elm/open_elm.py#L20 + n_embd = self.find_hparam(["model_dim"]) + self.gguf_writer.add_embedding_length(n_embd) + head_dim = self.find_hparam(["head_dim"]) + n_head = n_embd // head_dim + rot_pct = 1.0 + self.gguf_writer.add_context_length(self.find_hparam(["max_context_length"])) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count_kv(n_head*10) + self.gguf_writer.add_head_count(n_head*10) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_feed_forward_length(0) # dynamically calculated + + def set_vocab(self): + from sentencepiece import SentencePieceProcessor + tokenizer_path = self.dir_model / 'tokenizer.model' + if not 
tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if (token_id >= vocab_size): + print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + # Same as super class, but permuting q_proj, k_proj + # Copied from: LlamaModel + def write_tensors(self): + block_count = 
self.hparams.get("num_transformer_layers", self.hparams.get("num_hidden_layers", self.hparams.get("num_transformer_layers"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + data = data_torch.numpy() + data = data.squeeze() + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + new_name += ".weight" + n_dims = len(data.shape) + data_dtype = data.dtype + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + # 1d tensors need to be converted to float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2: + data = data.astype(np.float16) + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + ###### CONVERSION LOGIC ###### diff --git a/debug.openelm-2.txt b/debug.openelm-2.txt new file mode 100644 index 0000000000000..cc366d06a8e6d --- /dev/null +++ b/debug.openelm-2.txt @@ -0,0 +1,5412 @@ +token_embd.weight{1280, 32000, 1, 1} n=1280 0.002177 + -0.0189 -0.0190 -0.0190..., -0.0456 -0.0456 -0.0457 +inp_tokens{7, 1, 1, 1} n=1280 0.000000 + 1.0000 9038.0000 2501.0000..., 931.0000 727.0000 471.0000 +GET_ROWS == +inp_embd{1280, 7, 1, 1} n=1280 3.191812 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +inp_embd{1280, 7, 1, 1} n=1280 3.191812 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +RMS_NORM == +norm-0{1280, 7, 1, 1} n=1280 75.213745 + 0.0010 1.0010 2.0010..., 1277.0010 
1278.0010 1279.0010 + + +norm-0{1280, 7, 1, 1} n=1280 0.822055 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +blk.0.attn_norm.weight{1280, 1, 1, 1} n=1280 48.488178 + 0.0379 1.0379 2.0379..., 1277.0380 1278.0380 1279.0380 +MUL == +attn_norm-0{1280, 7, 1, 1} n=1280 0.822055 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +blk.0.attn_qkv.weight{1280, 1152, 1, 1} n=1280 0.000000 + -0.0391 -0.0392 -0.0392..., -0.0937 -0.0938 -0.0938 +attn_norm-0{1280, 7, 1, 1} n=1280 0.822055 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +MUL_MAT == +wqkv-0{1152, 7, 1, 1} n=1280 -3.620103 + 0.0532 1.0532 2.0532..., 1149.0532 1150.0532 1151.0532 + + +wqkv-0{1152, 7, 1, 1} n=1152 -3.620103 + 0.0532 1.0532 2.0532..., 1149.0532 1150.0532 1151.0532 +VIEW == +wqkv-0 (view){64, 7, 12, 1} n=1152 -0.362447 + 0.0532 1.0532 2.0532..., 61.0532 62.0532 63.0532 + + +wqkv-0 (view){64, 7, 12, 1} n=64 -0.362447 + 0.0532 1.0532 2.0532..., 61.0532 62.0532 63.0532 +CONT == +Qcur-0{64, 7, 12, 1} n=64 -0.362447 + 0.0532 1.0532 2.0532..., 61.0532 62.0532 63.0532 + + +Qcur-0{64, 7, 12, 1} n=64 -5.291815 + 0.7767 1.7767 2.7767..., 61.7767 62.7767 63.7767 +RMS_NORM == +norm-0{64, 7, 12, 1} n=64 -5.291815 + 0.7767 1.7767 2.7767..., 61.7767 62.7767 63.7767 + + +norm-0{64, 7, 12, 1} n=64 1.357524 + 0.8096 1.8096 2.8096..., 61.8096 62.8096 63.8096 +blk.0.attn_q_norm.weight{64, 1, 1, 1} n=64 115.871346 + 1.0423 2.0423 3.0423..., 62.0423 63.0423 64.0423 +MUL == +Qcur-0{64, 7, 12, 1} n=64 1.357524 + 0.8096 1.8096 2.8096..., 61.8096 62.8096 63.8096 + + +Qcur-0{64, 7, 12, 1} n=64 1.357524 + 0.8096 1.8096 2.8096..., 61.8096 62.8096 63.8096 +RESHAPE == +Qcur-0 (reshaped){64, 12, 7, 1} n=64 1.357524 + 0.8096 1.8096 2.8096..., 61.8096 62.8096 63.8096 + + +Qcur-0 (reshaped){64, 12, 7, 1} n=64 1.357524 + 0.8096 1.8096 2.8096..., 61.8096 62.8096 63.8096 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-0{64, 12, 7, 1} n=64 1.357524 + 0.8096 
1.8096 2.8096..., 61.8096 62.8096 63.8096 + + +Qcur-0{64, 12, 7, 1} n=64 0.169690 + 0.1012 1.1012 2.1012..., 61.1012 62.1012 63.1012 +SCALE == +Qcur-0{64, 12, 7, 1} n=64 0.169690 + 0.1012 1.1012 2.1012..., 61.1012 62.1012 63.1012 + + +wqkv-0{1152, 7, 1, 1} n=1152 -3.620103 + 0.0532 1.0532 2.0532..., 1149.0532 1150.0532 1151.0532 +VIEW == +wqkv-0 (view){64, 7, 3, 1} n=1152 -0.782018 + -0.0229 0.9771 1.9771..., 60.9771 61.9771 62.9771 + + +wqkv-0 (view){64, 7, 3, 1} n=64 -0.782018 + -0.0229 0.9771 1.9771..., 60.9771 61.9771 62.9771 +CONT == +Kcur-0{64, 7, 3, 1} n=64 -0.782018 + -0.0229 0.9771 1.9771..., 60.9771 61.9771 62.9771 + + +Kcur-0{64, 7, 3, 1} n=64 -12.882872 + -0.3779 0.6221 1.6221..., 60.6221 61.6221 62.6221 +RMS_NORM == +norm-0{64, 7, 3, 1} n=64 -12.882872 + -0.3779 0.6221 1.6221..., 60.6221 61.6221 62.6221 + + +norm-0{64, 7, 3, 1} n=64 -13.134807 + -0.4225 0.5775 1.5775..., 60.5775 61.5775 62.5775 +blk.0.attn_k_norm.weight{64, 1, 1, 1} n=64 115.038528 + 1.1178 2.1178 3.1178..., 62.1178 63.1178 64.1178 +MUL == +Kcur-0{64, 7, 3, 1} n=64 -13.134807 + -0.4225 0.5775 1.5775..., 60.5775 61.5775 62.5775 + + +Kcur-0{64, 7, 3, 1} n=64 -13.134807 + -0.4225 0.5775 1.5775..., 60.5775 61.5775 62.5775 +RESHAPE == +Kcur-0 (reshaped){64, 3, 7, 1} n=64 -13.134807 + -0.4225 0.5775 1.5775..., 60.5775 61.5775 62.5775 + + +Kcur-0 (reshaped){64, 3, 7, 1} n=64 -13.134807 + -0.4225 0.5775 1.5775..., 60.5775 61.5775 62.5775 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_16{64, 3, 7, 1} n=64 -13.134807 + -0.4225 0.5775 1.5775..., 60.5775 61.5775 62.5775 + + +node_16{64, 3, 7, 1} n=64 -13.134807 + -0.4225 0.5775 1.5775..., 60.5775 61.5775 62.5775 +REPEAT == +node_17{64, 3, 28, 1} n=64 -13.134807 + -0.4225 0.5775 1.5775..., 60.5775 61.5775 62.5775 + + +wqkv-0{1152, 7, 1, 1} n=1152 -3.620103 + 0.0532 1.0532 2.0532..., 1149.0532 1150.0532 1151.0532 +VIEW == +wqkv-0 (view){64, 7, 3, 1} n=1152 -466074866721184345337607749632.000000 + 
0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +wqkv-0 (view){64, 7, 3, 1} n=64 -466074866721184345337607749632.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +Vcur-0{64, 7, 3, 1} n=64 -466074866721184345337607749632.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +Vcur-0{64, 7, 3, 1} n=64 -466074866721184345337607749632.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +REPEAT == +node_20{64, 7, 12, 1} n=64 -466074866721184345337607749632.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +cache_v_l0{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-0{32, 64, 6, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l0{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-0{64, 32, 6, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-0{64, 12, 7, 1} n=64 0.169690 + 0.1012 1.1012 2.1012..., 61.1012 62.1012 63.1012 +PERMUTE == +q-0{64, 7, 12, 1} n=64 0.169690 + 0.1012 1.1012 2.1012..., 61.1012 62.1012 63.1012 + + +k-0{64, 32, 6, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-0{64, 7, 12, 1} n=64 0.169690 + 0.1012 1.1012 2.1012..., 61.1012 62.1012 63.1012 +MUL_MAT == +kq-0{32, 7, 12, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-0{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-0{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-0{32, 64, 6, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-0{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-0{64, 7, 12, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-0{64, 7, 12, 1} n=64 0.000000 + 
0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-0{64, 12, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-0{64, 12, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-0{768, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 765.0000 766.0000 767.0000 + + +blk.0.attn_output.weight{768, 1280, 1, 1} n=768 -0.000000 + 0.0309 0.0309 0.0309..., 0.0539 0.0540 0.0540 +kqv_merged_cont-0{768, 7, 1, 1} n=768 0.000000 + 0.0000 1.0000 2.0000..., 765.0000 766.0000 767.0000 +MUL_MAT == +kqv_out-0{1280, 7, 1, 1} n=768 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-0{1280, 7, 1, 1} n=1280 3.191812 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +inp_embd{1280, 7, 1, 1} n=1280 3.191812 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +ADD == +node_30{1280, 7, 1, 1} n=1280 3.191812 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +node_30{1280, 7, 1, 1} n=1280 3.191812 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +RMS_NORM == +norm-0{1280, 7, 1, 1} n=1280 75.213745 + 0.0010 1.0010 2.0010..., 1277.0010 1278.0010 1279.0010 + + +norm-0{1280, 7, 1, 1} n=1280 23.822704 + 0.0001 1.0001 2.0001..., 1277.0001 1278.0001 1279.0001 +blk.0.ffn_norm.weight{1280, 1, 1, 1} n=1280 195.619263 + 0.1323 1.1323 2.1323..., 1277.1323 1278.1323 1279.1323 +MUL == +ffn_norm-0{1280, 7, 1, 1} n=1280 23.822704 + 0.0001 1.0001 2.0001..., 1277.0001 1278.0001 1279.0001 + + +ffn_norm-0{1280, 7, 1, 1} n=1280 79.960640 + 0.0004 1.0004 2.0004..., 1277.0005 1278.0005 1279.0005 +RMS_NORM == +norm-0{1280, 7, 1, 1} n=1280 79.960640 + 0.0004 1.0004 2.0004..., 1277.0005 1278.0005 1279.0005 + + +norm-0{1280, 7, 1, 1} n=1280 28.960400 + 0.0001 1.0001 2.0001..., 1277.0000 1278.0000 1279.0000 +blk.0.ffn_norm.weight{1280, 1, 1, 1} n=1280 195.619263 + 0.1323 1.1323 2.1323..., 1277.1323 1278.1323 1279.1323 +MUL == +ffn_norm-0{1280, 7, 1, 1} n=1280 
28.960400 + 0.0001 1.0001 2.0001..., 1277.0000 1278.0000 1279.0000 + + +blk.0.ffn_up.weight{1280, 1536, 1, 1} n=1280 0.000000 + 0.0213 0.0214 0.0214..., 0.0504 0.0504 0.0505 +ffn_norm-0{1280, 7, 1, 1} n=1280 28.960400 + 0.0001 1.0001 2.0001..., 1277.0000 1278.0000 1279.0000 +MUL_MAT == +ffn_silu-0{1536, 7, 1, 1} n=1280 -5.243989 + -0.2163 0.7837 1.7837..., 1532.7837 1533.7837 1534.7837 + + +ffn_silu-0{1536, 7, 1, 1} n=1536 -5.243989 + -0.2163 0.7837 1.7837..., 1532.7837 1533.7837 1534.7837 +VIEW == +ffn_up-0 (view){768, 7, 1, 1} n=1536 -2.134992 + -0.2163 0.7837 1.7837..., 764.7837 765.7837 766.7837 + + +ffn_up-0 (view){768, 7, 1, 1} n=768 -2.134992 + -0.2163 0.7837 1.7837..., 764.7837 765.7837 766.7837 +CONT == +ffn_up-0 (view) (cont){768, 7, 1, 1} n=768 -2.134992 + -0.2163 0.7837 1.7837..., 764.7837 765.7837 766.7837 + + +ffn_up-0 (view) (cont){768, 7, 1, 1} n=768 19.475294 + -0.0965 0.9035 1.9035..., 764.9035 765.9035 766.9035 +SILU == +node_38{768, 7, 1, 1} n=768 19.475294 + -0.0965 0.9035 1.9035..., 764.9035 765.9035 766.9035 + + +ffn_silu-0{1536, 7, 1, 1} n=1536 -5.243989 + -0.2163 0.7837 1.7837..., 1532.7837 1533.7837 1534.7837 +VIEW == +ffn_up-0 (view){768, 7, 1, 1} n=1536 -3.108997 + -0.0993 0.9007 1.9007..., 764.9006 765.9006 766.9006 + + +ffn_up-0 (view){768, 7, 1, 1} n=768 -3.108997 + -0.0993 0.9007 1.9007..., 764.9006 765.9006 766.9006 +CONT == +ffn_up-0 (view) (cont){768, 7, 1, 1} n=768 -3.108997 + -0.0993 0.9007 1.9007..., 764.9006 765.9006 766.9006 + + +node_38{768, 7, 1, 1} n=768 -4.165717 + 0.0096 1.0096 2.0096..., 765.0096 766.0096 767.0096 +ffn_up-0 (view) (cont){768, 7, 1, 1} n=768 -3.108997 + -0.0993 0.9007 1.9007..., 764.9006 765.9006 766.9006 +MUL == +ffn_mul-0{768, 7, 1, 1} n=768 -4.165717 + 0.0096 1.0096 2.0096..., 765.0096 766.0096 767.0096 + + +blk.0.ffn_down.weight{768, 1280, 1, 1} n=768 0.000000 + 0.0099 0.0099 0.0099..., 0.0158 0.0158 0.0158 +ffn_mul-0{768, 7, 1, 1} n=768 -4.165717 + 0.0096 1.0096 2.0096..., 765.0096 766.0096 767.0096 
+MUL_MAT == +ffn_out-0{1280, 7, 1, 1} n=768 0.533009 + -0.0787 0.9213 1.9213..., 1276.9213 1277.9213 1278.9213 + + +node_30{1280, 7, 1, 1} n=1280 3.724820 + -0.0787 0.9213 1.9213..., 1276.9214 1277.9214 1278.9214 +ffn_out-0{1280, 7, 1, 1} n=1280 0.533009 + -0.0787 0.9213 1.9213..., 1276.9213 1277.9213 1278.9213 +ADD == +l_out-0{1280, 7, 1, 1} n=1280 3.724820 + -0.0787 0.9213 1.9213..., 1276.9214 1277.9214 1278.9214 + + +l_out-0{1280, 7, 1, 1} n=1280 3.724820 + -0.0787 0.9213 1.9213..., 1276.9214 1277.9214 1278.9214 +RMS_NORM == +norm-1{1280, 7, 1, 1} n=1280 20.955381 + -0.4425 0.5575 1.5575..., 1276.5575 1277.5575 1278.5575 + + +norm-1{1280, 7, 1, 1} n=1280 -0.879155 + -0.0513 0.9487 1.9487..., 1276.9487 1277.9487 1278.9487 +blk.1.attn_norm.weight{1280, 1, 1, 1} n=1280 57.100716 + 0.1159 1.1159 2.1159..., 1277.1158 1278.1158 1279.1158 +MUL == +attn_norm-1{1280, 7, 1, 1} n=1280 -0.879155 + -0.0513 0.9487 1.9487..., 1276.9487 1277.9487 1278.9487 + + +blk.1.attn_qkv.weight{1280, 1152, 1, 1} n=1280 -0.000000 + 0.0368 0.0369 0.0369..., 0.0891 0.0892 0.0892 +attn_norm-1{1280, 7, 1, 1} n=1280 -0.879155 + -0.0513 0.9487 1.9487..., 1276.9487 1277.9487 1278.9487 +MUL_MAT == +wqkv-1{1152, 7, 1, 1} n=1280 2.436259 + 0.3849 1.3849 2.3849..., 1149.3849 1150.3849 1151.3849 + + +wqkv-1{1152, 7, 1, 1} n=1152 2.436259 + 0.3849 1.3849 2.3849..., 1149.3849 1150.3849 1151.3849 +VIEW == +wqkv-1 (view){64, 7, 12, 1} n=1152 1.325685 + 0.3849 1.3849 2.3849..., 61.3849 62.3849 63.3849 + + +wqkv-1 (view){64, 7, 12, 1} n=64 1.325685 + 0.3849 1.3849 2.3849..., 61.3849 62.3849 63.3849 +CONT == +Qcur-1{64, 7, 12, 1} n=64 1.325685 + 0.3849 1.3849 2.3849..., 61.3849 62.3849 63.3849 + + +Qcur-1{64, 7, 12, 1} n=64 2.192594 + 0.6366 1.6366 2.6366..., 61.6366 62.6366 63.6366 +RMS_NORM == +norm-1{64, 7, 12, 1} n=64 2.192594 + 0.6366 1.6366 2.6366..., 61.6366 62.6366 63.6366 + + +norm-1{64, 7, 12, 1} n=64 36.712757 + 0.5734 1.5734 2.5734..., 61.5734 62.5734 63.5734 +blk.1.attn_q_norm.weight{64, 1, 1, 1} 
n=64 93.896156 + 0.9007 1.9007 2.9007..., 61.9007 62.9007 63.9007 +MUL == +Qcur-1{64, 7, 12, 1} n=64 36.712757 + 0.5734 1.5734 2.5734..., 61.5734 62.5734 63.5734 + + +Qcur-1{64, 7, 12, 1} n=64 36.712757 + 0.5734 1.5734 2.5734..., 61.5734 62.5734 63.5734 +RESHAPE == +Qcur-1 (reshaped){64, 12, 7, 1} n=64 36.712757 + 0.5734 1.5734 2.5734..., 61.5734 62.5734 63.5734 + + +Qcur-1 (reshaped){64, 12, 7, 1} n=64 36.712757 + 0.5734 1.5734 2.5734..., 61.5734 62.5734 63.5734 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-1{64, 12, 7, 1} n=64 36.712757 + 0.5734 1.5734 2.5734..., 61.5734 62.5734 63.5734 + + +Qcur-1{64, 12, 7, 1} n=64 4.589095 + 0.0717 1.0717 2.0717..., 61.0717 62.0717 63.0717 +SCALE == +Qcur-1{64, 12, 7, 1} n=64 4.589095 + 0.0717 1.0717 2.0717..., 61.0717 62.0717 63.0717 + + +wqkv-1{1152, 7, 1, 1} n=1152 2.436259 + 0.3849 1.3849 2.3849..., 1149.3849 1150.3849 1151.3849 +VIEW == +wqkv-1 (view){64, 7, 3, 1} n=1152 0.260322 + -0.0072 0.9928 1.9928..., 60.9928 61.9928 62.9928 + + +wqkv-1 (view){64, 7, 3, 1} n=64 0.260322 + -0.0072 0.9928 1.9928..., 60.9928 61.9928 62.9928 +CONT == +Kcur-1{64, 7, 3, 1} n=64 0.260322 + -0.0072 0.9928 1.9928..., 60.9928 61.9928 62.9928 + + +Kcur-1{64, 7, 3, 1} n=64 0.506658 + -0.0140 0.9860 1.9860..., 60.9860 61.9860 62.9860 +RMS_NORM == +norm-1{64, 7, 3, 1} n=64 0.506658 + -0.0140 0.9860 1.9860..., 60.9860 61.9860 62.9860 + + +norm-1{64, 7, 3, 1} n=64 20.098293 + -0.0132 0.9868 1.9868..., 60.9868 61.9868 62.9868 +blk.1.attn_k_norm.weight{64, 1, 1, 1} n=64 87.629463 + 0.9478 1.9478 2.9478..., 61.9478 62.9478 63.9478 +MUL == +Kcur-1{64, 7, 3, 1} n=64 20.098293 + -0.0132 0.9868 1.9868..., 60.9868 61.9868 62.9868 + + +Kcur-1{64, 7, 3, 1} n=64 20.098293 + -0.0132 0.9868 1.9868..., 60.9868 61.9868 62.9868 +RESHAPE == +Kcur-1 (reshaped){64, 3, 7, 1} n=64 20.098293 + -0.0132 0.9868 1.9868..., 60.9868 61.9868 62.9868 + + +Kcur-1 (reshaped){64, 3, 7, 1} n=64 20.098293 + -0.0132 0.9868 
1.9868..., 60.9868 61.9868 62.9868 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_59{64, 3, 7, 1} n=64 20.098293 + -0.0132 0.9868 1.9868..., 60.9868 61.9868 62.9868 + + +node_59{64, 3, 7, 1} n=64 20.098293 + -0.0132 0.9868 1.9868..., 60.9868 61.9868 62.9868 +REPEAT == +node_60{64, 3, 28, 1} n=64 20.098293 + -0.0132 0.9868 1.9868..., 60.9868 61.9868 62.9868 + + +wqkv-1{1152, 7, 1, 1} n=1152 2.436259 + 0.3849 1.3849 2.3849..., 1149.3849 1150.3849 1151.3849 +VIEW == +wqkv-1 (view){64, 7, 3, 1} n=1152 -108730194697373351936.000000 + -0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +wqkv-1 (view){64, 7, 3, 1} n=64 -108730194697373351936.000000 + -0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +Vcur-1{64, 7, 3, 1} n=64 -108730194697373351936.000000 + -0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +Vcur-1{64, 7, 3, 1} n=64 -108730194697373351936.000000 + -0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +REPEAT == +node_63{64, 7, 12, 1} n=64 -108730194697373351936.000000 + -0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +cache_v_l1{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-1{32, 64, 6, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l1{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-1{64, 32, 6, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-1{64, 12, 7, 1} n=64 4.589095 + 0.0717 1.0717 2.0717..., 61.0717 62.0717 63.0717 +PERMUTE == +q-1{64, 7, 12, 1} n=64 4.589095 + 0.0717 1.0717 2.0717..., 61.0717 62.0717 63.0717 + + +k-1{64, 32, 6, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-1{64, 7, 12, 1} n=64 4.589095 + 0.0717 1.0717 2.0717..., 61.0717 62.0717 63.0717 +MUL_MAT == +kq-1{32, 7, 12, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-1{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 
3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-1{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-1{32, 64, 6, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-1{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-1{64, 7, 12, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-1{64, 7, 12, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-1{64, 12, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-1{64, 12, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-1{768, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 765.0000 766.0000 767.0000 + + +blk.1.attn_output.weight{768, 1280, 1, 1} n=768 0.000005 + 0.2372 0.2373 0.2374..., 0.4111 0.4114 0.4116 +kqv_merged_cont-1{768, 7, 1, 1} n=768 0.000000 + 0.0000 1.0000 2.0000..., 765.0000 766.0000 767.0000 +MUL_MAT == +kqv_out-1{1280, 7, 1, 1} n=768 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-1{1280, 7, 1, 1} n=1280 3.724820 + -0.0787 0.9213 1.9213..., 1276.9214 1277.9214 1278.9214 +l_out-0{1280, 7, 1, 1} n=1280 3.724820 + -0.0787 0.9213 1.9213..., 1276.9214 1277.9214 1278.9214 +ADD == +node_73{1280, 7, 1, 1} n=1280 3.724820 + -0.0787 0.9213 1.9213..., 1276.9214 1277.9214 1278.9214 + + +node_73{1280, 7, 1, 1} n=1280 3.724820 + -0.0787 0.9213 1.9213..., 1276.9214 1277.9214 1278.9214 +RMS_NORM == +norm-1{1280, 7, 1, 1} n=1280 20.955381 + -0.4425 0.5575 1.5575..., 1276.5575 1277.5575 1278.5575 + + +norm-1{1280, 7, 1, 1} n=1280 11.528000 + 0.0483 1.0483 2.0483..., 1277.0483 1278.0483 1279.0483 +blk.1.ffn_norm.weight{1280, 1, 1, 1} n=1280 285.088928 + -0.1091 0.8909 1.8909..., 1276.8909 1277.8909 1278.8909 +MUL == +ffn_norm-1{1280, 7, 
1, 1} n=1280 11.528000 + 0.0483 1.0483 2.0483..., 1277.0483 1278.0483 1279.0483 + + +ffn_norm-1{1280, 7, 1, 1} n=1280 48.424751 + 0.2028 1.2028 2.2028..., 1277.2029 1278.2029 1279.2029 +RMS_NORM == +norm-1{1280, 7, 1, 1} n=1280 48.424751 + 0.2028 1.2028 2.2028..., 1277.2029 1278.2029 1279.2029 + + +norm-1{1280, 7, 1, 1} n=1280 32.548176 + -0.0221 0.9779 1.9779..., 1276.9779 1277.9779 1278.9779 +blk.1.ffn_norm.weight{1280, 1, 1, 1} n=1280 285.088928 + -0.1091 0.8909 1.8909..., 1276.8909 1277.8909 1278.8909 +MUL == +ffn_norm-1{1280, 7, 1, 1} n=1280 32.548176 + -0.0221 0.9779 1.9779..., 1276.9779 1277.9779 1278.9779 + + +blk.1.ffn_up.weight{1280, 2048, 1, 1} n=1280 0.000000 + 0.0670 0.0671 0.0671..., 0.1649 0.1650 0.1652 +ffn_norm-1{1280, 7, 1, 1} n=1280 32.548176 + -0.0221 0.9779 1.9779..., 1276.9779 1277.9779 1278.9779 +MUL_MAT == +ffn_silu-1{2048, 7, 1, 1} n=1280 -663.671143 + 0.3066 1.3066 2.3066..., 2045.3066 2046.3066 2047.3066 + + +ffn_silu-1{2048, 7, 1, 1} n=2048 -663.671143 + 0.3066 1.3066 2.3066..., 2045.3066 2046.3066 2047.3066 +VIEW == +ffn_up-1 (view){1024, 7, 1, 1} n=2048 -637.640869 + 0.3066 1.3066 2.3066..., 1021.3066 1022.3066 1023.3066 + + +ffn_up-1 (view){1024, 7, 1, 1} n=1024 -637.640869 + 0.3066 1.3066 2.3066..., 1021.3066 1022.3066 1023.3066 +CONT == +ffn_up-1 (view) (cont){1024, 7, 1, 1} n=1024 -637.640869 + 0.3066 1.3066 2.3066..., 1021.3066 1022.3066 1023.3066 + + +ffn_up-1 (view) (cont){1024, 7, 1, 1} n=1024 29.832624 + 0.1766 1.1766 2.1766..., 1021.1766 1022.1766 1023.1766 +SILU == +node_81{1024, 7, 1, 1} n=1024 29.832624 + 0.1766 1.1766 2.1766..., 1021.1766 1022.1766 1023.1766 + + +ffn_silu-1{2048, 7, 1, 1} n=2048 -663.671143 + 0.3066 1.3066 2.3066..., 2045.3066 2046.3066 2047.3066 +VIEW == +ffn_up-1 (view){1024, 7, 1, 1} n=2048 -26.030380 + -3.4124 -2.4124 -1.4124..., 1017.5876 1018.5876 1019.5876 + + +ffn_up-1 (view){1024, 7, 1, 1} n=1024 -26.030380 + -3.4124 -2.4124 -1.4124..., 1017.5876 1018.5876 1019.5876 +CONT == +ffn_up-1 (view) 
(cont){1024, 7, 1, 1} n=1024 -26.030380 + -3.4124 -2.4124 -1.4124..., 1017.5876 1018.5876 1019.5876 + + +node_81{1024, 7, 1, 1} n=1024 -398.729126 + -0.6027 0.3973 1.3973..., 1020.3973 1021.3973 1022.3973 +ffn_up-1 (view) (cont){1024, 7, 1, 1} n=1024 -26.030380 + -3.4124 -2.4124 -1.4124..., 1017.5876 1018.5876 1019.5876 +MUL == +ffn_mul-1{1024, 7, 1, 1} n=1024 -398.729126 + -0.6027 0.3973 1.3973..., 1020.3973 1021.3973 1022.3973 + + +blk.1.ffn_down.weight{1024, 1280, 1, 1} n=1024 -0.000000 + -0.0338 -0.0338 -0.0338..., -0.0673 -0.0674 -0.0674 +ffn_mul-1{1024, 7, 1, 1} n=1024 -398.729126 + -0.6027 0.3973 1.3973..., 1020.3973 1021.3973 1022.3973 +MUL_MAT == +ffn_out-1{1280, 7, 1, 1} n=1024 -1479.238403 + -1.9445 -0.9445 0.0555..., 1275.0554 1276.0554 1277.0554 + + +node_73{1280, 7, 1, 1} n=1280 -1475.513306 + -2.0232 -1.0232 -0.0232..., 1274.9768 1275.9768 1276.9768 +ffn_out-1{1280, 7, 1, 1} n=1280 -1479.238403 + -1.9445 -0.9445 0.0555..., 1275.0554 1276.0554 1277.0554 +ADD == +l_out-1{1280, 7, 1, 1} n=1280 -1475.513306 + -2.0232 -1.0232 -0.0232..., 1274.9768 1275.9768 1276.9768 + + +l_out-1{1280, 7, 1, 1} n=1280 -1475.513306 + -2.0232 -1.0232 -0.0232..., 1274.9768 1275.9768 1276.9768 +RMS_NORM == +norm-2{1280, 7, 1, 1} n=1280 -35.124786 + -0.0482 0.9518 1.9518..., 1276.9518 1277.9518 1278.9518 + + +norm-2{1280, 7, 1, 1} n=1280 -1.596347 + -0.0118 0.9882 1.9882..., 1276.9883 1277.9883 1278.9883 +blk.2.attn_norm.weight{1280, 1, 1, 1} n=1280 187.915939 + 0.2446 1.2446 2.2446..., 1277.2445 1278.2445 1279.2445 +MUL == +attn_norm-2{1280, 7, 1, 1} n=1280 -1.596347 + -0.0118 0.9882 1.9882..., 1276.9883 1277.9883 1278.9883 + + +blk.2.attn_qkv.weight{1280, 1152, 1, 1} n=1280 0.000000 + 0.1643 0.1644 0.1646..., 0.3904 0.3906 0.3909 +attn_norm-2{1280, 7, 1, 1} n=1280 -1.596347 + -0.0118 0.9882 1.9882..., 1276.9883 1277.9883 1278.9883 +MUL_MAT == +wqkv-2{1152, 7, 1, 1} n=1280 -1.204950 + 0.0266 1.0266 2.0266..., 1149.0266 1150.0266 1151.0266 + + +wqkv-2{1152, 7, 1, 1} n=1152 
-1.204950 + 0.0266 1.0266 2.0266..., 1149.0266 1150.0266 1151.0266 +VIEW == +wqkv-2 (view){64, 7, 12, 1} n=1152 0.777569 + 0.0266 1.0266 2.0266..., 61.0266 62.0266 63.0266 + + +wqkv-2 (view){64, 7, 12, 1} n=64 0.777569 + 0.0266 1.0266 2.0266..., 61.0266 62.0266 63.0266 +CONT == +Qcur-2{64, 7, 12, 1} n=64 0.777569 + 0.0266 1.0266 2.0266..., 61.0266 62.0266 63.0266 + + +Qcur-2{64, 7, 12, 1} n=64 6.013102 + 0.2055 1.2055 2.2055..., 61.2055 62.2055 63.2055 +RMS_NORM == +norm-2{64, 7, 12, 1} n=64 6.013102 + 0.2055 1.2055 2.2055..., 61.2055 62.2055 63.2055 + + +norm-2{64, 7, 12, 1} n=64 0.562160 + 0.3169 1.3169 2.3169..., 61.3169 62.3169 63.3169 +blk.2.attn_q_norm.weight{64, 1, 1, 1} n=64 100.804596 + 1.5425 2.5425 3.5425..., 62.5425 63.5425 64.5425 +MUL == +Qcur-2{64, 7, 12, 1} n=64 0.562160 + 0.3169 1.3169 2.3169..., 61.3169 62.3169 63.3169 + + +Qcur-2{64, 7, 12, 1} n=64 0.562160 + 0.3169 1.3169 2.3169..., 61.3169 62.3169 63.3169 +RESHAPE == +Qcur-2 (reshaped){64, 12, 7, 1} n=64 0.562160 + 0.3169 1.3169 2.3169..., 61.3169 62.3169 63.3169 + + +Qcur-2 (reshaped){64, 12, 7, 1} n=64 0.562160 + 0.3169 1.3169 2.3169..., 61.3169 62.3169 63.3169 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-2{64, 12, 7, 1} n=64 0.562160 + 0.3169 1.3169 2.3169..., 61.3169 62.3169 63.3169 + + +Qcur-2{64, 12, 7, 1} n=64 0.070270 + 0.0396 1.0396 2.0396..., 61.0396 62.0396 63.0396 +SCALE == +Qcur-2{64, 12, 7, 1} n=64 0.070270 + 0.0396 1.0396 2.0396..., 61.0396 62.0396 63.0396 + + +wqkv-2{1152, 7, 1, 1} n=1152 -1.204950 + 0.0266 1.0266 2.0266..., 1149.0266 1150.0266 1151.0266 +VIEW == +wqkv-2 (view){64, 7, 3, 1} n=1152 0.262397 + -0.0202 0.9798 1.9798..., 60.9798 61.9798 62.9798 + + +wqkv-2 (view){64, 7, 3, 1} n=64 0.262397 + -0.0202 0.9798 1.9798..., 60.9798 61.9798 62.9798 +CONT == +Kcur-2{64, 7, 3, 1} n=64 0.262397 + -0.0202 0.9798 1.9798..., 60.9798 61.9798 62.9798 + + +Kcur-2{64, 7, 3, 1} n=64 2.964299 + -0.2277 0.7723 1.7723..., 60.7723 
61.7723 62.7723 +RMS_NORM == +norm-2{64, 7, 3, 1} n=64 2.964299 + -0.2277 0.7723 1.7723..., 60.7723 61.7723 62.7723 + + +norm-2{64, 7, 3, 1} n=64 15.649212 + -0.3111 0.6889 1.6889..., 60.6889 61.6889 62.6889 +blk.2.attn_k_norm.weight{64, 1, 1, 1} n=64 124.295204 + 1.3662 2.3662 3.3662..., 62.3662 63.3662 64.3662 +MUL == +Kcur-2{64, 7, 3, 1} n=64 15.649212 + -0.3111 0.6889 1.6889..., 60.6889 61.6889 62.6889 + + +Kcur-2{64, 7, 3, 1} n=64 15.649212 + -0.3111 0.6889 1.6889..., 60.6889 61.6889 62.6889 +RESHAPE == +Kcur-2 (reshaped){64, 3, 7, 1} n=64 15.649212 + -0.3111 0.6889 1.6889..., 60.6889 61.6889 62.6889 + + +Kcur-2 (reshaped){64, 3, 7, 1} n=64 15.649212 + -0.3111 0.6889 1.6889..., 60.6889 61.6889 62.6889 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_102{64, 3, 7, 1} n=64 15.649212 + -0.3111 0.6889 1.6889..., 60.6889 61.6889 62.6889 + + +node_102{64, 3, 7, 1} n=64 15.649212 + -0.3111 0.6889 1.6889..., 60.6889 61.6889 62.6889 +REPEAT == +node_103{64, 3, 28, 1} n=64 15.649212 + -0.3111 0.6889 1.6889..., 60.6889 61.6889 62.6889 + + +wqkv-2{1152, 7, 1, 1} n=1152 -1.204950 + 0.0266 1.0266 2.0266..., 1149.0266 1150.0266 1151.0266 +VIEW == +wqkv-2 (view){64, 7, 3, 1} n=1152 234585765104063608984308809728.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +wqkv-2 (view){64, 7, 3, 1} n=64 234585765104063608984308809728.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +Vcur-2{64, 7, 3, 1} n=64 234585765104063608984308809728.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +Vcur-2{64, 7, 3, 1} n=64 234585765104063608984308809728.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +REPEAT == +node_106{64, 7, 12, 1} n=64 234585765104063608984308809728.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +cache_v_l2{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-2{32, 64, 6, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 
0.0000 0.0000 0.0000 + + +cache_k_l2{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-2{64, 32, 6, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-2{64, 12, 7, 1} n=64 0.070270 + 0.0396 1.0396 2.0396..., 61.0396 62.0396 63.0396 +PERMUTE == +q-2{64, 7, 12, 1} n=64 0.070270 + 0.0396 1.0396 2.0396..., 61.0396 62.0396 63.0396 + + +k-2{64, 32, 6, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-2{64, 7, 12, 1} n=64 0.070270 + 0.0396 1.0396 2.0396..., 61.0396 62.0396 63.0396 +MUL_MAT == +kq-2{32, 7, 12, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-2{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-2{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-2{32, 64, 6, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-2{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-2{64, 7, 12, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-2{64, 7, 12, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-2{64, 12, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-2{64, 12, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-2{768, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 765.0000 766.0000 767.0000 + + +blk.2.attn_output.weight{768, 1280, 1, 1} n=768 0.000049 + -0.0764 -0.0765 -0.0765..., -0.1231 -0.1232 -0.1232 +kqv_merged_cont-2{768, 7, 1, 1} n=768 0.000000 + 0.0000 1.0000 2.0000..., 765.0000 766.0000 767.0000 +MUL_MAT == +kqv_out-2{1280, 7, 1, 1} n=768 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-2{1280, 7, 1, 1} n=1280 
-1475.513306 + -2.0232 -1.0232 -0.0232..., 1274.9768 1275.9768 1276.9768 +l_out-1{1280, 7, 1, 1} n=1280 -1475.513306 + -2.0232 -1.0232 -0.0232..., 1274.9768 1275.9768 1276.9768 +ADD == +node_116{1280, 7, 1, 1} n=1280 -1475.513306 + -2.0232 -1.0232 -0.0232..., 1274.9768 1275.9768 1276.9768 + + +node_116{1280, 7, 1, 1} n=1280 -1475.513306 + -2.0232 -1.0232 -0.0232..., 1274.9768 1275.9768 1276.9768 +RMS_NORM == +norm-2{1280, 7, 1, 1} n=1280 -35.124786 + -0.0482 0.9518 1.9518..., 1276.9518 1277.9518 1278.9518 + + +norm-2{1280, 7, 1, 1} n=1280 0.152442 + -0.0101 0.9899 1.9899..., 1276.9900 1277.9900 1278.9900 +blk.2.ffn_norm.weight{1280, 1, 1, 1} n=1280 347.058319 + 0.2090 1.2090 2.2090..., 1277.2090 1278.2090 1279.2090 +MUL == +ffn_norm-2{1280, 7, 1, 1} n=1280 0.152442 + -0.0101 0.9899 1.9899..., 1276.9900 1277.9900 1278.9900 + + +ffn_norm-2{1280, 7, 1, 1} n=1280 20.204306 + -1.3341 -0.3341 0.6659..., 1275.6659 1276.6659 1277.6659 +RMS_NORM == +norm-2{1280, 7, 1, 1} n=1280 20.204306 + -1.3341 -0.3341 0.6659..., 1275.6659 1276.6659 1277.6659 + + +norm-2{1280, 7, 1, 1} n=1280 3.523221 + -0.2788 0.7212 1.7212..., 1276.7212 1277.7212 1278.7212 +blk.2.ffn_norm.weight{1280, 1, 1, 1} n=1280 347.058319 + 0.2090 1.2090 2.2090..., 1277.2090 1278.2090 1279.2090 +MUL == +ffn_norm-2{1280, 7, 1, 1} n=1280 3.523221 + -0.2788 0.7212 1.7212..., 1276.7212 1277.7212 1278.7212 + + +blk.2.ffn_up.weight{1280, 2560, 1, 1} n=1280 0.000000 + 0.0556 0.0556 0.0557..., 0.1283 0.1284 0.1285 +ffn_norm-2{1280, 7, 1, 1} n=1280 3.523221 + -0.2788 0.7212 1.7212..., 1276.7212 1277.7212 1278.7212 +MUL_MAT == +ffn_silu-2{2560, 7, 1, 1} n=1280 618.344727 + -0.3218 0.6782 1.6782..., 2556.6782 2557.6782 2558.6782 + + +ffn_silu-2{2560, 7, 1, 1} n=2560 618.344727 + -0.3218 0.6782 1.6782..., 2556.6782 2557.6782 2558.6782 +VIEW == +ffn_up-2 (view){1280, 7, 1, 1} n=2560 645.956604 + -0.3218 0.6782 1.6782..., 1276.6782 1277.6782 1278.6782 + + +ffn_up-2 (view){1280, 7, 1, 1} n=1280 645.956604 + -0.3218 0.6782 
1.6782..., 1276.6782 1277.6782 1278.6782 +CONT == +ffn_up-2 (view) (cont){1280, 7, 1, 1} n=1280 645.956604 + -0.3218 0.6782 1.6782..., 1276.6782 1277.6782 1278.6782 + + +ffn_up-2 (view) (cont){1280, 7, 1, 1} n=1280 650.200745 + -0.1353 0.8647 1.8647..., 1276.8647 1277.8647 1278.8647 +SILU == +node_124{1280, 7, 1, 1} n=1280 650.200745 + -0.1353 0.8647 1.8647..., 1276.8647 1277.8647 1278.8647 + + +ffn_silu-2{2560, 7, 1, 1} n=2560 618.344727 + -0.3218 0.6782 1.6782..., 2556.6782 2557.6782 2558.6782 +VIEW == +ffn_up-2 (view){1280, 7, 1, 1} n=2560 -27.611546 + 0.4662 1.4662 2.4662..., 1277.4662 1278.4662 1279.4662 + + +ffn_up-2 (view){1280, 7, 1, 1} n=1280 -27.611546 + 0.4662 1.4662 2.4662..., 1277.4662 1278.4662 1279.4662 +CONT == +ffn_up-2 (view) (cont){1280, 7, 1, 1} n=1280 -27.611546 + 0.4662 1.4662 2.4662..., 1277.4662 1278.4662 1279.4662 + + +node_124{1280, 7, 1, 1} n=1280 -15.922530 + -0.0631 0.9369 1.9369..., 1276.9369 1277.9369 1278.9369 +ffn_up-2 (view) (cont){1280, 7, 1, 1} n=1280 -27.611546 + 0.4662 1.4662 2.4662..., 1277.4662 1278.4662 1279.4662 +MUL == +ffn_mul-2{1280, 7, 1, 1} n=1280 -15.922530 + -0.0631 0.9369 1.9369..., 1276.9369 1277.9369 1278.9369 + + +blk.2.ffn_down.weight{1280, 1280, 1, 1} n=1280 0.000000 + -0.0264 -0.0264 -0.0264..., -0.0605 -0.0605 -0.0605 +ffn_mul-2{1280, 7, 1, 1} n=1280 -15.922530 + -0.0631 0.9369 1.9369..., 1276.9369 1277.9369 1278.9369 +MUL_MAT == +ffn_out-2{1280, 7, 1, 1} n=1280 29.182533 + -2.5860 -1.5860 -0.5860..., 1274.4141 1275.4141 1276.4141 + + +node_116{1280, 7, 1, 1} n=1280 -1446.331055 + -4.6092 -3.6092 -2.6092..., 1272.3909 1273.3909 1274.3909 +ffn_out-2{1280, 7, 1, 1} n=1280 29.182533 + -2.5860 -1.5860 -0.5860..., 1274.4141 1275.4141 1276.4141 +ADD == +l_out-2{1280, 7, 1, 1} n=1280 -1446.331055 + -4.6092 -3.6092 -2.6092..., 1272.3909 1273.3909 1274.3909 + + +l_out-2{1280, 7, 1, 1} n=1280 -1446.331055 + -4.6092 -3.6092 -2.6092..., 1272.3909 1273.3909 1274.3909 +RMS_NORM == +norm-3{1280, 7, 1, 1} n=1280 -34.634033 + 
-0.1104 0.8896 1.8896..., 1276.8896 1277.8896 1278.8896 + + +norm-3{1280, 7, 1, 1} n=1280 -1.200820 + -0.0244 0.9756 1.9756..., 1276.9756 1277.9756 1278.9756 +blk.3.attn_norm.weight{1280, 1, 1, 1} n=1280 140.774414 + 0.2209 1.2209 2.2209..., 1277.2208 1278.2208 1279.2208 +MUL == +attn_norm-3{1280, 7, 1, 1} n=1280 -1.200820 + -0.0244 0.9756 1.9756..., 1276.9756 1277.9756 1278.9756 + + +blk.3.attn_qkv.weight{1280, 1152, 1, 1} n=1280 -0.000085 + -0.0556 -0.0557 -0.0557..., -0.1284 -0.1285 -0.1287 +attn_norm-3{1280, 7, 1, 1} n=1280 -1.200820 + -0.0244 0.9756 1.9756..., 1276.9756 1277.9756 1278.9756 +MUL_MAT == +wqkv-3{1152, 7, 1, 1} n=1280 0.253290 + -0.0003 0.9997 1.9997..., 1148.9998 1149.9998 1150.9998 + + +wqkv-3{1152, 7, 1, 1} n=1152 0.253290 + -0.0003 0.9997 1.9997..., 1148.9998 1149.9998 1150.9998 +VIEW == +wqkv-3 (view){64, 7, 12, 1} n=1152 -0.141029 + -0.0003 0.9997 1.9997..., 60.9997 61.9997 62.9997 + + +wqkv-3 (view){64, 7, 12, 1} n=64 -0.141029 + -0.0003 0.9997 1.9997..., 60.9997 61.9997 62.9997 +CONT == +Qcur-3{64, 7, 12, 1} n=64 -0.141029 + -0.0003 0.9997 1.9997..., 60.9997 61.9997 62.9997 + + +Qcur-3{64, 7, 12, 1} n=64 -2.261387 + -0.0046 0.9954 1.9954..., 60.9954 61.9954 62.9954 +RMS_NORM == +norm-3{64, 7, 12, 1} n=64 -2.261387 + -0.0046 0.9954 1.9954..., 60.9954 61.9954 62.9954 + + +norm-3{64, 7, 12, 1} n=64 -5.381715 + -0.0079 0.9921 1.9921..., 60.9921 61.9921 62.9921 +blk.3.attn_q_norm.weight{64, 1, 1, 1} n=64 127.759201 + 1.7408 2.7408 3.7408..., 62.7408 63.7408 64.7408 +MUL == +Qcur-3{64, 7, 12, 1} n=64 -5.381715 + -0.0079 0.9921 1.9921..., 60.9921 61.9921 62.9921 + + +Qcur-3{64, 7, 12, 1} n=64 -5.381715 + -0.0079 0.9921 1.9921..., 60.9921 61.9921 62.9921 +RESHAPE == +Qcur-3 (reshaped){64, 12, 7, 1} n=64 -5.381715 + -0.0079 0.9921 1.9921..., 60.9921 61.9921 62.9921 + + +Qcur-3 (reshaped){64, 12, 7, 1} n=64 -5.381715 + -0.0079 0.9921 1.9921..., 60.9921 61.9921 62.9921 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 
+ROPE == +Qcur-3{64, 12, 7, 1} n=64 -5.381715 + -0.0079 0.9921 1.9921..., 60.9921 61.9921 62.9921 + + +Qcur-3{64, 12, 7, 1} n=64 -0.672714 + -0.0010 0.9990 1.9990..., 60.9990 61.9990 62.9990 +SCALE == +Qcur-3{64, 12, 7, 1} n=64 -0.672714 + -0.0010 0.9990 1.9990..., 60.9990 61.9990 62.9990 + + +wqkv-3{1152, 7, 1, 1} n=1152 0.253290 + -0.0003 0.9997 1.9997..., 1148.9998 1149.9998 1150.9998 +VIEW == +wqkv-3 (view){64, 7, 3, 1} n=1152 -0.318371 + -0.0034 0.9966 1.9966..., 60.9966 61.9966 62.9966 + + +wqkv-3 (view){64, 7, 3, 1} n=64 -0.318371 + -0.0034 0.9966 1.9966..., 60.9966 61.9966 62.9966 +CONT == +Kcur-3{64, 7, 3, 1} n=64 -0.318371 + -0.0034 0.9966 1.9966..., 60.9966 61.9966 62.9966 + + +Kcur-3{64, 7, 3, 1} n=64 -2.885487 + -0.0308 0.9692 1.9692..., 60.9692 61.9692 62.9692 +RMS_NORM == +norm-3{64, 7, 3, 1} n=64 -2.885487 + -0.0308 0.9692 1.9692..., 60.9692 61.9692 62.9692 + + +norm-3{64, 7, 3, 1} n=64 34.004799 + -0.0553 0.9447 1.9447..., 60.9447 61.9447 62.9447 +blk.3.attn_k_norm.weight{64, 1, 1, 1} n=64 151.676163 + 1.7976 2.7976 3.7976..., 62.7976 63.7976 64.7976 +MUL == +Kcur-3{64, 7, 3, 1} n=64 34.004799 + -0.0553 0.9447 1.9447..., 60.9447 61.9447 62.9447 + + +Kcur-3{64, 7, 3, 1} n=64 34.004799 + -0.0553 0.9447 1.9447..., 60.9447 61.9447 62.9447 +RESHAPE == +Kcur-3 (reshaped){64, 3, 7, 1} n=64 34.004799 + -0.0553 0.9447 1.9447..., 60.9447 61.9447 62.9447 + + +Kcur-3 (reshaped){64, 3, 7, 1} n=64 34.004799 + -0.0553 0.9447 1.9447..., 60.9447 61.9447 62.9447 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_145{64, 3, 7, 1} n=64 34.004799 + -0.0553 0.9447 1.9447..., 60.9447 61.9447 62.9447 + + +node_145{64, 3, 7, 1} n=64 34.004799 + -0.0553 0.9447 1.9447..., 60.9447 61.9447 62.9447 +REPEAT == +node_146{64, 3, 28, 1} n=64 34.004799 + -0.0553 0.9447 1.9447..., 60.9447 61.9447 62.9447 + + +wqkv-3{1152, 7, 1, 1} n=1152 0.253290 + -0.0003 0.9997 1.9997..., 1148.9998 1149.9998 1150.9998 +VIEW == +wqkv-3 (view){64, 7, 3, 
1} n=1152 -2.943815 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +wqkv-3 (view){64, 7, 3, 1} n=64 -2.943815 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +Vcur-3{64, 7, 3, 1} n=64 -2.943815 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +Vcur-3{64, 7, 3, 1} n=64 -2.943815 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +REPEAT == +node_149{64, 7, 12, 1} n=64 -2.943815 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +cache_v_l3{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-3{32, 64, 6, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l3{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-3{64, 32, 6, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-3{64, 12, 7, 1} n=64 -0.672714 + -0.0010 0.9990 1.9990..., 60.9990 61.9990 62.9990 +PERMUTE == +q-3{64, 7, 12, 1} n=64 -0.672714 + -0.0010 0.9990 1.9990..., 60.9990 61.9990 62.9990 + + +k-3{64, 32, 6, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-3{64, 7, 12, 1} n=64 -0.672714 + -0.0010 0.9990 1.9990..., 60.9990 61.9990 62.9990 +MUL_MAT == +kq-3{32, 7, 12, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-3{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-3{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-3{32, 64, 6, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-3{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-3{64, 7, 12, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-3{64, 7, 12, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-3{64, 12, 7, 1} 
n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-3{64, 12, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-3{768, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 765.0000 766.0000 767.0000 + + +blk.3.attn_output.weight{768, 1280, 1, 1} n=768 0.000001 + 0.0239 0.0239 0.0239..., 0.0399 0.0399 0.0399 +kqv_merged_cont-3{768, 7, 1, 1} n=768 0.000000 + 0.0000 1.0000 2.0000..., 765.0000 766.0000 767.0000 +MUL_MAT == +kqv_out-3{1280, 7, 1, 1} n=768 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-3{1280, 7, 1, 1} n=1280 -1446.331055 + -4.6092 -3.6092 -2.6092..., 1272.3909 1273.3909 1274.3909 +l_out-2{1280, 7, 1, 1} n=1280 -1446.331055 + -4.6092 -3.6092 -2.6092..., 1272.3909 1273.3909 1274.3909 +ADD == +node_159{1280, 7, 1, 1} n=1280 -1446.331055 + -4.6092 -3.6092 -2.6092..., 1272.3909 1273.3909 1274.3909 + + +node_159{1280, 7, 1, 1} n=1280 -1446.331055 + -4.6092 -3.6092 -2.6092..., 1272.3909 1273.3909 1274.3909 +RMS_NORM == +norm-3{1280, 7, 1, 1} n=1280 -34.634033 + -0.1104 0.8896 1.8896..., 1276.8896 1277.8896 1278.8896 + + +norm-3{1280, 7, 1, 1} n=1280 0.118420 + -0.0219 0.9781 1.9781..., 1276.9780 1277.9780 1278.9780 +blk.3.ffn_norm.weight{1280, 1, 1, 1} n=1280 381.152802 + 0.1987 1.1987 2.1987..., 1277.1987 1278.1987 1279.1987 +MUL == +ffn_norm-3{1280, 7, 1, 1} n=1280 0.118420 + -0.0219 0.9781 1.9781..., 1276.9780 1277.9780 1278.9780 + + +ffn_norm-3{1280, 7, 1, 1} n=1280 8.907544 + -1.6499 -0.6499 0.3501..., 1275.3501 1276.3501 1277.3501 +RMS_NORM == +norm-3{1280, 7, 1, 1} n=1280 8.907544 + -1.6499 -0.6499 0.3501..., 1275.3501 1276.3501 1277.3501 + + +norm-3{1280, 7, 1, 1} n=1280 2.850517 + -0.3279 0.6721 1.6721..., 1276.6721 1277.6721 1278.6721 +blk.3.ffn_norm.weight{1280, 1, 1, 1} n=1280 381.152802 + 0.1987 1.1987 2.1987..., 1277.1987 1278.1987 1279.1987 +MUL == +ffn_norm-3{1280, 7, 1, 1} n=1280 2.850517 + -0.3279 0.6721 1.6721..., 1276.6721 1277.6721 
1278.6721 + + +blk.3.ffn_up.weight{1280, 3072, 1, 1} n=1280 0.000000 + -0.0214 -0.0214 -0.0214..., -0.0505 -0.0505 -0.0506 +ffn_norm-3{1280, 7, 1, 1} n=1280 2.850517 + -0.3279 0.6721 1.6721..., 1276.6721 1277.6721 1278.6721 +MUL_MAT == +ffn_silu-3{3072, 7, 1, 1} n=1280 273.070343 + 0.3892 1.3892 2.3892..., 3069.3892 3070.3892 3071.3892 + + +ffn_silu-3{3072, 7, 1, 1} n=3072 273.070343 + 0.3892 1.3892 2.3892..., 3069.3892 3070.3892 3071.3892 +VIEW == +ffn_up-3 (view){1536, 7, 1, 1} n=3072 300.748810 + 0.3892 1.3892 2.3892..., 1533.3892 1534.3892 1535.3892 + + +ffn_up-3 (view){1536, 7, 1, 1} n=1536 300.748810 + 0.3892 1.3892 2.3892..., 1533.3892 1534.3892 1535.3892 +CONT == +ffn_up-3 (view) (cont){1536, 7, 1, 1} n=1536 300.748810 + 0.3892 1.3892 2.3892..., 1533.3892 1534.3892 1535.3892 + + +ffn_up-3 (view) (cont){1536, 7, 1, 1} n=1536 299.481659 + 0.2319 1.2319 2.2319..., 1533.2319 1534.2319 1535.2319 +SILU == +node_167{1536, 7, 1, 1} n=1536 299.481659 + 0.2319 1.2319 2.2319..., 1533.2319 1534.2319 1535.2319 + + +ffn_silu-3{3072, 7, 1, 1} n=3072 273.070343 + 0.3892 1.3892 2.3892..., 3069.3892 3070.3892 3071.3892 +VIEW == +ffn_up-3 (view){1536, 7, 1, 1} n=3072 -27.678528 + 0.2873 1.2873 2.2873..., 1533.2874 1534.2874 1535.2874 + + +ffn_up-3 (view){1536, 7, 1, 1} n=1536 -27.678528 + 0.2873 1.2873 2.2873..., 1533.2874 1534.2874 1535.2874 +CONT == +ffn_up-3 (view) (cont){1536, 7, 1, 1} n=1536 -27.678528 + 0.2873 1.2873 2.2873..., 1533.2874 1534.2874 1535.2874 + + +node_167{1536, 7, 1, 1} n=1536 -11.729805 + 0.0666 1.0666 2.0666..., 1533.0667 1534.0667 1535.0667 +ffn_up-3 (view) (cont){1536, 7, 1, 1} n=1536 -27.678528 + 0.2873 1.2873 2.2873..., 1533.2874 1534.2874 1535.2874 +MUL == +ffn_mul-3{1536, 7, 1, 1} n=1536 -11.729805 + 0.0666 1.0666 2.0666..., 1533.0667 1534.0667 1535.0667 + + +blk.3.ffn_down.weight{1536, 1280, 1, 1} n=1536 -0.000000 + -0.0179 -0.0180 -0.0180..., -0.0514 -0.0515 -0.0515 +ffn_mul-3{1536, 7, 1, 1} n=1536 -11.729805 + 0.0666 1.0666 2.0666..., 
1533.0667 1534.0667 1535.0667 +MUL_MAT == +ffn_out-3{1280, 7, 1, 1} n=1536 -30.904457 + 0.7055 1.7055 2.7055..., 1277.7056 1278.7056 1279.7056 + + +node_159{1280, 7, 1, 1} n=1280 -1477.238159 + -3.9037 -2.9037 -1.9037..., 1273.0963 1274.0963 1275.0963 +ffn_out-3{1280, 7, 1, 1} n=1280 -30.904457 + 0.7055 1.7055 2.7055..., 1277.7056 1278.7056 1279.7056 +ADD == +l_out-3{1280, 7, 1, 1} n=1280 -1477.238159 + -3.9037 -2.9037 -1.9037..., 1273.0963 1274.0963 1275.0963 + + +l_out-3{1280, 7, 1, 1} n=1280 -1477.238159 + -3.9037 -2.9037 -1.9037..., 1273.0963 1274.0963 1275.0963 +RMS_NORM == +norm-4{1280, 7, 1, 1} n=1280 -35.352757 + -0.0934 0.9066 1.9066..., 1276.9066 1277.9066 1278.9066 + + +norm-4{1280, 7, 1, 1} n=1280 2.093910 + -0.0224 0.9776 1.9776..., 1276.9775 1277.9775 1278.9775 +blk.4.attn_norm.weight{1280, 1, 1, 1} n=1280 187.899826 + 0.2399 1.2399 2.2399..., 1277.2399 1278.2399 1279.2399 +MUL == +attn_norm-4{1280, 7, 1, 1} n=1280 2.093910 + -0.0224 0.9776 1.9776..., 1276.9775 1277.9775 1278.9775 + + +blk.4.attn_qkv.weight{1280, 1152, 1, 1} n=1280 -0.000001 + -0.0896 -0.0897 -0.0897..., -0.2101 -0.2102 -0.2103 +attn_norm-4{1280, 7, 1, 1} n=1280 2.093910 + -0.0224 0.9776 1.9776..., 1276.9775 1277.9775 1278.9775 +MUL_MAT == +wqkv-4{1152, 7, 1, 1} n=1280 -2.795094 + 0.0038 1.0038 2.0038..., 1149.0038 1150.0038 1151.0038 + + +wqkv-4{1152, 7, 1, 1} n=1152 -2.795094 + 0.0038 1.0038 2.0038..., 1149.0038 1150.0038 1151.0038 +VIEW == +wqkv-4 (view){64, 7, 12, 1} n=1152 0.357520 + 0.0038 1.0038 2.0038..., 61.0038 62.0038 63.0038 + + +wqkv-4 (view){64, 7, 12, 1} n=64 0.357520 + 0.0038 1.0038 2.0038..., 61.0038 62.0038 63.0038 +CONT == +Qcur-4{64, 7, 12, 1} n=64 0.357520 + 0.0038 1.0038 2.0038..., 61.0038 62.0038 63.0038 + + +Qcur-4{64, 7, 12, 1} n=64 4.766866 + 0.0504 1.0504 2.0504..., 61.0504 62.0504 63.0504 +RMS_NORM == +norm-4{64, 7, 12, 1} n=64 4.766866 + 0.0504 1.0504 2.0504..., 61.0504 62.0504 63.0504 + + +norm-4{64, 7, 12, 1} n=64 6.094522 + 0.0746 1.0746 2.0746..., 
61.0746 62.0746 63.0746 +blk.4.attn_q_norm.weight{64, 1, 1, 1} n=64 119.760307 + 1.4809 2.4809 3.4809..., 62.4809 63.4809 64.4809 +MUL == +Qcur-4{64, 7, 12, 1} n=64 6.094522 + 0.0746 1.0746 2.0746..., 61.0746 62.0746 63.0746 + + +Qcur-4{64, 7, 12, 1} n=64 6.094522 + 0.0746 1.0746 2.0746..., 61.0746 62.0746 63.0746 +RESHAPE == +Qcur-4 (reshaped){64, 12, 7, 1} n=64 6.094522 + 0.0746 1.0746 2.0746..., 61.0746 62.0746 63.0746 + + +Qcur-4 (reshaped){64, 12, 7, 1} n=64 6.094522 + 0.0746 1.0746 2.0746..., 61.0746 62.0746 63.0746 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-4{64, 12, 7, 1} n=64 6.094522 + 0.0746 1.0746 2.0746..., 61.0746 62.0746 63.0746 + + +Qcur-4{64, 12, 7, 1} n=64 0.761815 + 0.0093 1.0093 2.0093..., 61.0093 62.0093 63.0093 +SCALE == +Qcur-4{64, 12, 7, 1} n=64 0.761815 + 0.0093 1.0093 2.0093..., 61.0093 62.0093 63.0093 + + +wqkv-4{1152, 7, 1, 1} n=1152 -2.795094 + 0.0038 1.0038 2.0038..., 1149.0038 1150.0038 1151.0038 +VIEW == +wqkv-4 (view){64, 7, 3, 1} n=1152 0.333644 + 0.0302 1.0302 2.0302..., 61.0302 62.0302 63.0302 + + +wqkv-4 (view){64, 7, 3, 1} n=64 0.333644 + 0.0302 1.0302 2.0302..., 61.0302 62.0302 63.0302 +CONT == +Kcur-4{64, 7, 3, 1} n=64 0.333644 + 0.0302 1.0302 2.0302..., 61.0302 62.0302 63.0302 + + +Kcur-4{64, 7, 3, 1} n=64 3.758383 + 0.3403 1.3403 2.3403..., 61.3403 62.3403 63.3403 +RMS_NORM == +norm-4{64, 7, 3, 1} n=64 3.758383 + 0.3403 1.3403 2.3403..., 61.3403 62.3403 63.3403 + + +norm-4{64, 7, 3, 1} n=64 0.711260 + 0.5266 1.5266 2.5266..., 61.5266 62.5266 63.5266 +blk.4.attn_k_norm.weight{64, 1, 1, 1} n=64 144.497330 + 1.5475 2.5475 3.5475..., 62.5475 63.5475 64.5475 +MUL == +Kcur-4{64, 7, 3, 1} n=64 0.711260 + 0.5266 1.5266 2.5266..., 61.5266 62.5266 63.5266 + + +Kcur-4{64, 7, 3, 1} n=64 0.711260 + 0.5266 1.5266 2.5266..., 61.5266 62.5266 63.5266 +RESHAPE == +Kcur-4 (reshaped){64, 3, 7, 1} n=64 0.711260 + 0.5266 1.5266 2.5266..., 61.5266 62.5266 63.5266 + + +Kcur-4 (reshaped){64, 3, 
7, 1} n=64 0.711260 + 0.5266 1.5266 2.5266..., 61.5266 62.5266 63.5266 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_188{64, 3, 7, 1} n=64 0.711260 + 0.5266 1.5266 2.5266..., 61.5266 62.5266 63.5266 + + +node_188{64, 3, 7, 1} n=64 0.711260 + 0.5266 1.5266 2.5266..., 61.5266 62.5266 63.5266 +REPEAT == +node_189{64, 3, 28, 1} n=64 0.711260 + 0.5266 1.5266 2.5266..., 61.5266 62.5266 63.5266 + + +wqkv-4{1152, 7, 1, 1} n=1152 -2.795094 + 0.0038 1.0038 2.0038..., 1149.0038 1150.0038 1151.0038 +VIEW == +wqkv-4 (view){64, 7, 3, 1} n=1152 -2.976934 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +wqkv-4 (view){64, 7, 3, 1} n=64 -2.976934 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +Vcur-4{64, 7, 3, 1} n=64 -2.976934 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +Vcur-4{64, 7, 3, 1} n=64 -2.976934 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +REPEAT == +node_192{64, 7, 12, 1} n=64 -2.976934 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +cache_v_l4{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-4{32, 64, 6, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l4{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-4{64, 32, 6, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-4{64, 12, 7, 1} n=64 0.761815 + 0.0093 1.0093 2.0093..., 61.0093 62.0093 63.0093 +PERMUTE == +q-4{64, 7, 12, 1} n=64 0.761815 + 0.0093 1.0093 2.0093..., 61.0093 62.0093 63.0093 + + +k-4{64, 32, 6, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-4{64, 7, 12, 1} n=64 0.761815 + 0.0093 1.0093 2.0093..., 61.0093 62.0093 63.0093 +MUL_MAT == +kq-4{32, 7, 12, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-4{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 
0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-4{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-4{32, 64, 6, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-4{32, 7, 12, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-4{64, 7, 12, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-4{64, 7, 12, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-4{64, 12, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-4{64, 12, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-4{768, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 765.0000 766.0000 767.0000 + + +blk.4.attn_output.weight{768, 1280, 1, 1} n=768 0.000001 + -0.0368 -0.0368 -0.0369..., -0.0602 -0.0602 -0.0602 +kqv_merged_cont-4{768, 7, 1, 1} n=768 0.000000 + 0.0000 1.0000 2.0000..., 765.0000 766.0000 767.0000 +MUL_MAT == +kqv_out-4{1280, 7, 1, 1} n=768 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-4{1280, 7, 1, 1} n=1280 -1477.238159 + -3.9037 -2.9037 -1.9037..., 1273.0963 1274.0963 1275.0963 +l_out-3{1280, 7, 1, 1} n=1280 -1477.238159 + -3.9037 -2.9037 -1.9037..., 1273.0963 1274.0963 1275.0963 +ADD == +node_202{1280, 7, 1, 1} n=1280 -1477.238159 + -3.9037 -2.9037 -1.9037..., 1273.0963 1274.0963 1275.0963 + + +node_202{1280, 7, 1, 1} n=1280 -1477.238159 + -3.9037 -2.9037 -1.9037..., 1273.0963 1274.0963 1275.0963 +RMS_NORM == +norm-4{1280, 7, 1, 1} n=1280 -35.352757 + -0.0934 0.9066 1.9066..., 1276.9066 1277.9066 1278.9066 + + +norm-4{1280, 7, 1, 1} n=1280 -0.010547 + -0.0204 0.9796 1.9796..., 1276.9796 1277.9796 1278.9796 +blk.4.ffn_norm.weight{1280, 1, 1, 1} n=1280 420.961090 + 0.2186 1.2186 2.2186..., 1277.2186 1278.2186 1279.2186 +MUL == +ffn_norm-4{1280, 7, 1, 1} n=1280 -0.010547 + -0.0204 
0.9796 1.9796..., 1276.9796 1277.9796 1278.9796 + + +ffn_norm-4{1280, 7, 1, 1} n=1280 -0.707272 + -1.3697 -0.3697 0.6303..., 1275.6302 1276.6302 1277.6302 +RMS_NORM == +norm-4{1280, 7, 1, 1} n=1280 -0.707272 + -1.3697 -0.3697 0.6303..., 1275.6302 1276.6302 1277.6302 + + +norm-4{1280, 7, 1, 1} n=1280 -0.481583 + -0.2995 0.7005 1.7005..., 1276.7006 1277.7006 1278.7006 +blk.4.ffn_norm.weight{1280, 1, 1, 1} n=1280 420.961090 + 0.2186 1.2186 2.2186..., 1277.2186 1278.2186 1279.2186 +MUL == +ffn_norm-4{1280, 7, 1, 1} n=1280 -0.481583 + -0.2995 0.7005 1.7005..., 1276.7006 1277.7006 1278.7006 + + +blk.4.ffn_up.weight{1280, 3584, 1, 1} n=1280 -0.000000 + 0.0029 0.0029 0.0029..., 0.0067 0.0067 0.0067 +ffn_norm-4{1280, 7, 1, 1} n=1280 -0.481583 + -0.2995 0.7005 1.7005..., 1276.7006 1277.7006 1278.7006 +MUL_MAT == +ffn_silu-4{3584, 7, 1, 1} n=1280 445.606812 + 0.5861 1.5861 2.5861..., 3581.5862 3582.5862 3583.5862 + + +ffn_silu-4{3584, 7, 1, 1} n=3584 445.606812 + 0.5861 1.5861 2.5861..., 3581.5862 3582.5862 3583.5862 +VIEW == +ffn_up-4 (view){1792, 7, 1, 1} n=3584 432.058380 + 0.5861 1.5861 2.5861..., 1789.5862 1790.5862 1791.5862 + + +ffn_up-4 (view){1792, 7, 1, 1} n=1792 432.058380 + 0.5861 1.5861 2.5861..., 1789.5862 1790.5862 1791.5862 +CONT == +ffn_up-4 (view) (cont){1792, 7, 1, 1} n=1792 432.058380 + 0.5861 1.5861 2.5861..., 1789.5862 1790.5862 1791.5862 + + +ffn_up-4 (view) (cont){1792, 7, 1, 1} n=1792 421.246368 + 0.3765 1.3765 2.3765..., 1789.3765 1790.3765 1791.3765 +SILU == +node_210{1792, 7, 1, 1} n=1792 421.246368 + 0.3765 1.3765 2.3765..., 1789.3765 1790.3765 1791.3765 + + +ffn_silu-4{3584, 7, 1, 1} n=3584 445.606812 + 0.5861 1.5861 2.5861..., 3581.5862 3582.5862 3583.5862 +VIEW == +ffn_up-4 (view){1792, 7, 1, 1} n=3584 13.548252 + 0.2174 1.2174 2.2174..., 1789.2174 1790.2174 1791.2174 + + +ffn_up-4 (view){1792, 7, 1, 1} n=1792 13.548252 + 0.2174 1.2174 2.2174..., 1789.2174 1790.2174 1791.2174 +CONT == +ffn_up-4 (view) (cont){1792, 7, 1, 1} n=1792 13.548252 + 
0.2174 1.2174 2.2174..., 1789.2174 1790.2174 1791.2174 + + +node_210{1792, 7, 1, 1} n=1792 15.007300 + 0.0818 1.0818 2.0818..., 1789.0818 1790.0818 1791.0818 +ffn_up-4 (view) (cont){1792, 7, 1, 1} n=1792 13.548252 + 0.2174 1.2174 2.2174..., 1789.2174 1790.2174 1791.2174 +MUL == +ffn_mul-4{1792, 7, 1, 1} n=1792 15.007300 + 0.0818 1.0818 2.0818..., 1789.0818 1790.0818 1791.0818 + + +blk.4.ffn_down.weight{1792, 1280, 1, 1} n=1792 0.000000 + -0.0281 -0.0281 -0.0281..., -0.0966 -0.0967 -0.0967 +ffn_mul-4{1792, 7, 1, 1} n=1792 15.007300 + 0.0818 1.0818 2.0818..., 1789.0818 1790.0818 1791.0818 +MUL_MAT == +ffn_out-4{1280, 7, 1, 1} n=1792 56.152103 + -0.4933 0.5067 1.5067..., 1276.5067 1277.5067 1278.5067 + + +node_202{1280, 7, 1, 1} n=1280 -1421.083740 + -4.3969 -3.3969 -2.3969..., 1272.6030 1273.6030 1274.6030 +ffn_out-4{1280, 7, 1, 1} n=1280 56.152103 + -0.4933 0.5067 1.5067..., 1276.5067 1277.5067 1278.5067 +ADD == +l_out-4{1280, 7, 1, 1} n=1280 -1421.083740 + -4.3969 -3.3969 -2.3969..., 1272.6030 1273.6030 1274.6030 + + +l_out-4{1280, 7, 1, 1} n=1280 -1421.083740 + -4.3969 -3.3969 -2.3969..., 1272.6030 1273.6030 1274.6030 +RMS_NORM == +norm-5{1280, 7, 1, 1} n=1280 -33.967426 + -0.1051 0.8949 1.8949..., 1276.8949 1277.8949 1278.8949 + + +norm-5{1280, 7, 1, 1} n=1280 -2.186204 + -0.0344 0.9656 1.9656..., 1276.9656 1277.9656 1278.9656 +blk.5.attn_norm.weight{1280, 1, 1, 1} n=1280 214.716324 + 0.3273 1.3273 2.3273..., 1277.3273 1278.3273 1279.3273 +MUL == +attn_norm-5{1280, 7, 1, 1} n=1280 -2.186204 + -0.0344 0.9656 1.9656..., 1276.9656 1277.9656 1278.9656 + + +blk.5.attn_qkv.weight{1280, 1536, 1, 1} n=1280 -0.000002 + 0.0171 0.0171 0.0171..., 0.0418 0.0419 0.0419 +attn_norm-5{1280, 7, 1, 1} n=1280 -2.186204 + -0.0344 0.9656 1.9656..., 1276.9656 1277.9656 1278.9656 +MUL_MAT == +wqkv-5{1536, 7, 1, 1} n=1280 -6.933201 + 0.0025 1.0025 2.0025..., 1533.0024 1534.0024 1535.0024 + + +wqkv-5{1536, 7, 1, 1} n=1536 -6.933201 + 0.0025 1.0025 2.0025..., 1533.0024 1534.0024 1535.0024 
+VIEW == +wqkv-5 (view){64, 7, 16, 1} n=1536 -0.032918 + 0.0025 1.0025 2.0025..., 61.0025 62.0025 63.0025 + + +wqkv-5 (view){64, 7, 16, 1} n=64 -0.032918 + 0.0025 1.0025 2.0025..., 61.0025 62.0025 63.0025 +CONT == +Qcur-5{64, 7, 16, 1} n=64 -0.032918 + 0.0025 1.0025 2.0025..., 61.0025 62.0025 63.0025 + + +Qcur-5{64, 7, 16, 1} n=64 -0.307985 + 0.0230 1.0230 2.0230..., 61.0230 62.0230 63.0230 +RMS_NORM == +norm-5{64, 7, 16, 1} n=64 -0.307985 + 0.0230 1.0230 2.0230..., 61.0230 62.0230 63.0230 + + +norm-5{64, 7, 16, 1} n=64 -8.069510 + 0.0221 1.0221 2.0221..., 61.0221 62.0221 63.0221 +blk.5.attn_q_norm.weight{64, 1, 1, 1} n=64 125.485550 + 0.9609 1.9609 2.9609..., 61.9609 62.9609 63.9609 +MUL == +Qcur-5{64, 7, 16, 1} n=64 -8.069510 + 0.0221 1.0221 2.0221..., 61.0221 62.0221 63.0221 + + +Qcur-5{64, 7, 16, 1} n=64 -8.069510 + 0.0221 1.0221 2.0221..., 61.0221 62.0221 63.0221 +RESHAPE == +Qcur-5 (reshaped){64, 16, 7, 1} n=64 -8.069510 + 0.0221 1.0221 2.0221..., 61.0221 62.0221 63.0221 + + +Qcur-5 (reshaped){64, 16, 7, 1} n=64 -8.069510 + 0.0221 1.0221 2.0221..., 61.0221 62.0221 63.0221 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-5{64, 16, 7, 1} n=64 -8.069510 + 0.0221 1.0221 2.0221..., 61.0221 62.0221 63.0221 + + +Qcur-5{64, 16, 7, 1} n=64 -1.008689 + 0.0028 1.0028 2.0028..., 61.0028 62.0028 63.0028 +SCALE == +Qcur-5{64, 16, 7, 1} n=64 -1.008689 + 0.0028 1.0028 2.0028..., 61.0028 62.0028 63.0028 + + +wqkv-5{1536, 7, 1, 1} n=1536 -6.933201 + 0.0025 1.0025 2.0025..., 1533.0024 1534.0024 1535.0024 +VIEW == +wqkv-5 (view){64, 7, 4, 1} n=1536 -0.211188 + 0.0227 1.0227 2.0227..., 61.0227 62.0227 63.0227 + + +wqkv-5 (view){64, 7, 4, 1} n=64 -0.211188 + 0.0227 1.0227 2.0227..., 61.0227 62.0227 63.0227 +CONT == +Kcur-5{64, 7, 4, 1} n=64 -0.211188 + 0.0227 1.0227 2.0227..., 61.0227 62.0227 63.0227 + + +Kcur-5{64, 7, 4, 1} n=64 -1.964773 + 0.2109 1.2109 2.2109..., 61.2109 62.2109 63.2109 +RMS_NORM == +norm-5{64, 7, 4, 1} n=64 
-1.964773 + 0.2109 1.2109 2.2109..., 61.2109 62.2109 63.2109 + + +norm-5{64, 7, 4, 1} n=64 -1.498763 + 0.0079 1.0079 2.0079..., 61.0079 62.0079 63.0079 +blk.5.attn_k_norm.weight{64, 1, 1, 1} n=64 125.694138 + 0.0374 1.0374 2.0374..., 61.0374 62.0374 63.0374 +MUL == +Kcur-5{64, 7, 4, 1} n=64 -1.498763 + 0.0079 1.0079 2.0079..., 61.0079 62.0079 63.0079 + + +Kcur-5{64, 7, 4, 1} n=64 -1.498763 + 0.0079 1.0079 2.0079..., 61.0079 62.0079 63.0079 +RESHAPE == +Kcur-5 (reshaped){64, 4, 7, 1} n=64 -1.498763 + 0.0079 1.0079 2.0079..., 61.0079 62.0079 63.0079 + + +Kcur-5 (reshaped){64, 4, 7, 1} n=64 -1.498763 + 0.0079 1.0079 2.0079..., 61.0079 62.0079 63.0079 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_231{64, 4, 7, 1} n=64 -1.498763 + 0.0079 1.0079 2.0079..., 61.0079 62.0079 63.0079 + + +node_231{64, 4, 7, 1} n=64 -1.498763 + 0.0079 1.0079 2.0079..., 61.0079 62.0079 63.0079 +REPEAT == +node_232{64, 4, 28, 1} n=64 -1.498763 + 0.0079 1.0079 2.0079..., 61.0079 62.0079 63.0079 + + +wqkv-5{1536, 7, 1, 1} n=1536 -6.933201 + 0.0025 1.0025 2.0025..., 1533.0024 1534.0024 1535.0024 +VIEW == +wqkv-5 (view){64, 7, 4, 1} n=1536 -0.851659 + -0.0234 0.9766 1.9766..., 60.9766 61.9766 62.9766 + + +wqkv-5 (view){64, 7, 4, 1} n=64 -0.851659 + -0.0234 0.9766 1.9766..., 60.9766 61.9766 62.9766 +CONT == +Vcur-5{64, 7, 4, 1} n=64 -0.851659 + -0.0234 0.9766 1.9766..., 60.9766 61.9766 62.9766 + + +Vcur-5{64, 7, 4, 1} n=64 -0.851659 + -0.0234 0.9766 1.9766..., 60.9766 61.9766 62.9766 +REPEAT == +node_235{64, 7, 16, 1} n=64 -0.851659 + -0.0234 0.9766 1.9766..., 60.9766 61.9766 62.9766 + + +cache_v_l5{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-5{32, 64, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l5{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-5{64, 32, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 
0.0000 0.0000 + + +Qcur-5{64, 16, 7, 1} n=64 -1.008689 + 0.0028 1.0028 2.0028..., 61.0028 62.0028 63.0028 +PERMUTE == +q-5{64, 7, 16, 1} n=64 -1.008689 + 0.0028 1.0028 2.0028..., 61.0028 62.0028 63.0028 + + +k-5{64, 32, 8, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-5{64, 7, 16, 1} n=64 -1.008689 + 0.0028 1.0028 2.0028..., 61.0028 62.0028 63.0028 +MUL_MAT == +kq-5{32, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-5{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-5{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-5{32, 64, 8, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-5{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-5{64, 7, 16, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-5{64, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-5{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-5{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-5{1024, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 + + +blk.5.attn_output.weight{1024, 1280, 1, 1} n=1024 -0.000000 + -0.0053 -0.0053 -0.0053..., -0.0105 -0.0105 -0.0105 +kqv_merged_cont-5{1024, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 +MUL_MAT == +kqv_out-5{1280, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-5{1280, 7, 1, 1} n=1280 -1421.083740 + -4.3969 -3.3969 -2.3969..., 1272.6030 1273.6030 1274.6030 +l_out-4{1280, 7, 1, 1} n=1280 -1421.083740 + -4.3969 -3.3969 -2.3969..., 1272.6030 1273.6030 1274.6030 +ADD == 
+node_245{1280, 7, 1, 1} n=1280 -1421.083740 + -4.3969 -3.3969 -2.3969..., 1272.6030 1273.6030 1274.6030 + + +node_245{1280, 7, 1, 1} n=1280 -1421.083740 + -4.3969 -3.3969 -2.3969..., 1272.6030 1273.6030 1274.6030 +RMS_NORM == +norm-5{1280, 7, 1, 1} n=1280 -33.967426 + -0.1051 0.8949 1.8949..., 1276.8949 1277.8949 1278.8949 + + +norm-5{1280, 7, 1, 1} n=1280 0.477283 + -0.0243 0.9757 1.9757..., 1276.9757 1277.9757 1278.9757 +blk.5.ffn_norm.weight{1280, 1, 1, 1} n=1280 455.270050 + 0.2314 1.2314 2.2314..., 1277.2314 1278.2314 1279.2314 +MUL == +ffn_norm-5{1280, 7, 1, 1} n=1280 0.477283 + -0.0243 0.9757 1.9757..., 1276.9757 1277.9757 1278.9757 + + +ffn_norm-5{1280, 7, 1, 1} n=1280 26.799025 + -1.3656 -0.3656 0.6344..., 1275.6344 1276.6344 1277.6344 +RMS_NORM == +norm-5{1280, 7, 1, 1} n=1280 26.799025 + -1.3656 -0.3656 0.6344..., 1275.6344 1276.6344 1277.6344 + + +norm-5{1280, 7, 1, 1} n=1280 8.173255 + -0.3160 0.6840 1.6840..., 1276.6840 1277.6840 1278.6840 +blk.5.ffn_norm.weight{1280, 1, 1, 1} n=1280 455.270050 + 0.2314 1.2314 2.2314..., 1277.2314 1278.2314 1279.2314 +MUL == +ffn_norm-5{1280, 7, 1, 1} n=1280 8.173255 + -0.3160 0.6840 1.6840..., 1276.6840 1277.6840 1278.6840 + + +blk.5.ffn_up.weight{1280, 4096, 1, 1} n=1280 0.000000 + 0.0504 0.0504 0.0504..., 0.1162 0.1163 0.1163 +ffn_norm-5{1280, 7, 1, 1} n=1280 8.173255 + -0.3160 0.6840 1.6840..., 1276.6840 1277.6840 1278.6840 +MUL_MAT == +ffn_silu-5{4096, 7, 1, 1} n=1280 562.061157 + 0.2484 1.2484 2.2484..., 4093.2485 4094.2485 4095.2485 + + +ffn_silu-5{4096, 7, 1, 1} n=4096 562.061157 + 0.2484 1.2484 2.2484..., 4093.2485 4094.2485 4095.2485 +VIEW == +ffn_up-5 (view){2048, 7, 1, 1} n=4096 587.145508 + 0.2484 1.2484 2.2484..., 2045.2484 2046.2484 2047.2484 + + +ffn_up-5 (view){2048, 7, 1, 1} n=2048 587.145508 + 0.2484 1.2484 2.2484..., 2045.2484 2046.2484 2047.2484 +CONT == +ffn_up-5 (view) (cont){2048, 7, 1, 1} n=2048 587.145508 + 0.2484 1.2484 2.2484..., 2045.2484 2046.2484 2047.2484 + + +ffn_up-5 (view) 
(cont){2048, 7, 1, 1} n=2048 573.041504 + 0.1395 1.1395 2.1395..., 2045.1395 2046.1395 2047.1395 +SILU == +node_253{2048, 7, 1, 1} n=2048 573.041504 + 0.1395 1.1395 2.1395..., 2045.1395 2046.1395 2047.1395 + + +ffn_silu-5{4096, 7, 1, 1} n=4096 562.061157 + 0.2484 1.2484 2.2484..., 4093.2485 4094.2485 4095.2485 +VIEW == +ffn_up-5 (view){2048, 7, 1, 1} n=4096 -25.084682 + -0.2336 0.7664 1.7664..., 2044.7664 2045.7664 2046.7664 + + +ffn_up-5 (view){2048, 7, 1, 1} n=2048 -25.084682 + -0.2336 0.7664 1.7664..., 2044.7664 2045.7664 2046.7664 +CONT == +ffn_up-5 (view) (cont){2048, 7, 1, 1} n=2048 -25.084682 + -0.2336 0.7664 1.7664..., 2044.7664 2045.7664 2046.7664 + + +node_253{2048, 7, 1, 1} n=2048 5.085576 + -0.0326 0.9674 1.9674..., 2044.9674 2045.9674 2046.9674 +ffn_up-5 (view) (cont){2048, 7, 1, 1} n=2048 -25.084682 + -0.2336 0.7664 1.7664..., 2044.7664 2045.7664 2046.7664 +MUL == +ffn_mul-5{2048, 7, 1, 1} n=2048 5.085576 + -0.0326 0.9674 1.9674..., 2044.9674 2045.9674 2046.9674 + + +blk.5.ffn_down.weight{2048, 1280, 1, 1} n=2048 -0.000000 + -0.0385 -0.0385 -0.0386..., -0.1537 -0.1538 -0.1539 +ffn_mul-5{2048, 7, 1, 1} n=2048 5.085576 + -0.0326 0.9674 1.9674..., 2044.9674 2045.9674 2046.9674 +MUL_MAT == +ffn_out-5{1280, 7, 1, 1} n=2048 -14.446003 + -0.2989 0.7011 1.7011..., 1276.7012 1277.7012 1278.7012 + + +node_245{1280, 7, 1, 1} n=1280 -1435.531860 + -4.6958 -3.6958 -2.6958..., 1272.3042 1273.3042 1274.3042 +ffn_out-5{1280, 7, 1, 1} n=1280 -14.446003 + -0.2989 0.7011 1.7011..., 1276.7012 1277.7012 1278.7012 +ADD == +l_out-5{1280, 7, 1, 1} n=1280 -1435.531860 + -4.6958 -3.6958 -2.6958..., 1272.3042 1273.3042 1274.3042 + + +l_out-5{1280, 7, 1, 1} n=1280 -1435.531860 + -4.6958 -3.6958 -2.6958..., 1272.3042 1273.3042 1274.3042 +RMS_NORM == +norm-6{1280, 7, 1, 1} n=1280 -34.294418 + -0.1122 0.8878 1.8878..., 1276.8878 1277.8878 1278.8878 + + +norm-6{1280, 7, 1, 1} n=1280 -2.535349 + -0.0488 0.9512 1.9512..., 1276.9512 1277.9512 1278.9512 +blk.6.attn_norm.weight{1280, 1, 
1, 1} n=1280 234.621826 + 0.4352 1.4352 2.4352..., 1277.4352 1278.4352 1279.4352 +MUL == +attn_norm-6{1280, 7, 1, 1} n=1280 -2.535349 + -0.0488 0.9512 1.9512..., 1276.9512 1277.9512 1278.9512 + + +blk.6.attn_qkv.weight{1280, 1536, 1, 1} n=1280 -0.000008 + -0.1671 -0.1672 -0.1674..., -0.3960 -0.3962 -0.3965 +attn_norm-6{1280, 7, 1, 1} n=1280 -2.535349 + -0.0488 0.9512 1.9512..., 1276.9512 1277.9512 1278.9512 +MUL_MAT == +wqkv-6{1536, 7, 1, 1} n=1280 0.738848 + -0.0424 0.9576 1.9576..., 1532.9575 1533.9575 1534.9575 + + +wqkv-6{1536, 7, 1, 1} n=1536 0.738848 + -0.0424 0.9576 1.9576..., 1532.9575 1533.9575 1534.9575 +VIEW == +wqkv-6 (view){64, 7, 16, 1} n=1536 1.665520 + -0.0424 0.9576 1.9576..., 60.9576 61.9576 62.9576 + + +wqkv-6 (view){64, 7, 16, 1} n=64 1.665520 + -0.0424 0.9576 1.9576..., 60.9576 61.9576 62.9576 +CONT == +Qcur-6{64, 7, 16, 1} n=64 1.665520 + -0.0424 0.9576 1.9576..., 60.9576 61.9576 62.9576 + + +Qcur-6{64, 7, 16, 1} n=64 6.937180 + -0.1768 0.8232 1.8232..., 60.8232 61.8232 62.8232 +RMS_NORM == +norm-6{64, 7, 16, 1} n=64 6.937180 + -0.1768 0.8232 1.8232..., 60.8232 61.8232 62.8232 + + +norm-6{64, 7, 16, 1} n=64 5.766212 + -0.3926 0.6074 1.6074..., 60.6074 61.6074 62.6074 +blk.6.attn_q_norm.weight{64, 1, 1, 1} n=64 126.051430 + 2.2204 3.2204 4.2204..., 63.2204 64.2204 65.2204 +MUL == +Qcur-6{64, 7, 16, 1} n=64 5.766212 + -0.3926 0.6074 1.6074..., 60.6074 61.6074 62.6074 + + +Qcur-6{64, 7, 16, 1} n=64 5.766212 + -0.3926 0.6074 1.6074..., 60.6074 61.6074 62.6074 +RESHAPE == +Qcur-6 (reshaped){64, 16, 7, 1} n=64 5.766212 + -0.3926 0.6074 1.6074..., 60.6074 61.6074 62.6074 + + +Qcur-6 (reshaped){64, 16, 7, 1} n=64 5.766212 + -0.3926 0.6074 1.6074..., 60.6074 61.6074 62.6074 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-6{64, 16, 7, 1} n=64 5.766212 + -0.3926 0.6074 1.6074..., 60.6074 61.6074 62.6074 + + +Qcur-6{64, 16, 7, 1} n=64 0.720777 + -0.0491 0.9509 1.9509..., 60.9509 61.9509 62.9509 +SCALE == 
+Qcur-6{64, 16, 7, 1} n=64 0.720777 + -0.0491 0.9509 1.9509..., 60.9509 61.9509 62.9509 + + +wqkv-6{1536, 7, 1, 1} n=1536 0.738848 + -0.0424 0.9576 1.9576..., 1532.9575 1533.9575 1534.9575 +VIEW == +wqkv-6 (view){64, 7, 4, 1} n=1536 1.986943 + -0.0083 0.9917 1.9917..., 60.9917 61.9917 62.9917 + + +wqkv-6 (view){64, 7, 4, 1} n=64 1.986943 + -0.0083 0.9917 1.9917..., 60.9917 61.9917 62.9917 +CONT == +Kcur-6{64, 7, 4, 1} n=64 1.986943 + -0.0083 0.9917 1.9917..., 60.9917 61.9917 62.9917 + + +Kcur-6{64, 7, 4, 1} n=64 12.948477 + -0.0541 0.9459 1.9459..., 60.9459 61.9459 62.9459 +RMS_NORM == +norm-6{64, 7, 4, 1} n=64 12.948477 + -0.0541 0.9459 1.9459..., 60.9459 61.9459 62.9459 + + +norm-6{64, 7, 4, 1} n=64 27.534906 + -0.1371 0.8629 1.8629..., 60.8629 61.8629 62.8629 +blk.6.attn_k_norm.weight{64, 1, 1, 1} n=64 130.857529 + 2.5354 3.5354 4.5354..., 63.5354 64.5354 65.5354 +MUL == +Kcur-6{64, 7, 4, 1} n=64 27.534906 + -0.1371 0.8629 1.8629..., 60.8629 61.8629 62.8629 + + +Kcur-6{64, 7, 4, 1} n=64 27.534906 + -0.1371 0.8629 1.8629..., 60.8629 61.8629 62.8629 +RESHAPE == +Kcur-6 (reshaped){64, 4, 7, 1} n=64 27.534906 + -0.1371 0.8629 1.8629..., 60.8629 61.8629 62.8629 + + +Kcur-6 (reshaped){64, 4, 7, 1} n=64 27.534906 + -0.1371 0.8629 1.8629..., 60.8629 61.8629 62.8629 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_274{64, 4, 7, 1} n=64 27.534906 + -0.1371 0.8629 1.8629..., 60.8629 61.8629 62.8629 + + +node_274{64, 4, 7, 1} n=64 27.534906 + -0.1371 0.8629 1.8629..., 60.8629 61.8629 62.8629 +REPEAT == +node_275{64, 4, 28, 1} n=64 27.534906 + -0.1371 0.8629 1.8629..., 60.8629 61.8629 62.8629 + + +wqkv-6{1536, 7, 1, 1} n=1536 0.738848 + -0.0424 0.9576 1.9576..., 1532.9575 1533.9575 1534.9575 +VIEW == +wqkv-6 (view){64, 7, 4, 1} n=1536 0.885298 + -0.0301 0.9699 1.9699..., 60.9699 61.9699 62.9699 + + +wqkv-6 (view){64, 7, 4, 1} n=64 0.885298 + -0.0301 0.9699 1.9699..., 60.9699 61.9699 62.9699 +CONT == +Vcur-6{64, 7, 4, 1} n=64 
0.885298 + -0.0301 0.9699 1.9699..., 60.9699 61.9699 62.9699 + + +Vcur-6{64, 7, 4, 1} n=64 0.885298 + -0.0301 0.9699 1.9699..., 60.9699 61.9699 62.9699 +REPEAT == +node_278{64, 7, 16, 1} n=64 0.885298 + -0.0301 0.9699 1.9699..., 60.9699 61.9699 62.9699 + + +cache_v_l6{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-6{32, 64, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l6{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-6{64, 32, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-6{64, 16, 7, 1} n=64 0.720777 + -0.0491 0.9509 1.9509..., 60.9509 61.9509 62.9509 +PERMUTE == +q-6{64, 7, 16, 1} n=64 0.720777 + -0.0491 0.9509 1.9509..., 60.9509 61.9509 62.9509 + + +k-6{64, 32, 8, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-6{64, 7, 16, 1} n=64 0.720777 + -0.0491 0.9509 1.9509..., 60.9509 61.9509 62.9509 +MUL_MAT == +kq-6{32, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-6{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-6{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-6{32, 64, 8, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-6{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-6{64, 7, 16, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-6{64, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-6{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-6{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-6{1024, 7, 1, 1} n=64 
0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 + + +blk.6.attn_output.weight{1024, 1280, 1, 1} n=1024 0.000003 + -0.3416 -0.3418 -0.3420..., -0.6816 -0.6821 -0.6826 +kqv_merged_cont-6{1024, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 +MUL_MAT == +kqv_out-6{1280, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-6{1280, 7, 1, 1} n=1280 -1435.531860 + -4.6958 -3.6958 -2.6958..., 1272.3042 1273.3042 1274.3042 +l_out-5{1280, 7, 1, 1} n=1280 -1435.531860 + -4.6958 -3.6958 -2.6958..., 1272.3042 1273.3042 1274.3042 +ADD == +node_288{1280, 7, 1, 1} n=1280 -1435.531860 + -4.6958 -3.6958 -2.6958..., 1272.3042 1273.3042 1274.3042 + + +node_288{1280, 7, 1, 1} n=1280 -1435.531860 + -4.6958 -3.6958 -2.6958..., 1272.3042 1273.3042 1274.3042 +RMS_NORM == +norm-6{1280, 7, 1, 1} n=1280 -34.294418 + -0.1122 0.8878 1.8878..., 1276.8878 1277.8878 1278.8878 + + +norm-6{1280, 7, 1, 1} n=1280 -2.334821 + -0.0282 0.9718 1.9718..., 1276.9718 1277.9718 1278.9718 +blk.6.ffn_norm.weight{1280, 1, 1, 1} n=1280 478.438324 + 0.2510 1.2510 2.2510..., 1277.2510 1278.2510 1279.2510 +MUL == +ffn_norm-6{1280, 7, 1, 1} n=1280 -2.334821 + -0.0282 0.9718 1.9718..., 1276.9718 1277.9718 1278.9718 + + +ffn_norm-6{1280, 7, 1, 1} n=1280 -29.175863 + -0.3519 0.6481 1.6481..., 1276.6482 1277.6482 1278.6482 +RMS_NORM == +norm-6{1280, 7, 1, 1} n=1280 -29.175863 + -0.3519 0.6481 1.6481..., 1276.6482 1277.6482 1278.6482 + + +norm-6{1280, 7, 1, 1} n=1280 -1.018026 + -0.0883 0.9117 1.9117..., 1276.9117 1277.9117 1278.9117 +blk.6.ffn_norm.weight{1280, 1, 1, 1} n=1280 478.438324 + 0.2510 1.2510 2.2510..., 1277.2510 1278.2510 1279.2510 +MUL == +ffn_norm-6{1280, 7, 1, 1} n=1280 -1.018026 + -0.0883 0.9117 1.9117..., 1276.9117 1277.9117 1278.9117 + + +blk.6.ffn_up.weight{1280, 5120, 1, 1} n=1280 0.000000 + 0.0445 0.0445 0.0446..., 0.1044 0.1045 0.1046 +ffn_norm-6{1280, 7, 1, 1} n=1280 -1.018026 + -0.0883 0.9117 1.9117..., 
1276.9117 1277.9117 1278.9117 +MUL_MAT == +ffn_silu-6{5120, 7, 1, 1} n=1280 -127.244637 + -0.0939 0.9061 1.9061..., 5116.9062 5117.9062 5118.9062 + + +ffn_silu-6{5120, 7, 1, 1} n=5120 -127.244637 + -0.0939 0.9061 1.9061..., 5116.9062 5117.9062 5118.9062 +VIEW == +ffn_up-6 (view){2560, 7, 1, 1} n=5120 -116.799065 + -0.0939 0.9061 1.9061..., 2556.9060 2557.9060 2558.9060 + + +ffn_up-6 (view){2560, 7, 1, 1} n=2560 -116.799065 + -0.0939 0.9061 1.9061..., 2556.9060 2557.9060 2558.9060 +CONT == +ffn_up-6 (view) (cont){2560, 7, 1, 1} n=2560 -116.799065 + -0.0939 0.9061 1.9061..., 2556.9060 2557.9060 2558.9060 + + +ffn_up-6 (view) (cont){2560, 7, 1, 1} n=2560 -20.447285 + -0.0448 0.9552 1.9552..., 2556.9553 2557.9553 2558.9553 +SILU == +node_296{2560, 7, 1, 1} n=2560 -20.447285 + -0.0448 0.9552 1.9552..., 2556.9553 2557.9553 2558.9553 + + +ffn_silu-6{5120, 7, 1, 1} n=5120 -127.244637 + -0.0939 0.9061 1.9061..., 5116.9062 5117.9062 5118.9062 +VIEW == +ffn_up-6 (view){2560, 7, 1, 1} n=5120 -10.445560 + -0.0690 0.9310 1.9310..., 2556.9309 2557.9309 2558.9309 + + +ffn_up-6 (view){2560, 7, 1, 1} n=2560 -10.445560 + -0.0690 0.9310 1.9310..., 2556.9309 2557.9309 2558.9309 +CONT == +ffn_up-6 (view) (cont){2560, 7, 1, 1} n=2560 -10.445560 + -0.0690 0.9310 1.9310..., 2556.9309 2557.9309 2558.9309 + + +node_296{2560, 7, 1, 1} n=2560 0.291427 + 0.0031 1.0031 2.0031..., 2557.0032 2558.0032 2559.0032 +ffn_up-6 (view) (cont){2560, 7, 1, 1} n=2560 -10.445560 + -0.0690 0.9310 1.9310..., 2556.9309 2557.9309 2558.9309 +MUL == +ffn_mul-6{2560, 7, 1, 1} n=2560 0.291427 + 0.0031 1.0031 2.0031..., 2557.0032 2558.0032 2559.0032 + + +blk.6.ffn_down.weight{2560, 1280, 1, 1} n=2560 -0.000000 + -0.0090 -0.0090 -0.0090..., -0.0516 -0.0516 -0.0516 +ffn_mul-6{2560, 7, 1, 1} n=2560 0.291427 + 0.0031 1.0031 2.0031..., 2557.0032 2558.0032 2559.0032 +MUL_MAT == +ffn_out-6{1280, 7, 1, 1} n=2560 2.848215 + 0.0500 1.0500 2.0500..., 1277.0500 1278.0500 1279.0500 + + +node_288{1280, 7, 1, 1} n=1280 -1432.683960 
+ -4.6458 -3.6458 -2.6458..., 1272.3542 1273.3542 1274.3542 +ffn_out-6{1280, 7, 1, 1} n=1280 2.848215 + 0.0500 1.0500 2.0500..., 1277.0500 1278.0500 1279.0500 +ADD == +l_out-6{1280, 7, 1, 1} n=1280 -1432.683960 + -4.6458 -3.6458 -2.6458..., 1272.3542 1273.3542 1274.3542 + + +l_out-6{1280, 7, 1, 1} n=1280 -1432.683960 + -4.6458 -3.6458 -2.6458..., 1272.3542 1273.3542 1274.3542 +RMS_NORM == +norm-7{1280, 7, 1, 1} n=1280 -34.223675 + -0.1110 0.8890 1.8890..., 1276.8890 1277.8890 1278.8890 + + +norm-7{1280, 7, 1, 1} n=1280 -4.651794 + -0.0620 0.9380 1.9380..., 1276.9380 1277.9380 1278.9380 +blk.7.attn_norm.weight{1280, 1, 1, 1} n=1280 336.079346 + 0.5588 1.5588 2.5588..., 1277.5588 1278.5588 1279.5588 +MUL == +attn_norm-7{1280, 7, 1, 1} n=1280 -4.651794 + -0.0620 0.9380 1.9380..., 1276.9380 1277.9380 1278.9380 + + +blk.7.attn_qkv.weight{1280, 1536, 1, 1} n=1280 -0.000015 + 0.0088 0.0088 0.0088..., 0.0215 0.0215 0.0215 +attn_norm-7{1280, 7, 1, 1} n=1280 -4.651794 + -0.0620 0.9380 1.9380..., 1276.9380 1277.9380 1278.9380 +MUL_MAT == +wqkv-7{1536, 7, 1, 1} n=1280 -2.578517 + 0.5758 1.5758 2.5758..., 1533.5758 1534.5758 1535.5758 + + +wqkv-7{1536, 7, 1, 1} n=1536 -2.578517 + 0.5758 1.5758 2.5758..., 1533.5758 1534.5758 1535.5758 +VIEW == +wqkv-7 (view){64, 7, 16, 1} n=1536 0.360070 + 0.5758 1.5758 2.5758..., 61.5758 62.5758 63.5758 + + +wqkv-7 (view){64, 7, 16, 1} n=64 0.360070 + 0.5758 1.5758 2.5758..., 61.5758 62.5758 63.5758 +CONT == +Qcur-7{64, 7, 16, 1} n=64 0.360070 + 0.5758 1.5758 2.5758..., 61.5758 62.5758 63.5758 + + +Qcur-7{64, 7, 16, 1} n=64 2.114490 + 3.3811 4.3811 5.3811..., 64.3811 65.3811 66.3811 +RMS_NORM == +norm-7{64, 7, 16, 1} n=64 2.114490 + 3.3811 4.3811 5.3811..., 64.3811 65.3811 66.3811 + + +norm-7{64, 7, 16, 1} n=64 -7.313521 + 0.0344 1.0344 2.0344..., 61.0344 62.0344 63.0344 +blk.7.attn_q_norm.weight{64, 1, 1, 1} n=64 117.833115 + 0.0102 1.0102 2.0102..., 61.0102 62.0102 63.0102 +MUL == +Qcur-7{64, 7, 16, 1} n=64 -7.313521 + 0.0344 1.0344 
2.0344..., 61.0344 62.0344 63.0344 + + +Qcur-7{64, 7, 16, 1} n=64 -7.313521 + 0.0344 1.0344 2.0344..., 61.0344 62.0344 63.0344 +RESHAPE == +Qcur-7 (reshaped){64, 16, 7, 1} n=64 -7.313521 + 0.0344 1.0344 2.0344..., 61.0344 62.0344 63.0344 + + +Qcur-7 (reshaped){64, 16, 7, 1} n=64 -7.313521 + 0.0344 1.0344 2.0344..., 61.0344 62.0344 63.0344 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-7{64, 16, 7, 1} n=64 -7.313521 + 0.0344 1.0344 2.0344..., 61.0344 62.0344 63.0344 + + +Qcur-7{64, 16, 7, 1} n=64 -0.914190 + 0.0043 1.0043 2.0043..., 61.0043 62.0043 63.0043 +SCALE == +Qcur-7{64, 16, 7, 1} n=64 -0.914190 + 0.0043 1.0043 2.0043..., 61.0043 62.0043 63.0043 + + +wqkv-7{1536, 7, 1, 1} n=1536 -2.578517 + 0.5758 1.5758 2.5758..., 1533.5758 1534.5758 1535.5758 +VIEW == +wqkv-7 (view){64, 7, 4, 1} n=1536 -0.319815 + 0.0735 1.0735 2.0735..., 61.0735 62.0735 63.0735 + + +wqkv-7 (view){64, 7, 4, 1} n=64 -0.319815 + 0.0735 1.0735 2.0735..., 61.0735 62.0735 63.0735 +CONT == +Kcur-7{64, 7, 4, 1} n=64 -0.319815 + 0.0735 1.0735 2.0735..., 61.0735 62.0735 63.0735 + + +Kcur-7{64, 7, 4, 1} n=64 -1.941767 + 0.4461 1.4461 2.4461..., 61.4461 62.4461 63.4461 +RMS_NORM == +norm-7{64, 7, 4, 1} n=64 -1.941767 + 0.4461 1.4461 2.4461..., 61.4461 62.4461 63.4461 + + +norm-7{64, 7, 4, 1} n=64 1.407578 + -0.0189 0.9811 1.9811..., 60.9811 61.9811 62.9811 +blk.7.attn_k_norm.weight{64, 1, 1, 1} n=64 114.663406 + -0.0423 0.9577 1.9577..., 60.9577 61.9577 62.9577 +MUL == +Kcur-7{64, 7, 4, 1} n=64 1.407578 + -0.0189 0.9811 1.9811..., 60.9811 61.9811 62.9811 + + +Kcur-7{64, 7, 4, 1} n=64 1.407578 + -0.0189 0.9811 1.9811..., 60.9811 61.9811 62.9811 +RESHAPE == +Kcur-7 (reshaped){64, 4, 7, 1} n=64 1.407578 + -0.0189 0.9811 1.9811..., 60.9811 61.9811 62.9811 + + +Kcur-7 (reshaped){64, 4, 7, 1} n=64 1.407578 + -0.0189 0.9811 1.9811..., 60.9811 61.9811 62.9811 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == 
+node_317{64, 4, 7, 1} n=64 1.407578 + -0.0189 0.9811 1.9811..., 60.9811 61.9811 62.9811 + + +node_317{64, 4, 7, 1} n=64 1.407578 + -0.0189 0.9811 1.9811..., 60.9811 61.9811 62.9811 +REPEAT == +node_318{64, 4, 28, 1} n=64 1.407578 + -0.0189 0.9811 1.9811..., 60.9811 61.9811 62.9811 + + +wqkv-7{1536, 7, 1, 1} n=1536 -2.578517 + 0.5758 1.5758 2.5758..., 1533.5758 1534.5758 1535.5758 +VIEW == +wqkv-7 (view){64, 7, 4, 1} n=1536 0.426329 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 + + +wqkv-7 (view){64, 7, 4, 1} n=64 0.426329 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 +CONT == +Vcur-7{64, 7, 4, 1} n=64 0.426329 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 + + +Vcur-7{64, 7, 4, 1} n=64 0.426329 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 +REPEAT == +node_321{64, 7, 16, 1} n=64 0.426329 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 + + +cache_v_l7{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-7{32, 64, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l7{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-7{64, 32, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-7{64, 16, 7, 1} n=64 -0.914190 + 0.0043 1.0043 2.0043..., 61.0043 62.0043 63.0043 +PERMUTE == +q-7{64, 7, 16, 1} n=64 -0.914190 + 0.0043 1.0043 2.0043..., 61.0043 62.0043 63.0043 + + +k-7{64, 32, 8, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-7{64, 7, 16, 1} n=64 -0.914190 + 0.0043 1.0043 2.0043..., 61.0043 62.0043 63.0043 +MUL_MAT == +kq-7{32, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-7{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-7{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + 
+v-7{32, 64, 8, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-7{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-7{64, 7, 16, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-7{64, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-7{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-7{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-7{1024, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 + + +blk.7.attn_output.weight{1024, 1280, 1, 1} n=1024 0.000041 + 0.0262 0.0263 0.0263..., 0.0524 0.0524 0.0525 +kqv_merged_cont-7{1024, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 +MUL_MAT == +kqv_out-7{1280, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-7{1280, 7, 1, 1} n=1280 -1432.683960 + -4.6458 -3.6458 -2.6458..., 1272.3542 1273.3542 1274.3542 +l_out-6{1280, 7, 1, 1} n=1280 -1432.683960 + -4.6458 -3.6458 -2.6458..., 1272.3542 1273.3542 1274.3542 +ADD == +node_331{1280, 7, 1, 1} n=1280 -1432.683960 + -4.6458 -3.6458 -2.6458..., 1272.3542 1273.3542 1274.3542 + + +node_331{1280, 7, 1, 1} n=1280 -1432.683960 + -4.6458 -3.6458 -2.6458..., 1272.3542 1273.3542 1274.3542 +RMS_NORM == +norm-7{1280, 7, 1, 1} n=1280 -34.223675 + -0.1110 0.8890 1.8890..., 1276.8890 1277.8890 1278.8890 + + +norm-7{1280, 7, 1, 1} n=1280 -4.994317 + -0.0279 0.9721 1.9721..., 1276.9720 1277.9720 1278.9720 +blk.7.ffn_norm.weight{1280, 1, 1, 1} n=1280 507.271667 + 0.2514 1.2514 2.2514..., 1277.2515 1278.2515 1279.2515 +MUL == +ffn_norm-7{1280, 7, 1, 1} n=1280 -4.994317 + -0.0279 0.9721 1.9721..., 1276.9720 1277.9720 1278.9720 + + +ffn_norm-7{1280, 7, 1, 1} n=1280 -32.358723 + -0.1808 0.8192 1.8192..., 1276.8192 1277.8192 1278.8192 
+RMS_NORM == +norm-7{1280, 7, 1, 1} n=1280 -32.358723 + -0.1808 0.8192 1.8192..., 1276.8192 1277.8192 1278.8192 + + +norm-7{1280, 7, 1, 1} n=1280 -4.402473 + -0.0455 0.9545 1.9545..., 1276.9546 1277.9546 1278.9546 +blk.7.ffn_norm.weight{1280, 1, 1, 1} n=1280 507.271667 + 0.2514 1.2514 2.2514..., 1277.2515 1278.2515 1279.2515 +MUL == +ffn_norm-7{1280, 7, 1, 1} n=1280 -4.402473 + -0.0455 0.9545 1.9545..., 1276.9546 1277.9546 1278.9546 + + +blk.7.ffn_up.weight{1280, 5632, 1, 1} n=1280 0.000000 + -0.0254 -0.0254 -0.0254..., -0.0585 -0.0585 -0.0586 +ffn_norm-7{1280, 7, 1, 1} n=1280 -4.402473 + -0.0455 0.9545 1.9545..., 1276.9546 1277.9546 1278.9546 +MUL_MAT == +ffn_silu-7{5632, 7, 1, 1} n=1280 -532.842163 + -0.1681 0.8319 1.8319..., 5628.8320 5629.8320 5630.8320 + + +ffn_silu-7{5632, 7, 1, 1} n=5632 -532.842163 + -0.1681 0.8319 1.8319..., 5628.8320 5629.8320 5630.8320 +VIEW == +ffn_up-7 (view){2816, 7, 1, 1} n=5632 -536.338257 + -0.1681 0.8319 1.8319..., 2812.8320 2813.8320 2814.8320 + + +ffn_up-7 (view){2816, 7, 1, 1} n=2816 -536.338257 + -0.1681 0.8319 1.8319..., 2812.8320 2813.8320 2814.8320 +CONT == +ffn_up-7 (view) (cont){2816, 7, 1, 1} n=2816 -536.338257 + -0.1681 0.8319 1.8319..., 2812.8320 2813.8320 2814.8320 + + +ffn_up-7 (view) (cont){2816, 7, 1, 1} n=2816 -207.470413 + -0.0770 0.9230 1.9230..., 2812.9229 2813.9229 2814.9229 +SILU == +node_339{2816, 7, 1, 1} n=2816 -207.470413 + -0.0770 0.9230 1.9230..., 2812.9229 2813.9229 2814.9229 + + +ffn_silu-7{5632, 7, 1, 1} n=5632 -532.842163 + -0.1681 0.8319 1.8319..., 5628.8320 5629.8320 5630.8320 +VIEW == +ffn_up-7 (view){2816, 7, 1, 1} n=5632 3.497948 + -0.0221 0.9779 1.9779..., 2812.9778 2813.9778 2814.9778 + + +ffn_up-7 (view){2816, 7, 1, 1} n=2816 3.497948 + -0.0221 0.9779 1.9779..., 2812.9778 2813.9778 2814.9778 +CONT == +ffn_up-7 (view) (cont){2816, 7, 1, 1} n=2816 3.497948 + -0.0221 0.9779 1.9779..., 2812.9778 2813.9778 2814.9778 + + +node_339{2816, 7, 1, 1} n=2816 -0.248166 + 0.0017 1.0017 2.0017..., 
2813.0017 2814.0017 2815.0017 +ffn_up-7 (view) (cont){2816, 7, 1, 1} n=2816 3.497948 + -0.0221 0.9779 1.9779..., 2812.9778 2813.9778 2814.9778 +MUL == +ffn_mul-7{2816, 7, 1, 1} n=2816 -0.248166 + 0.0017 1.0017 2.0017..., 2813.0017 2814.0017 2815.0017 + + +blk.7.ffn_down.weight{2816, 1280, 1, 1} n=2816 -0.000002 + 0.0101 0.0101 0.0101..., 0.0648 0.0648 0.0649 +ffn_mul-7{2816, 7, 1, 1} n=2816 -0.248166 + 0.0017 1.0017 2.0017..., 2813.0017 2814.0017 2815.0017 +MUL_MAT == +ffn_out-7{1280, 7, 1, 1} n=2816 -0.733983 + 0.0082 1.0082 2.0082..., 1277.0082 1278.0082 1279.0082 + + +node_331{1280, 7, 1, 1} n=1280 -1433.416870 + -4.6376 -3.6376 -2.6376..., 1272.3624 1273.3624 1274.3624 +ffn_out-7{1280, 7, 1, 1} n=1280 -0.733983 + 0.0082 1.0082 2.0082..., 1277.0082 1278.0082 1279.0082 +ADD == +l_out-7{1280, 7, 1, 1} n=1280 -1433.416870 + -4.6376 -3.6376 -2.6376..., 1272.3624 1273.3624 1274.3624 + + +l_out-7{1280, 7, 1, 1} n=1280 -1433.416870 + -4.6376 -3.6376 -2.6376..., 1272.3624 1273.3624 1274.3624 +RMS_NORM == +norm-8{1280, 7, 1, 1} n=1280 -34.246212 + -0.1108 0.8892 1.8892..., 1276.8892 1277.8892 1278.8892 + + +norm-8{1280, 7, 1, 1} n=1280 -5.087716 + -0.0606 0.9394 1.9394..., 1276.9395 1277.9395 1278.9395 +blk.8.attn_norm.weight{1280, 1, 1, 1} n=1280 450.434143 + 0.5467 1.5467 2.5467..., 1277.5468 1278.5468 1279.5468 +MUL == +attn_norm-8{1280, 7, 1, 1} n=1280 -5.087716 + -0.0606 0.9394 1.9394..., 1276.9395 1277.9395 1278.9395 + + +blk.8.attn_qkv.weight{1280, 1536, 1, 1} n=1280 -0.000000 + 0.0707 0.0708 0.0709..., 0.1724 0.1725 0.1726 +attn_norm-8{1280, 7, 1, 1} n=1280 -5.087716 + -0.0606 0.9394 1.9394..., 1276.9395 1277.9395 1278.9395 +MUL_MAT == +wqkv-8{1536, 7, 1, 1} n=1280 -8.300745 + 0.0684 1.0684 2.0684..., 1533.0684 1534.0684 1535.0684 + + +wqkv-8{1536, 7, 1, 1} n=1536 -8.300745 + 0.0684 1.0684 2.0684..., 1533.0684 1534.0684 1535.0684 +VIEW == +wqkv-8 (view){64, 7, 16, 1} n=1536 -3.309173 + 0.0684 1.0684 2.0684..., 61.0684 62.0684 63.0684 + + +wqkv-8 (view){64, 7, 16, 
1} n=64 -3.309173 + 0.0684 1.0684 2.0684..., 61.0684 62.0684 63.0684 +CONT == +Qcur-8{64, 7, 16, 1} n=64 -3.309173 + 0.0684 1.0684 2.0684..., 61.0684 62.0684 63.0684 + + +Qcur-8{64, 7, 16, 1} n=64 -11.641029 + 0.2405 1.2405 2.2405..., 61.2405 62.2405 63.2405 +RMS_NORM == +norm-8{64, 7, 16, 1} n=64 -11.641029 + 0.2405 1.2405 2.2405..., 61.2405 62.2405 63.2405 + + +norm-8{64, 7, 16, 1} n=64 -5.791919 + 0.3599 1.3599 2.3599..., 61.3599 62.3599 63.3599 +blk.8.attn_q_norm.weight{64, 1, 1, 1} n=64 96.824348 + 1.4961 2.4961 3.4961..., 62.4961 63.4961 64.4961 +MUL == +Qcur-8{64, 7, 16, 1} n=64 -5.791919 + 0.3599 1.3599 2.3599..., 61.3599 62.3599 63.3599 + + +Qcur-8{64, 7, 16, 1} n=64 -5.791919 + 0.3599 1.3599 2.3599..., 61.3599 62.3599 63.3599 +RESHAPE == +Qcur-8 (reshaped){64, 16, 7, 1} n=64 -5.791919 + 0.3599 1.3599 2.3599..., 61.3599 62.3599 63.3599 + + +Qcur-8 (reshaped){64, 16, 7, 1} n=64 -5.791919 + 0.3599 1.3599 2.3599..., 61.3599 62.3599 63.3599 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-8{64, 16, 7, 1} n=64 -5.791919 + 0.3599 1.3599 2.3599..., 61.3599 62.3599 63.3599 + + +Qcur-8{64, 16, 7, 1} n=64 -0.723990 + 0.0450 1.0450 2.0450..., 61.0450 62.0450 63.0450 +SCALE == +Qcur-8{64, 16, 7, 1} n=64 -0.723990 + 0.0450 1.0450 2.0450..., 61.0450 62.0450 63.0450 + + +wqkv-8{1536, 7, 1, 1} n=1536 -8.300745 + 0.0684 1.0684 2.0684..., 1533.0684 1534.0684 1535.0684 +VIEW == +wqkv-8 (view){64, 7, 4, 1} n=1536 -2.219533 + 0.1768 1.1768 2.1768..., 61.1768 62.1768 63.1768 + + +wqkv-8 (view){64, 7, 4, 1} n=64 -2.219533 + 0.1768 1.1768 2.1768..., 61.1768 62.1768 63.1768 +CONT == +Kcur-8{64, 7, 4, 1} n=64 -2.219533 + 0.1768 1.1768 2.1768..., 61.1768 62.1768 63.1768 + + +Kcur-8{64, 7, 4, 1} n=64 -7.108442 + 0.5663 1.5663 2.5663..., 61.5663 62.5663 63.5663 +RMS_NORM == +norm-8{64, 7, 4, 1} n=64 -7.108442 + 0.5663 1.5663 2.5663..., 61.5663 62.5663 63.5663 + + +norm-8{64, 7, 4, 1} n=64 -1.145772 + 0.7045 1.7045 2.7045..., 61.7045 
62.7045 63.7045 +blk.8.attn_k_norm.weight{64, 1, 1, 1} n=64 104.032288 + 1.2442 2.2442 3.2442..., 62.2442 63.2442 64.2442 +MUL == +Kcur-8{64, 7, 4, 1} n=64 -1.145772 + 0.7045 1.7045 2.7045..., 61.7045 62.7045 63.7045 + + +Kcur-8{64, 7, 4, 1} n=64 -1.145772 + 0.7045 1.7045 2.7045..., 61.7045 62.7045 63.7045 +RESHAPE == +Kcur-8 (reshaped){64, 4, 7, 1} n=64 -1.145772 + 0.7045 1.7045 2.7045..., 61.7045 62.7045 63.7045 + + +Kcur-8 (reshaped){64, 4, 7, 1} n=64 -1.145772 + 0.7045 1.7045 2.7045..., 61.7045 62.7045 63.7045 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_360{64, 4, 7, 1} n=64 -1.145772 + 0.7045 1.7045 2.7045..., 61.7045 62.7045 63.7045 + + +node_360{64, 4, 7, 1} n=64 -1.145772 + 0.7045 1.7045 2.7045..., 61.7045 62.7045 63.7045 +REPEAT == +node_361{64, 4, 28, 1} n=64 -1.145772 + 0.7045 1.7045 2.7045..., 61.7045 62.7045 63.7045 + + +wqkv-8{1536, 7, 1, 1} n=1536 -8.300745 + 0.0684 1.0684 2.0684..., 1533.0684 1534.0684 1535.0684 +VIEW == +wqkv-8 (view){64, 7, 4, 1} n=1536 -0.512420 + -0.0296 0.9704 1.9704..., 60.9704 61.9704 62.9704 + + +wqkv-8 (view){64, 7, 4, 1} n=64 -0.512420 + -0.0296 0.9704 1.9704..., 60.9704 61.9704 62.9704 +CONT == +Vcur-8{64, 7, 4, 1} n=64 -0.512420 + -0.0296 0.9704 1.9704..., 60.9704 61.9704 62.9704 + + +Vcur-8{64, 7, 4, 1} n=64 -0.512420 + -0.0296 0.9704 1.9704..., 60.9704 61.9704 62.9704 +REPEAT == +node_364{64, 7, 16, 1} n=64 -0.512420 + -0.0296 0.9704 1.9704..., 60.9704 61.9704 62.9704 + + +cache_v_l8{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-8{32, 64, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l8{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-8{64, 32, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-8{64, 16, 7, 1} n=64 -0.723990 + 0.0450 1.0450 2.0450..., 61.0450 62.0450 63.0450 +PERMUTE == +q-8{64, 7, 16, 
1} n=64 -0.723990 + 0.0450 1.0450 2.0450..., 61.0450 62.0450 63.0450 + + +k-8{64, 32, 8, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-8{64, 7, 16, 1} n=64 -0.723990 + 0.0450 1.0450 2.0450..., 61.0450 62.0450 63.0450 +MUL_MAT == +kq-8{32, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-8{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-8{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-8{32, 64, 8, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-8{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-8{64, 7, 16, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-8{64, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-8{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-8{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-8{1024, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 + + +blk.8.attn_output.weight{1024, 1280, 1, 1} n=1024 -0.000000 + 0.1165 0.1166 0.1166..., 0.2327 0.2328 0.2329 +kqv_merged_cont-8{1024, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 +MUL_MAT == +kqv_out-8{1280, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-8{1280, 7, 1, 1} n=1280 -1433.416870 + -4.6376 -3.6376 -2.6376..., 1272.3624 1273.3624 1274.3624 +l_out-7{1280, 7, 1, 1} n=1280 -1433.416870 + -4.6376 -3.6376 -2.6376..., 1272.3624 1273.3624 1274.3624 +ADD == +node_374{1280, 7, 1, 1} n=1280 -1433.416870 + -4.6376 -3.6376 -2.6376..., 1272.3624 1273.3624 1274.3624 + + +node_374{1280, 7, 1, 1} n=1280 
-1433.416870 + -4.6376 -3.6376 -2.6376..., 1272.3624 1273.3624 1274.3624 +RMS_NORM == +norm-8{1280, 7, 1, 1} n=1280 -34.246212 + -0.1108 0.8892 1.8892..., 1276.8892 1277.8892 1278.8892 + + +norm-8{1280, 7, 1, 1} n=1280 -7.488029 + -0.0270 0.9730 1.9730..., 1276.9730 1277.9730 1278.9730 +blk.8.ffn_norm.weight{1280, 1, 1, 1} n=1280 510.573212 + 0.2436 1.2436 2.2436..., 1277.2437 1278.2437 1279.2437 +MUL == +ffn_norm-8{1280, 7, 1, 1} n=1280 -7.488029 + -0.0270 0.9730 1.9730..., 1276.9730 1277.9730 1278.9730 + + +ffn_norm-8{1280, 7, 1, 1} n=1280 -33.644764 + -0.1213 0.8787 1.8787..., 1276.8788 1277.8788 1278.8788 +RMS_NORM == +norm-8{1280, 7, 1, 1} n=1280 -33.644764 + -0.1213 0.8787 1.8787..., 1276.8788 1277.8788 1278.8788 + + +norm-8{1280, 7, 1, 1} n=1280 -7.314843 + -0.0295 0.9705 1.9705..., 1276.9705 1277.9705 1278.9705 +blk.8.ffn_norm.weight{1280, 1, 1, 1} n=1280 510.573212 + 0.2436 1.2436 2.2436..., 1277.2437 1278.2437 1279.2437 +MUL == +ffn_norm-8{1280, 7, 1, 1} n=1280 -7.314843 + -0.0295 0.9705 1.9705..., 1276.9705 1277.9705 1278.9705 + + +blk.8.ffn_up.weight{1280, 6144, 1, 1} n=1280 0.000000 + -0.0008 -0.0008 -0.0008..., -0.0019 -0.0019 -0.0019 +ffn_norm-8{1280, 7, 1, 1} n=1280 -7.314843 + -0.0295 0.9705 1.9705..., 1276.9705 1277.9705 1278.9705 +MUL_MAT == +ffn_silu-8{6144, 7, 1, 1} n=1280 -1035.371460 + -0.3658 0.6342 1.6342..., 6140.6343 6141.6343 6142.6343 + + +ffn_silu-8{6144, 7, 1, 1} n=6144 -1035.371460 + -0.3658 0.6342 1.6342..., 6140.6343 6141.6343 6142.6343 +VIEW == +ffn_up-8 (view){3072, 7, 1, 1} n=6144 -1027.773438 + -0.3658 0.6342 1.6342..., 3068.6343 3069.6343 3070.6343 + + +ffn_up-8 (view){3072, 7, 1, 1} n=3072 -1027.773438 + -0.3658 0.6342 1.6342..., 3068.6343 3069.6343 3070.6343 +CONT == +ffn_up-8 (view) (cont){3072, 7, 1, 1} n=3072 -1027.773438 + -0.3658 0.6342 1.6342..., 3068.6343 3069.6343 3070.6343 + + +ffn_up-8 (view) (cont){3072, 7, 1, 1} n=3072 -376.782501 + -0.1498 0.8502 1.8502..., 3068.8501 3069.8501 3070.8501 +SILU == +node_382{3072, 
7, 1, 1} n=3072 -376.782501 + -0.1498 0.8502 1.8502..., 3068.8501 3069.8501 3070.8501 + + +ffn_silu-8{6144, 7, 1, 1} n=6144 -1035.371460 + -0.3658 0.6342 1.6342..., 6140.6343 6141.6343 6142.6343 +VIEW == +ffn_up-8 (view){3072, 7, 1, 1} n=6144 -7.597636 + -0.1632 0.8368 1.8368..., 3068.8367 3069.8367 3070.8367 + + +ffn_up-8 (view){3072, 7, 1, 1} n=3072 -7.597636 + -0.1632 0.8368 1.8368..., 3068.8367 3069.8367 3070.8367 +CONT == +ffn_up-8 (view) (cont){3072, 7, 1, 1} n=3072 -7.597636 + -0.1632 0.8368 1.8368..., 3068.8367 3069.8367 3070.8367 + + +node_382{3072, 7, 1, 1} n=3072 0.255748 + 0.0244 1.0244 2.0244..., 3069.0244 3070.0244 3071.0244 +ffn_up-8 (view) (cont){3072, 7, 1, 1} n=3072 -7.597636 + -0.1632 0.8368 1.8368..., 3068.8367 3069.8367 3070.8367 +MUL == +ffn_mul-8{3072, 7, 1, 1} n=3072 0.255748 + 0.0244 1.0244 2.0244..., 3069.0244 3070.0244 3071.0244 + + +blk.8.ffn_down.weight{3072, 1280, 1, 1} n=3072 -0.000000 + 0.0502 0.0502 0.0503..., 0.4009 0.4011 0.4014 +ffn_mul-8{3072, 7, 1, 1} n=3072 0.255748 + 0.0244 1.0244 2.0244..., 3069.0244 3070.0244 3071.0244 +MUL_MAT == +ffn_out-8{1280, 7, 1, 1} n=3072 2.569246 + 0.0144 1.0144 2.0144..., 1277.0144 1278.0144 1279.0144 + + +node_374{1280, 7, 1, 1} n=1280 -1430.849243 + -4.6232 -3.6232 -2.6232..., 1272.3768 1273.3768 1274.3768 +ffn_out-8{1280, 7, 1, 1} n=1280 2.569246 + 0.0144 1.0144 2.0144..., 1277.0144 1278.0144 1279.0144 +ADD == +l_out-8{1280, 7, 1, 1} n=1280 -1430.849243 + -4.6232 -3.6232 -2.6232..., 1272.3768 1273.3768 1274.3768 + + +l_out-8{1280, 7, 1, 1} n=1280 -1430.849243 + -4.6232 -3.6232 -2.6232..., 1272.3768 1273.3768 1274.3768 +RMS_NORM == +norm-9{1280, 7, 1, 1} n=1280 -34.190929 + -0.1105 0.8895 1.8895..., 1276.8895 1277.8895 1278.8895 + + +norm-9{1280, 7, 1, 1} n=1280 -7.555387 + -0.0910 0.9090 1.9090..., 1276.9089 1277.9089 1278.9089 +blk.9.attn_norm.weight{1280, 1, 1, 1} n=1280 663.335632 + 0.8242 1.8242 2.8242..., 1277.8242 1278.8242 1279.8242 +MUL == +attn_norm-9{1280, 7, 1, 1} n=1280 -7.555387 + 
-0.0910 0.9090 1.9090..., 1276.9089 1277.9089 1278.9089 + + +blk.9.attn_qkv.weight{1280, 1536, 1, 1} n=1280 -0.000001 + -0.1013 -0.1013 -0.1014..., -0.2334 -0.2335 -0.2336 +attn_norm-9{1280, 7, 1, 1} n=1280 -7.555387 + -0.0910 0.9090 1.9090..., 1276.9089 1277.9089 1278.9089 +MUL_MAT == +wqkv-9{1536, 7, 1, 1} n=1280 -1.128011 + 1.3548 2.3548 3.3548..., 1534.3547 1535.3547 1536.3547 + + +wqkv-9{1536, 7, 1, 1} n=1536 -1.128011 + 1.3548 2.3548 3.3548..., 1534.3547 1535.3547 1536.3547 +VIEW == +wqkv-9 (view){64, 7, 16, 1} n=1536 -2.533897 + 1.3548 2.3548 3.3548..., 62.3548 63.3548 64.3548 + + +wqkv-9 (view){64, 7, 16, 1} n=64 -2.533897 + 1.3548 2.3548 3.3548..., 62.3548 63.3548 64.3548 +CONT == +Qcur-9{64, 7, 16, 1} n=64 -2.533897 + 1.3548 2.3548 3.3548..., 62.3548 63.3548 64.3548 + + +Qcur-9{64, 7, 16, 1} n=64 -4.981896 + 2.6636 3.6636 4.6636..., 63.6636 64.6636 65.6636 +RMS_NORM == +norm-9{64, 7, 16, 1} n=64 -4.981896 + 2.6636 3.6636 4.6636..., 63.6636 64.6636 65.6636 + + +norm-9{64, 7, 16, 1} n=64 -5.988206 + -0.7851 0.2149 1.2149..., 60.2149 61.2149 62.2149 +blk.9.attn_q_norm.weight{64, 1, 1, 1} n=64 96.932426 + -0.2947 0.7053 1.7053..., 60.7053 61.7053 62.7053 +MUL == +Qcur-9{64, 7, 16, 1} n=64 -5.988206 + -0.7851 0.2149 1.2149..., 60.2149 61.2149 62.2149 + + +Qcur-9{64, 7, 16, 1} n=64 -5.988206 + -0.7851 0.2149 1.2149..., 60.2149 61.2149 62.2149 +RESHAPE == +Qcur-9 (reshaped){64, 16, 7, 1} n=64 -5.988206 + -0.7851 0.2149 1.2149..., 60.2149 61.2149 62.2149 + + +Qcur-9 (reshaped){64, 16, 7, 1} n=64 -5.988206 + -0.7851 0.2149 1.2149..., 60.2149 61.2149 62.2149 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-9{64, 16, 7, 1} n=64 -5.988206 + -0.7851 0.2149 1.2149..., 60.2149 61.2149 62.2149 + + +Qcur-9{64, 16, 7, 1} n=64 -0.748526 + -0.0981 0.9019 1.9019..., 60.9019 61.9019 62.9019 +SCALE == +Qcur-9{64, 16, 7, 1} n=64 -0.748526 + -0.0981 0.9019 1.9019..., 60.9019 61.9019 62.9019 + + +wqkv-9{1536, 7, 1, 1} n=1536 
-1.128011 + 1.3548 2.3548 3.3548..., 1534.3547 1535.3547 1536.3547 +VIEW == +wqkv-9 (view){64, 7, 4, 1} n=1536 1.520555 + 0.1296 1.1296 2.1296..., 61.1296 62.1296 63.1296 + + +wqkv-9 (view){64, 7, 4, 1} n=64 1.520555 + 0.1296 1.1296 2.1296..., 61.1296 62.1296 63.1296 +CONT == +Kcur-9{64, 7, 4, 1} n=64 1.520555 + 0.1296 1.1296 2.1296..., 61.1296 62.1296 63.1296 + + +Kcur-9{64, 7, 4, 1} n=64 2.269967 + 0.1935 1.1935 2.1935..., 61.1935 62.1935 63.1935 +RMS_NORM == +norm-9{64, 7, 4, 1} n=64 2.269967 + 0.1935 1.1935 2.1935..., 61.1935 62.1935 63.1935 + + +norm-9{64, 7, 4, 1} n=64 1.364606 + 0.0579 1.0579 2.0579..., 61.0579 62.0579 63.0579 +blk.9.attn_k_norm.weight{64, 1, 1, 1} n=64 99.619080 + 0.2990 1.2990 2.2990..., 61.2990 62.2990 63.2990 +MUL == +Kcur-9{64, 7, 4, 1} n=64 1.364606 + 0.0579 1.0579 2.0579..., 61.0579 62.0579 63.0579 + + +Kcur-9{64, 7, 4, 1} n=64 1.364606 + 0.0579 1.0579 2.0579..., 61.0579 62.0579 63.0579 +RESHAPE == +Kcur-9 (reshaped){64, 4, 7, 1} n=64 1.364606 + 0.0579 1.0579 2.0579..., 61.0579 62.0579 63.0579 + + +Kcur-9 (reshaped){64, 4, 7, 1} n=64 1.364606 + 0.0579 1.0579 2.0579..., 61.0579 62.0579 63.0579 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_403{64, 4, 7, 1} n=64 1.364606 + 0.0579 1.0579 2.0579..., 61.0579 62.0579 63.0579 + + +node_403{64, 4, 7, 1} n=64 1.364606 + 0.0579 1.0579 2.0579..., 61.0579 62.0579 63.0579 +REPEAT == +node_404{64, 4, 28, 1} n=64 1.364606 + 0.0579 1.0579 2.0579..., 61.0579 62.0579 63.0579 + + +wqkv-9{1536, 7, 1, 1} n=1536 -1.128011 + 1.3548 2.3548 3.3548..., 1534.3547 1535.3547 1536.3547 +VIEW == +wqkv-9 (view){64, 7, 4, 1} n=1536 0.398395 + -1.3851 -0.3851 0.6149..., 59.6149 60.6149 61.6149 + + +wqkv-9 (view){64, 7, 4, 1} n=64 0.398395 + -1.3851 -0.3851 0.6149..., 59.6149 60.6149 61.6149 +CONT == +Vcur-9{64, 7, 4, 1} n=64 0.398395 + -1.3851 -0.3851 0.6149..., 59.6149 60.6149 61.6149 + + +Vcur-9{64, 7, 4, 1} n=64 0.398395 + -1.3851 -0.3851 0.6149..., 59.6149 60.6149 
61.6149 +REPEAT == +node_407{64, 7, 16, 1} n=64 0.398395 + -1.3851 -0.3851 0.6149..., 59.6149 60.6149 61.6149 + + +cache_v_l9{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-9{32, 64, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l9{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-9{64, 32, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-9{64, 16, 7, 1} n=64 -0.748526 + -0.0981 0.9019 1.9019..., 60.9019 61.9019 62.9019 +PERMUTE == +q-9{64, 7, 16, 1} n=64 -0.748526 + -0.0981 0.9019 1.9019..., 60.9019 61.9019 62.9019 + + +k-9{64, 32, 8, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-9{64, 7, 16, 1} n=64 -0.748526 + -0.0981 0.9019 1.9019..., 60.9019 61.9019 62.9019 +MUL_MAT == +kq-9{32, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-9{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-9{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-9{32, 64, 8, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-9{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-9{64, 7, 16, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-9{64, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-9{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-9{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-9{1024, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 + + +blk.9.attn_output.weight{1024, 1280, 1, 1} n=1024 0.000465 + -0.0778 
-0.0778 -0.0779..., -0.1552 -0.1553 -0.1554 +kqv_merged_cont-9{1024, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 +MUL_MAT == +kqv_out-9{1280, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-9{1280, 7, 1, 1} n=1280 -1430.849243 + -4.6232 -3.6232 -2.6232..., 1272.3768 1273.3768 1274.3768 +l_out-8{1280, 7, 1, 1} n=1280 -1430.849243 + -4.6232 -3.6232 -2.6232..., 1272.3768 1273.3768 1274.3768 +ADD == +node_417{1280, 7, 1, 1} n=1280 -1430.849243 + -4.6232 -3.6232 -2.6232..., 1272.3768 1273.3768 1274.3768 + + +node_417{1280, 7, 1, 1} n=1280 -1430.849243 + -4.6232 -3.6232 -2.6232..., 1272.3768 1273.3768 1274.3768 +RMS_NORM == +norm-9{1280, 7, 1, 1} n=1280 -34.190929 + -0.1105 0.8895 1.8895..., 1276.8895 1277.8895 1278.8895 + + +norm-9{1280, 7, 1, 1} n=1280 -6.004551 + -0.0297 0.9703 1.9703..., 1276.9703 1277.9703 1278.9703 +blk.9.ffn_norm.weight{1280, 1, 1, 1} n=1280 539.591492 + 0.2690 1.2690 2.2690..., 1277.2690 1278.2690 1279.2690 +MUL == +ffn_norm-9{1280, 7, 1, 1} n=1280 -6.004551 + -0.0297 0.9703 1.9703..., 1276.9703 1277.9703 1278.9703 + + +ffn_norm-9{1280, 7, 1, 1} n=1280 -33.038673 + -0.1635 0.8365 1.8365..., 1276.8364 1277.8364 1278.8364 +RMS_NORM == +norm-9{1280, 7, 1, 1} n=1280 -33.038673 + -0.1635 0.8365 1.8365..., 1276.8364 1277.8364 1278.8364 + + +norm-9{1280, 7, 1, 1} n=1280 -5.751181 + -0.0440 0.9560 1.9560..., 1276.9561 1277.9561 1278.9561 +blk.9.ffn_norm.weight{1280, 1, 1, 1} n=1280 539.591492 + 0.2690 1.2690 2.2690..., 1277.2690 1278.2690 1279.2690 +MUL == +ffn_norm-9{1280, 7, 1, 1} n=1280 -5.751181 + -0.0440 0.9560 1.9560..., 1276.9561 1277.9561 1278.9561 + + +blk.9.ffn_up.weight{1280, 6656, 1, 1} n=1280 0.000000 + -0.0218 -0.0218 -0.0218..., -0.0513 -0.0513 -0.0514 +ffn_norm-9{1280, 7, 1, 1} n=1280 -5.751181 + -0.0440 0.9560 1.9560..., 1276.9561 1277.9561 1278.9561 +MUL_MAT == +ffn_silu-9{6656, 7, 1, 1} n=1280 -596.909119 + 0.0101 1.0101 2.0101..., 6653.0103 6654.0103 
6655.0103 + + +ffn_silu-9{6656, 7, 1, 1} n=6656 -596.909119 + 0.0101 1.0101 2.0101..., 6653.0103 6654.0103 6655.0103 +VIEW == +ffn_up-9 (view){3328, 7, 1, 1} n=6656 -602.315552 + 0.0101 1.0101 2.0101..., 3325.0100 3326.0100 3327.0100 + + +ffn_up-9 (view){3328, 7, 1, 1} n=3328 -602.315552 + 0.0101 1.0101 2.0101..., 3325.0100 3326.0100 3327.0100 +CONT == +ffn_up-9 (view) (cont){3328, 7, 1, 1} n=3328 -602.315552 + 0.0101 1.0101 2.0101..., 3325.0100 3326.0100 3327.0100 + + +ffn_up-9 (view) (cont){3328, 7, 1, 1} n=3328 -221.604706 + 0.0051 1.0051 2.0051..., 3325.0051 3326.0051 3327.0051 +SILU == +node_425{3328, 7, 1, 1} n=3328 -221.604706 + 0.0051 1.0051 2.0051..., 3325.0051 3326.0051 3327.0051 + + +ffn_silu-9{6656, 7, 1, 1} n=6656 -596.909119 + 0.0101 1.0101 2.0101..., 6653.0103 6654.0103 6655.0103 +VIEW == +ffn_up-9 (view){3328, 7, 1, 1} n=6656 5.406466 + 0.0003 1.0003 2.0003..., 3325.0002 3326.0002 3327.0002 + + +ffn_up-9 (view){3328, 7, 1, 1} n=3328 5.406466 + 0.0003 1.0003 2.0003..., 3325.0002 3326.0002 3327.0002 +CONT == +ffn_up-9 (view) (cont){3328, 7, 1, 1} n=3328 5.406466 + 0.0003 1.0003 2.0003..., 3325.0002 3326.0002 3327.0002 + + +node_425{3328, 7, 1, 1} n=3328 0.462783 + 0.0000 1.0000 2.0000..., 3325.0000 3326.0000 3327.0000 +ffn_up-9 (view) (cont){3328, 7, 1, 1} n=3328 5.406466 + 0.0003 1.0003 2.0003..., 3325.0002 3326.0002 3327.0002 +MUL == +ffn_mul-9{3328, 7, 1, 1} n=3328 0.462783 + 0.0000 1.0000 2.0000..., 3325.0000 3326.0000 3327.0000 + + +blk.9.ffn_down.weight{3328, 1280, 1, 1} n=3328 -0.000053 + -0.0351 -0.0351 -0.0351..., -0.3423 -0.3425 -0.3428 +ffn_mul-9{3328, 7, 1, 1} n=3328 0.462783 + 0.0000 1.0000 2.0000..., 3325.0000 3326.0000 3327.0000 +MUL_MAT == +ffn_out-9{1280, 7, 1, 1} n=3328 -3.856762 + 0.0904 1.0904 2.0904..., 1277.0903 1278.0903 1279.0903 + + +node_417{1280, 7, 1, 1} n=1280 -1434.705078 + -4.5328 -3.5328 -2.5328..., 1272.4672 1273.4672 1274.4672 +ffn_out-9{1280, 7, 1, 1} n=1280 -3.856762 + 0.0904 1.0904 2.0904..., 1277.0903 1278.0903 
1279.0903 +ADD == +l_out-9{1280, 7, 1, 1} n=1280 -1434.705078 + -4.5328 -3.5328 -2.5328..., 1272.4672 1273.4672 1274.4672 + + +l_out-9{1280, 7, 1, 1} n=1280 -1434.705078 + -4.5328 -3.5328 -2.5328..., 1272.4672 1273.4672 1274.4672 +RMS_NORM == +norm-10{1280, 7, 1, 1} n=1280 -34.291115 + -0.1083 0.8917 1.8917..., 1276.8916 1277.8916 1278.8916 + + +norm-10{1280, 7, 1, 1} n=1280 -6.206045 + -0.1069 0.8931 1.8931..., 1276.8931 1277.8931 1278.8931 +blk.10.attn_norm.weight{1280, 1, 1, 1} n=1280 653.635620 + 0.9870 1.9870 2.9870..., 1277.9871 1278.9871 1279.9871 +MUL == +attn_norm-10{1280, 7, 1, 1} n=1280 -6.206045 + -0.1069 0.8931 1.8931..., 1276.8931 1277.8931 1278.8931 + + +blk.10.attn_qkv.weight{1280, 1536, 1, 1} n=1280 -0.000001 + -0.0013 -0.0013 -0.0013..., -0.0030 -0.0030 -0.0030 +attn_norm-10{1280, 7, 1, 1} n=1280 -6.206045 + -0.1069 0.8931 1.8931..., 1276.8931 1277.8931 1278.8931 +MUL_MAT == +wqkv-10{1536, 7, 1, 1} n=1280 9.779473 + -1.5060 -0.5060 0.4940..., 1531.4939 1532.4939 1533.4939 + + +wqkv-10{1536, 7, 1, 1} n=1536 9.779473 + -1.5060 -0.5060 0.4940..., 1531.4939 1532.4939 1533.4939 +VIEW == +wqkv-10 (view){64, 7, 16, 1} n=1536 -2.900037 + -1.5060 -0.5060 0.4940..., 59.4940 60.4940 61.4940 + + +wqkv-10 (view){64, 7, 16, 1} n=64 -2.900037 + -1.5060 -0.5060 0.4940..., 59.4940 60.4940 61.4940 +CONT == +Qcur-10{64, 7, 16, 1} n=64 -2.900037 + -1.5060 -0.5060 0.4940..., 59.4940 60.4940 61.4940 + + +Qcur-10{64, 7, 16, 1} n=64 -6.142516 + -3.1899 -2.1899 -1.1899..., 57.8101 58.8101 59.8101 +RMS_NORM == +norm-10{64, 7, 16, 1} n=64 -6.142516 + -3.1899 -2.1899 -1.1899..., 57.8101 58.8101 59.8101 + + +norm-10{64, 7, 16, 1} n=64 4.116167 + -0.1414 0.8586 1.8586..., 60.8586 61.8586 62.8586 +blk.10.attn_q_norm.weight{64, 1, 1, 1} n=64 120.929543 + 0.0443 1.0443 2.0443..., 61.0443 62.0443 63.0443 +MUL == +Qcur-10{64, 7, 16, 1} n=64 4.116167 + -0.1414 0.8586 1.8586..., 60.8586 61.8586 62.8586 + + +Qcur-10{64, 7, 16, 1} n=64 4.116167 + -0.1414 0.8586 1.8586..., 60.8586 
61.8586 62.8586 +RESHAPE == +Qcur-10 (reshaped){64, 16, 7, 1} n=64 4.116167 + -0.1414 0.8586 1.8586..., 60.8586 61.8586 62.8586 + + +Qcur-10 (reshaped){64, 16, 7, 1} n=64 4.116167 + -0.1414 0.8586 1.8586..., 60.8586 61.8586 62.8586 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-10{64, 16, 7, 1} n=64 4.116167 + -0.1414 0.8586 1.8586..., 60.8586 61.8586 62.8586 + + +Qcur-10{64, 16, 7, 1} n=64 0.514521 + -0.0177 0.9823 1.9823..., 60.9823 61.9823 62.9823 +SCALE == +Qcur-10{64, 16, 7, 1} n=64 0.514521 + -0.0177 0.9823 1.9823..., 60.9823 61.9823 62.9823 + + +wqkv-10{1536, 7, 1, 1} n=1536 9.779473 + -1.5060 -0.5060 0.4940..., 1531.4939 1532.4939 1533.4939 +VIEW == +wqkv-10 (view){64, 7, 4, 1} n=1536 1.923896 + 0.1892 1.1892 2.1892..., 61.1892 62.1892 63.1892 + + +wqkv-10 (view){64, 7, 4, 1} n=64 1.923896 + 0.1892 1.1892 2.1892..., 61.1892 62.1892 63.1892 +CONT == +Kcur-10{64, 7, 4, 1} n=64 1.923896 + 0.1892 1.1892 2.1892..., 61.1892 62.1892 63.1892 + + +Kcur-10{64, 7, 4, 1} n=64 4.982111 + 0.4899 1.4899 2.4899..., 61.4899 62.4899 63.4899 +RMS_NORM == +norm-10{64, 7, 4, 1} n=64 4.982111 + 0.4899 1.4899 2.4899..., 61.4899 62.4899 63.4899 + + +norm-10{64, 7, 4, 1} n=64 -0.401568 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 +blk.10.attn_k_norm.weight{64, 1, 1, 1} n=64 121.737457 + 0.0050 1.0050 2.0050..., 61.0050 62.0050 63.0050 +MUL == +Kcur-10{64, 7, 4, 1} n=64 -0.401568 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 + + +Kcur-10{64, 7, 4, 1} n=64 -0.401568 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 +RESHAPE == +Kcur-10 (reshaped){64, 4, 7, 1} n=64 -0.401568 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 + + +Kcur-10 (reshaped){64, 4, 7, 1} n=64 -0.401568 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_446{64, 4, 7, 1} n=64 -0.401568 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 + + +node_446{64, 
4, 7, 1} n=64 -0.401568 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 +REPEAT == +node_447{64, 4, 28, 1} n=64 -0.401568 + 0.0024 1.0024 2.0024..., 61.0024 62.0024 63.0024 + + +wqkv-10{1536, 7, 1, 1} n=1536 9.779473 + -1.5060 -0.5060 0.4940..., 1531.4939 1532.4939 1533.4939 +VIEW == +wqkv-10 (view){64, 7, 4, 1} n=1536 1.359704 + 0.0501 1.0501 2.0501..., 61.0501 62.0501 63.0501 + + +wqkv-10 (view){64, 7, 4, 1} n=64 1.359704 + 0.0501 1.0501 2.0501..., 61.0501 62.0501 63.0501 +CONT == +Vcur-10{64, 7, 4, 1} n=64 1.359704 + 0.0501 1.0501 2.0501..., 61.0501 62.0501 63.0501 + + +Vcur-10{64, 7, 4, 1} n=64 1.359704 + 0.0501 1.0501 2.0501..., 61.0501 62.0501 63.0501 +REPEAT == +node_450{64, 7, 16, 1} n=64 1.359704 + 0.0501 1.0501 2.0501..., 61.0501 62.0501 63.0501 + + +cache_v_l10{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-10{32, 64, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l10{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-10{64, 32, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-10{64, 16, 7, 1} n=64 0.514521 + -0.0177 0.9823 1.9823..., 60.9823 61.9823 62.9823 +PERMUTE == +q-10{64, 7, 16, 1} n=64 0.514521 + -0.0177 0.9823 1.9823..., 60.9823 61.9823 62.9823 + + +k-10{64, 32, 8, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-10{64, 7, 16, 1} n=64 0.514521 + -0.0177 0.9823 1.9823..., 60.9823 61.9823 62.9823 +MUL_MAT == +kq-10{32, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-10{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-10{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-10{32, 64, 8, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 
+kq_soft_max_ext-10{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-10{64, 7, 16, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-10{64, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-10{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-10{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-10{1024, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 + + +blk.10.attn_output.weight{1024, 1280, 1, 1} n=1024 -0.000000 + 0.0238 0.0238 0.0238..., 0.0474 0.0475 0.0475 +kqv_merged_cont-10{1024, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 +MUL_MAT == +kqv_out-10{1280, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-10{1280, 7, 1, 1} n=1280 -1434.705078 + -4.5328 -3.5328 -2.5328..., 1272.4672 1273.4672 1274.4672 +l_out-9{1280, 7, 1, 1} n=1280 -1434.705078 + -4.5328 -3.5328 -2.5328..., 1272.4672 1273.4672 1274.4672 +ADD == +node_460{1280, 7, 1, 1} n=1280 -1434.705078 + -4.5328 -3.5328 -2.5328..., 1272.4672 1273.4672 1274.4672 + + +node_460{1280, 7, 1, 1} n=1280 -1434.705078 + -4.5328 -3.5328 -2.5328..., 1272.4672 1273.4672 1274.4672 +RMS_NORM == +norm-10{1280, 7, 1, 1} n=1280 -34.291115 + -0.1083 0.8917 1.8917..., 1276.8916 1277.8916 1278.8916 + + +norm-10{1280, 7, 1, 1} n=1280 -8.715087 + -0.0298 0.9702 1.9702..., 1276.9702 1277.9702 1278.9702 +blk.10.ffn_norm.weight{1280, 1, 1, 1} n=1280 590.816284 + 0.2753 1.2753 2.2753..., 1277.2753 1278.2753 1279.2753 +MUL == +ffn_norm-10{1280, 7, 1, 1} n=1280 -8.715087 + -0.0298 0.9702 1.9702..., 1276.9702 1277.9702 1278.9702 + + +ffn_norm-10{1280, 7, 1, 1} n=1280 -33.325222 + -0.1141 0.8859 1.8859..., 1276.8860 1277.8860 1278.8860 +RMS_NORM == +norm-10{1280, 7, 1, 1} n=1280 -33.325222 + -0.1141 0.8859 
1.8859..., 1276.8860 1277.8860 1278.8860 + + +norm-10{1280, 7, 1, 1} n=1280 -8.452316 + -0.0314 0.9686 1.9686..., 1276.9686 1277.9686 1278.9686 +blk.10.ffn_norm.weight{1280, 1, 1, 1} n=1280 590.816284 + 0.2753 1.2753 2.2753..., 1277.2753 1278.2753 1279.2753 +MUL == +ffn_norm-10{1280, 7, 1, 1} n=1280 -8.452316 + -0.0314 0.9686 1.9686..., 1276.9686 1277.9686 1278.9686 + + +blk.10.ffn_up.weight{1280, 7168, 1, 1} n=1280 0.000000 + 0.0598 0.0598 0.0598..., 0.1449 0.1450 0.1451 +ffn_norm-10{1280, 7, 1, 1} n=1280 -8.452316 + -0.0314 0.9686 1.9686..., 1276.9686 1277.9686 1278.9686 +MUL_MAT == +ffn_silu-10{7168, 7, 1, 1} n=1280 -904.412903 + 0.2033 1.2033 2.2033..., 7165.2031 7166.2031 7167.2031 + + +ffn_silu-10{7168, 7, 1, 1} n=7168 -904.412903 + 0.2033 1.2033 2.2033..., 7165.2031 7166.2031 7167.2031 +VIEW == +ffn_up-10 (view){3584, 7, 1, 1} n=7168 -903.663696 + 0.2033 1.2033 2.2033..., 3581.2034 3582.2034 3583.2034 + + +ffn_up-10 (view){3584, 7, 1, 1} n=3584 -903.663696 + 0.2033 1.2033 2.2033..., 3581.2034 3582.2034 3583.2034 +CONT == +ffn_up-10 (view) (cont){3584, 7, 1, 1} n=3584 -903.663696 + 0.2033 1.2033 2.2033..., 3581.2034 3582.2034 3583.2034 + + +ffn_up-10 (view) (cont){3584, 7, 1, 1} n=3584 -289.262634 + 0.1119 1.1119 2.1119..., 3581.1118 3582.1118 3583.1118 +SILU == +node_468{3584, 7, 1, 1} n=3584 -289.262634 + 0.1119 1.1119 2.1119..., 3581.1118 3582.1118 3583.1118 + + +ffn_silu-10{7168, 7, 1, 1} n=7168 -904.412903 + 0.2033 1.2033 2.2033..., 7165.2031 7166.2031 7167.2031 +VIEW == +ffn_up-10 (view){3584, 7, 1, 1} n=7168 -0.747298 + 0.1521 1.1521 2.1521..., 3581.1521 3582.1521 3583.1521 + + +ffn_up-10 (view){3584, 7, 1, 1} n=3584 -0.747298 + 0.1521 1.1521 2.1521..., 3581.1521 3582.1521 3583.1521 +CONT == +ffn_up-10 (view) (cont){3584, 7, 1, 1} n=3584 -0.747298 + 0.1521 1.1521 2.1521..., 3581.1521 3582.1521 3583.1521 + + +node_468{3584, 7, 1, 1} n=3584 0.966810 + 0.0170 1.0170 2.0170..., 3581.0171 3582.0171 3583.0171 +ffn_up-10 (view) (cont){3584, 7, 1, 1} n=3584 
-0.747298 + 0.1521 1.1521 2.1521..., 3581.1521 3582.1521 3583.1521 +MUL == +ffn_mul-10{3584, 7, 1, 1} n=3584 0.966810 + 0.0170 1.0170 2.0170..., 3581.0171 3582.0171 3583.0171 + + +blk.10.ffn_down.weight{3584, 1280, 1, 1} n=3584 0.000001 + -0.0739 -0.0740 -0.0740..., -0.8398 -0.8403 -0.8408 +ffn_mul-10{3584, 7, 1, 1} n=3584 0.966810 + 0.0170 1.0170 2.0170..., 3581.0171 3582.0171 3583.0171 +MUL_MAT == +ffn_out-10{1280, 7, 1, 1} n=3584 -7.708157 + -0.0212 0.9788 1.9788..., 1276.9788 1277.9788 1278.9788 + + +node_460{1280, 7, 1, 1} n=1280 -1442.413330 + -4.5540 -3.5540 -2.5540..., 1272.4460 1273.4460 1274.4460 +ffn_out-10{1280, 7, 1, 1} n=1280 -7.708157 + -0.0212 0.9788 1.9788..., 1276.9788 1277.9788 1278.9788 +ADD == +l_out-10{1280, 7, 1, 1} n=1280 -1442.413330 + -4.5540 -3.5540 -2.5540..., 1272.4460 1273.4460 1274.4460 + + +l_out-10{1280, 7, 1, 1} n=1280 -1442.413330 + -4.5540 -3.5540 -2.5540..., 1272.4460 1273.4460 1274.4460 +RMS_NORM == +norm-11{1280, 7, 1, 1} n=1280 -34.475323 + -0.1088 0.8912 1.8912..., 1276.8911 1277.8911 1278.8911 + + +norm-11{1280, 7, 1, 1} n=1280 -7.591217 + -0.1388 0.8612 1.8612..., 1276.8612 1277.8612 1278.8612 +blk.11.attn_norm.weight{1280, 1, 1, 1} n=1280 934.029053 + 1.2751 2.2751 3.2751..., 1278.2751 1279.2751 1280.2751 +MUL == +attn_norm-11{1280, 7, 1, 1} n=1280 -7.591217 + -0.1388 0.8612 1.8612..., 1276.8612 1277.8612 1278.8612 + + +blk.11.attn_qkv.weight{1280, 1536, 1, 1} n=1280 0.000002 + 0.0773 0.0773 0.0774..., 0.1854 0.1855 0.1857 +attn_norm-11{1280, 7, 1, 1} n=1280 -7.591217 + -0.1388 0.8612 1.8612..., 1276.8612 1277.8612 1278.8612 +MUL_MAT == +wqkv-11{1536, 7, 1, 1} n=1280 -26.190250 + -1.3812 -0.3812 0.6188..., 1531.6189 1532.6189 1533.6189 + + +wqkv-11{1536, 7, 1, 1} n=1536 -26.190250 + -1.3812 -0.3812 0.6188..., 1531.6189 1532.6189 1533.6189 +VIEW == +wqkv-11 (view){64, 7, 16, 1} n=1536 1.245711 + -1.3812 -0.3812 0.6188..., 59.6188 60.6188 61.6188 + + +wqkv-11 (view){64, 7, 16, 1} n=64 1.245711 + -1.3812 -0.3812 0.6188..., 
59.6188 60.6188 61.6188 +CONT == +Qcur-11{64, 7, 16, 1} n=64 1.245711 + -1.3812 -0.3812 0.6188..., 59.6188 60.6188 61.6188 + + +Qcur-11{64, 7, 16, 1} n=64 2.498707 + -2.7704 -1.7704 -0.7704..., 58.2296 59.2296 60.2296 +RMS_NORM == +norm-11{64, 7, 16, 1} n=64 2.498707 + -2.7704 -1.7704 -0.7704..., 58.2296 59.2296 60.2296 + + +norm-11{64, 7, 16, 1} n=64 0.335676 + -0.5641 0.4359 1.4359..., 60.4359 61.4359 62.4359 +blk.11.attn_q_norm.weight{64, 1, 1, 1} n=64 117.868645 + 0.2036 1.2036 2.2036..., 61.2036 62.2036 63.2036 +MUL == +Qcur-11{64, 7, 16, 1} n=64 0.335676 + -0.5641 0.4359 1.4359..., 60.4359 61.4359 62.4359 + + +Qcur-11{64, 7, 16, 1} n=64 0.335676 + -0.5641 0.4359 1.4359..., 60.4359 61.4359 62.4359 +RESHAPE == +Qcur-11 (reshaped){64, 16, 7, 1} n=64 0.335676 + -0.5641 0.4359 1.4359..., 60.4359 61.4359 62.4359 + + +Qcur-11 (reshaped){64, 16, 7, 1} n=64 0.335676 + -0.5641 0.4359 1.4359..., 60.4359 61.4359 62.4359 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-11{64, 16, 7, 1} n=64 0.335676 + -0.5641 0.4359 1.4359..., 60.4359 61.4359 62.4359 + + +Qcur-11{64, 16, 7, 1} n=64 0.041960 + -0.0705 0.9295 1.9295..., 60.9295 61.9295 62.9295 +SCALE == +Qcur-11{64, 16, 7, 1} n=64 0.041960 + -0.0705 0.9295 1.9295..., 60.9295 61.9295 62.9295 + + +wqkv-11{1536, 7, 1, 1} n=1536 -26.190250 + -1.3812 -0.3812 0.6188..., 1531.6189 1532.6189 1533.6189 +VIEW == +wqkv-11 (view){64, 7, 4, 1} n=1536 0.438913 + -0.0768 0.9232 1.9232..., 60.9232 61.9232 62.9232 + + +wqkv-11 (view){64, 7, 4, 1} n=64 0.438913 + -0.0768 0.9232 1.9232..., 60.9232 61.9232 62.9232 +CONT == +Kcur-11{64, 7, 4, 1} n=64 0.438913 + -0.0768 0.9232 1.9232..., 60.9232 61.9232 62.9232 + + +Kcur-11{64, 7, 4, 1} n=64 0.832667 + -0.1457 0.8543 1.8543..., 60.8543 61.8543 62.8543 +RMS_NORM == +norm-11{64, 7, 4, 1} n=64 0.832667 + -0.1457 0.8543 1.8543..., 60.8543 61.8543 62.8543 + + +norm-11{64, 7, 4, 1} n=64 -1.276385 + 0.0159 1.0159 2.0159..., 61.0159 62.0159 63.0159 
+blk.11.attn_k_norm.weight{64, 1, 1, 1} n=64 114.415764 + -0.1091 0.8909 1.8909..., 60.8909 61.8909 62.8909 +MUL == +Kcur-11{64, 7, 4, 1} n=64 -1.276385 + 0.0159 1.0159 2.0159..., 61.0159 62.0159 63.0159 + + +Kcur-11{64, 7, 4, 1} n=64 -1.276385 + 0.0159 1.0159 2.0159..., 61.0159 62.0159 63.0159 +RESHAPE == +Kcur-11 (reshaped){64, 4, 7, 1} n=64 -1.276385 + 0.0159 1.0159 2.0159..., 61.0159 62.0159 63.0159 + + +Kcur-11 (reshaped){64, 4, 7, 1} n=64 -1.276385 + 0.0159 1.0159 2.0159..., 61.0159 62.0159 63.0159 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_489{64, 4, 7, 1} n=64 -1.276385 + 0.0159 1.0159 2.0159..., 61.0159 62.0159 63.0159 + + +node_489{64, 4, 7, 1} n=64 -1.276385 + 0.0159 1.0159 2.0159..., 61.0159 62.0159 63.0159 +REPEAT == +node_490{64, 4, 28, 1} n=64 -1.276385 + 0.0159 1.0159 2.0159..., 61.0159 62.0159 63.0159 + + +wqkv-11{1536, 7, 1, 1} n=1536 -26.190250 + -1.3812 -0.3812 0.6188..., 1531.6189 1532.6189 1533.6189 +VIEW == +wqkv-11 (view){64, 7, 4, 1} n=1536 1.318665 + -0.6150 0.3850 1.3850..., 60.3850 61.3850 62.3850 + + +wqkv-11 (view){64, 7, 4, 1} n=64 1.318665 + -0.6150 0.3850 1.3850..., 60.3850 61.3850 62.3850 +CONT == +Vcur-11{64, 7, 4, 1} n=64 1.318665 + -0.6150 0.3850 1.3850..., 60.3850 61.3850 62.3850 + + +Vcur-11{64, 7, 4, 1} n=64 1.318665 + -0.6150 0.3850 1.3850..., 60.3850 61.3850 62.3850 +REPEAT == +node_493{64, 7, 16, 1} n=64 1.318665 + -0.6150 0.3850 1.3850..., 60.3850 61.3850 62.3850 + + +cache_v_l11{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-11{32, 64, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l11{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-11{64, 32, 8, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-11{64, 16, 7, 1} n=64 0.041960 + -0.0705 0.9295 1.9295..., 60.9295 61.9295 62.9295 +PERMUTE == +q-11{64, 7, 16, 1} 
n=64 0.041960 + -0.0705 0.9295 1.9295..., 60.9295 61.9295 62.9295 + + +k-11{64, 32, 8, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-11{64, 7, 16, 1} n=64 0.041960 + -0.0705 0.9295 1.9295..., 60.9295 61.9295 62.9295 +MUL_MAT == +kq-11{32, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-11{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-11{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-11{32, 64, 8, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-11{32, 7, 16, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-11{64, 7, 16, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-11{64, 7, 16, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-11{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-11{64, 16, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-11{1024, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 + + +blk.11.attn_output.weight{1024, 1280, 1, 1} n=1024 0.000003 + -0.0108 -0.0108 -0.0108..., -0.0215 -0.0215 -0.0215 +kqv_merged_cont-11{1024, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1021.0000 1022.0000 1023.0000 +MUL_MAT == +kqv_out-11{1280, 7, 1, 1} n=1024 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-11{1280, 7, 1, 1} n=1280 -1442.413330 + -4.5540 -3.5540 -2.5540..., 1272.4460 1273.4460 1274.4460 +l_out-10{1280, 7, 1, 1} n=1280 -1442.413330 + -4.5540 -3.5540 -2.5540..., 1272.4460 1273.4460 1274.4460 +ADD == +node_503{1280, 7, 1, 1} n=1280 -1442.413330 + -4.5540 -3.5540 -2.5540..., 1272.4460 1273.4460 1274.4460 + + 
+node_503{1280, 7, 1, 1} n=1280 -1442.413330 + -4.5540 -3.5540 -2.5540..., 1272.4460 1273.4460 1274.4460 +RMS_NORM == +norm-11{1280, 7, 1, 1} n=1280 -34.475323 + -0.1088 0.8912 1.8912..., 1276.8911 1277.8911 1278.8911 + + +norm-11{1280, 7, 1, 1} n=1280 -11.526356 + -0.0291 0.9709 1.9709..., 1276.9708 1277.9708 1278.9708 +blk.11.ffn_norm.weight{1280, 1, 1, 1} n=1280 655.803589 + 0.2678 1.2678 2.2678..., 1277.2678 1278.2678 1279.2678 +MUL == +ffn_norm-11{1280, 7, 1, 1} n=1280 -11.526356 + -0.0291 0.9709 1.9709..., 1276.9708 1277.9708 1278.9708 + + +ffn_norm-11{1280, 7, 1, 1} n=1280 -33.819817 + -0.0855 0.9145 1.9145..., 1276.9144 1277.9144 1278.9144 +RMS_NORM == +norm-11{1280, 7, 1, 1} n=1280 -33.819817 + -0.0855 0.9145 1.9145..., 1276.9144 1277.9144 1278.9144 + + +norm-11{1280, 7, 1, 1} n=1280 -11.314798 + -0.0229 0.9771 1.9771..., 1276.9771 1277.9771 1278.9771 +blk.11.ffn_norm.weight{1280, 1, 1, 1} n=1280 655.803589 + 0.2678 1.2678 2.2678..., 1277.2678 1278.2678 1279.2678 +MUL == +ffn_norm-11{1280, 7, 1, 1} n=1280 -11.314798 + -0.0229 0.9771 1.9771..., 1276.9771 1277.9771 1278.9771 + + +blk.11.ffn_up.weight{1280, 7680, 1, 1} n=1280 0.000000 + -0.0352 -0.0352 -0.0353..., -0.0859 -0.0859 -0.0860 +ffn_norm-11{1280, 7, 1, 1} n=1280 -11.314798 + -0.0229 0.9771 1.9771..., 1276.9771 1277.9771 1278.9771 +MUL_MAT == +ffn_silu-11{7680, 7, 1, 1} n=1280 -1405.568726 + -0.1891 0.8109 1.8109..., 7676.8110 7677.8110 7678.8110 + + +ffn_silu-11{7680, 7, 1, 1} n=7680 -1405.568726 + -0.1891 0.8109 1.8109..., 7676.8110 7677.8110 7678.8110 +VIEW == +ffn_up-11 (view){3840, 7, 1, 1} n=7680 -1381.969971 + -0.1891 0.8109 1.8109..., 3836.8108 3837.8108 3838.8108 + + +ffn_up-11 (view){3840, 7, 1, 1} n=3840 -1381.969971 + -0.1891 0.8109 1.8109..., 3836.8108 3837.8108 3838.8108 +CONT == +ffn_up-11 (view) (cont){3840, 7, 1, 1} n=3840 -1381.969971 + -0.1891 0.8109 1.8109..., 3836.8108 3837.8108 3838.8108 + + +ffn_up-11 (view) (cont){3840, 7, 1, 1} n=3840 -369.912415 + -0.0856 0.9144 1.9144..., 
3836.9143 3837.9143 3838.9143 +SILU == +node_511{3840, 7, 1, 1} n=3840 -369.912415 + -0.0856 0.9144 1.9144..., 3836.9143 3837.9143 3838.9143 + + +ffn_silu-11{7680, 7, 1, 1} n=7680 -1405.568726 + -0.1891 0.8109 1.8109..., 7676.8110 7677.8110 7678.8110 +VIEW == +ffn_up-11 (view){3840, 7, 1, 1} n=7680 -23.596827 + 0.0540 1.0540 2.0540..., 3837.0540 3838.0540 3839.0540 + + +ffn_up-11 (view){3840, 7, 1, 1} n=3840 -23.596827 + 0.0540 1.0540 2.0540..., 3837.0540 3838.0540 3839.0540 +CONT == +ffn_up-11 (view) (cont){3840, 7, 1, 1} n=3840 -23.596827 + 0.0540 1.0540 2.0540..., 3837.0540 3838.0540 3839.0540 + + +node_511{3840, 7, 1, 1} n=3840 2.485716 + -0.0046 0.9954 1.9954..., 3836.9954 3837.9954 3838.9954 +ffn_up-11 (view) (cont){3840, 7, 1, 1} n=3840 -23.596827 + 0.0540 1.0540 2.0540..., 3837.0540 3838.0540 3839.0540 +MUL == +ffn_mul-11{3840, 7, 1, 1} n=3840 2.485716 + -0.0046 0.9954 1.9954..., 3836.9954 3837.9954 3838.9954 + + +blk.11.ffn_down.weight{3840, 1280, 1, 1} n=3840 0.000000 + 0.0787 0.0787 0.0788..., 1.0059 1.0068 1.0078 +ffn_mul-11{3840, 7, 1, 1} n=3840 2.485716 + -0.0046 0.9954 1.9954..., 3836.9954 3837.9954 3838.9954 +MUL_MAT == +ffn_out-11{1280, 7, 1, 1} n=3840 -1.108831 + 0.0770 1.0770 2.0770..., 1277.0770 1278.0770 1279.0770 + + +node_503{1280, 7, 1, 1} n=1280 -1443.521240 + -4.4770 -3.4770 -2.4770..., 1272.5231 1273.5231 1274.5231 +ffn_out-11{1280, 7, 1, 1} n=1280 -1.108831 + 0.0770 1.0770 2.0770..., 1277.0770 1278.0770 1279.0770 +ADD == +l_out-11{1280, 7, 1, 1} n=1280 -1443.521240 + -4.4770 -3.4770 -2.4770..., 1272.5231 1273.5231 1274.5231 + + +l_out-11{1280, 7, 1, 1} n=1280 -1443.521240 + -4.4770 -3.4770 -2.4770..., 1272.5231 1273.5231 1274.5231 +RMS_NORM == +norm-12{1280, 7, 1, 1} n=1280 -34.472301 + -0.1069 0.8931 1.8931..., 1276.8931 1277.8931 1278.8931 + + +norm-12{1280, 7, 1, 1} n=1280 -10.226229 + -0.1677 0.8323 1.8323..., 1276.8324 1277.8324 1278.8324 +blk.12.attn_norm.weight{1280, 1, 1, 1} n=1280 1176.609619 + 1.5682 2.5682 3.5682..., 1278.5682 
1279.5682 1280.5682 +MUL == +attn_norm-12{1280, 7, 1, 1} n=1280 -10.226229 + -0.1677 0.8323 1.8323..., 1276.8324 1277.8324 1278.8324 + + +blk.12.attn_qkv.weight{1280, 1920, 1, 1} n=1280 0.000000 + 0.0740 0.0740 0.0741..., 0.1788 0.1790 0.1791 +attn_norm-12{1280, 7, 1, 1} n=1280 -10.226229 + -0.1677 0.8323 1.8323..., 1276.8324 1277.8324 1278.8324 +MUL_MAT == +wqkv-12{1920, 7, 1, 1} n=1280 62.353695 + -3.0188 -2.0188 -1.0188..., 1913.9812 1914.9812 1915.9812 + + +wqkv-12{1920, 7, 1, 1} n=1920 62.353695 + -3.0188 -2.0188 -1.0188..., 1913.9812 1914.9812 1915.9812 +VIEW == +wqkv-12 (view){64, 7, 20, 1} n=1920 13.801246 + -3.0188 -2.0188 -1.0188..., 57.9812 58.9812 59.9812 + + +wqkv-12 (view){64, 7, 20, 1} n=64 13.801246 + -3.0188 -2.0188 -1.0188..., 57.9812 58.9812 59.9812 +CONT == +Qcur-12{64, 7, 20, 1} n=64 13.801246 + -3.0188 -2.0188 -1.0188..., 57.9812 58.9812 59.9812 + + +Qcur-12{64, 7, 20, 1} n=64 11.843224 + -2.5905 -1.5905 -0.5905..., 58.4095 59.4095 60.4095 +RMS_NORM == +norm-12{64, 7, 20, 1} n=64 11.843224 + -2.5905 -1.5905 -0.5905..., 58.4095 59.4095 60.4095 + + +norm-12{64, 7, 20, 1} n=64 8.795938 + -0.2148 0.7852 1.7852..., 60.7852 61.7852 62.7852 +blk.12.attn_q_norm.weight{64, 1, 1, 1} n=64 109.524849 + 0.0829 1.0829 2.0829..., 61.0829 62.0829 63.0829 +MUL == +Qcur-12{64, 7, 20, 1} n=64 8.795938 + -0.2148 0.7852 1.7852..., 60.7852 61.7852 62.7852 + + +Qcur-12{64, 7, 20, 1} n=64 8.795938 + -0.2148 0.7852 1.7852..., 60.7852 61.7852 62.7852 +RESHAPE == +Qcur-12 (reshaped){64, 20, 7, 1} n=64 8.795938 + -0.2148 0.7852 1.7852..., 60.7852 61.7852 62.7852 + + +Qcur-12 (reshaped){64, 20, 7, 1} n=64 8.795938 + -0.2148 0.7852 1.7852..., 60.7852 61.7852 62.7852 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-12{64, 20, 7, 1} n=64 8.795938 + -0.2148 0.7852 1.7852..., 60.7852 61.7852 62.7852 + + +Qcur-12{64, 20, 7, 1} n=64 1.099492 + -0.0269 0.9731 1.9731..., 60.9731 61.9731 62.9731 +SCALE == +Qcur-12{64, 20, 7, 1} n=64 
1.099492 + -0.0269 0.9731 1.9731..., 60.9731 61.9731 62.9731 + + +wqkv-12{1920, 7, 1, 1} n=1920 62.353695 + -3.0188 -2.0188 -1.0188..., 1913.9812 1914.9812 1915.9812 +VIEW == +wqkv-12 (view){64, 7, 5, 1} n=1920 9.026163 + 3.5602 4.5602 5.5602..., 64.5602 65.5602 66.5602 + + +wqkv-12 (view){64, 7, 5, 1} n=64 9.026163 + 3.5602 4.5602 5.5602..., 64.5602 65.5602 66.5602 +CONT == +Kcur-12{64, 7, 5, 1} n=64 9.026163 + 3.5602 4.5602 5.5602..., 64.5602 65.5602 66.5602 + + +Kcur-12{64, 7, 5, 1} n=64 9.632412 + 3.7993 4.7993 5.7993..., 64.7993 65.7993 66.7993 +RMS_NORM == +norm-12{64, 7, 5, 1} n=64 9.632412 + 3.7993 4.7993 5.7993..., 64.7993 65.7993 66.7993 + + +norm-12{64, 7, 5, 1} n=64 19.693867 + 0.1329 1.1329 2.1329..., 61.1329 62.1329 63.1329 +blk.12.attn_k_norm.weight{64, 1, 1, 1} n=64 106.967125 + 0.0350 1.0350 2.0350..., 61.0350 62.0350 63.0350 +MUL == +Kcur-12{64, 7, 5, 1} n=64 19.693867 + 0.1329 1.1329 2.1329..., 61.1329 62.1329 63.1329 + + +Kcur-12{64, 7, 5, 1} n=64 19.693867 + 0.1329 1.1329 2.1329..., 61.1329 62.1329 63.1329 +RESHAPE == +Kcur-12 (reshaped){64, 5, 7, 1} n=64 19.693867 + 0.1329 1.1329 2.1329..., 61.1329 62.1329 63.1329 + + +Kcur-12 (reshaped){64, 5, 7, 1} n=64 19.693867 + 0.1329 1.1329 2.1329..., 61.1329 62.1329 63.1329 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_532{64, 5, 7, 1} n=64 19.693867 + 0.1329 1.1329 2.1329..., 61.1329 62.1329 63.1329 + + +node_532{64, 5, 7, 1} n=64 19.693867 + 0.1329 1.1329 2.1329..., 61.1329 62.1329 63.1329 +REPEAT == +node_533{64, 5, 28, 1} n=64 19.693867 + 0.1329 1.1329 2.1329..., 61.1329 62.1329 63.1329 + + +wqkv-12{1920, 7, 1, 1} n=1920 62.353695 + -3.0188 -2.0188 -1.0188..., 1913.9812 1914.9812 1915.9812 +VIEW == +wqkv-12 (view){64, 7, 5, 1} n=1920 -488820392866834739960183995236352.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +wqkv-12 (view){64, 7, 5, 1} n=64 -488820392866834739960183995236352.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 
63.0000 +CONT == +Vcur-12{64, 7, 5, 1} n=64 -488820392866834739960183995236352.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +Vcur-12{64, 7, 5, 1} n=64 -488820392866834739960183995236352.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +REPEAT == +node_536{64, 7, 20, 1} n=64 -488820392866834739960183995236352.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +cache_v_l12{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-12{32, 64, 10, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l12{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-12{64, 32, 10, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-12{64, 20, 7, 1} n=64 1.099492 + -0.0269 0.9731 1.9731..., 60.9731 61.9731 62.9731 +PERMUTE == +q-12{64, 7, 20, 1} n=64 1.099492 + -0.0269 0.9731 1.9731..., 60.9731 61.9731 62.9731 + + +k-12{64, 32, 10, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-12{64, 7, 20, 1} n=64 1.099492 + -0.0269 0.9731 1.9731..., 60.9731 61.9731 62.9731 +MUL_MAT == +kq-12{32, 7, 20, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-12{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-12{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-12{32, 64, 10, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-12{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-12{64, 7, 20, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-12{64, 7, 20, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-12{64, 20, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 
62.0000 63.0000 + + +kqv_merged-12{64, 20, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-12{1280, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +blk.12.attn_output.weight{1280, 1280, 1, 1} n=1280 -0.000003 + 0.0634 0.0635 0.0635..., 0.1577 0.1578 0.1580 +kqv_merged_cont-12{1280, 7, 1, 1} n=1280 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +MUL_MAT == +kqv_out-12{1280, 7, 1, 1} n=1280 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-12{1280, 7, 1, 1} n=1280 -1443.521240 + -4.4770 -3.4770 -2.4770..., 1272.5231 1273.5231 1274.5231 +l_out-11{1280, 7, 1, 1} n=1280 -1443.521240 + -4.4770 -3.4770 -2.4770..., 1272.5231 1273.5231 1274.5231 +ADD == +node_546{1280, 7, 1, 1} n=1280 -1443.521240 + -4.4770 -3.4770 -2.4770..., 1272.5231 1273.5231 1274.5231 + + +node_546{1280, 7, 1, 1} n=1280 -1443.521240 + -4.4770 -3.4770 -2.4770..., 1272.5231 1273.5231 1274.5231 +RMS_NORM == +norm-12{1280, 7, 1, 1} n=1280 -34.472301 + -0.1069 0.8931 1.8931..., 1276.8931 1277.8931 1278.8931 + + +norm-12{1280, 7, 1, 1} n=1280 -12.950638 + -0.0391 0.9609 1.9609..., 1276.9609 1277.9609 1278.9609 +blk.12.ffn_norm.weight{1280, 1, 1, 1} n=1280 831.064819 + 0.3658 1.3658 2.3658..., 1277.3658 1278.3658 1279.3658 +MUL == +ffn_norm-12{1280, 7, 1, 1} n=1280 -12.950638 + -0.0391 0.9609 1.9609..., 1276.9609 1277.9609 1278.9609 + + +ffn_norm-12{1280, 7, 1, 1} n=1280 -33.857155 + -0.1022 0.8978 1.8978..., 1276.8977 1277.8977 1278.8977 +RMS_NORM == +norm-12{1280, 7, 1, 1} n=1280 -33.857155 + -0.1022 0.8978 1.8978..., 1276.8977 1277.8977 1278.8977 + + +norm-12{1280, 7, 1, 1} n=1280 -12.821001 + -0.0374 0.9626 1.9626..., 1276.9626 1277.9626 1278.9626 +blk.12.ffn_norm.weight{1280, 1, 1, 1} n=1280 831.064819 + 0.3658 1.3658 2.3658..., 1277.3658 1278.3658 1279.3658 +MUL == +ffn_norm-12{1280, 7, 1, 1} n=1280 -12.821001 + -0.0374 0.9626 1.9626..., 1276.9626 1277.9626 1278.9626 + + 
+blk.12.ffn_up.weight{1280, 8704, 1, 1} n=1280 0.000000 + -0.0407 -0.0407 -0.0408..., -0.0969 -0.0969 -0.0970 +ffn_norm-12{1280, 7, 1, 1} n=1280 -12.821001 + -0.0374 0.9626 1.9626..., 1276.9626 1277.9626 1278.9626 +MUL_MAT == +ffn_silu-12{8704, 7, 1, 1} n=1280 -1166.662354 + -0.4539 0.5461 1.5461..., 8700.5459 8701.5459 8702.5459 + + +ffn_silu-12{8704, 7, 1, 1} n=8704 -1166.662354 + -0.4539 0.5461 1.5461..., 8700.5459 8701.5459 8702.5459 +VIEW == +ffn_up-12 (view){4352, 7, 1, 1} n=8704 -1160.652710 + -0.4539 0.5461 1.5461..., 4348.5459 4349.5459 4350.5459 + + +ffn_up-12 (view){4352, 7, 1, 1} n=4352 -1160.652710 + -0.4539 0.5461 1.5461..., 4348.5459 4349.5459 4350.5459 +CONT == +ffn_up-12 (view) (cont){4352, 7, 1, 1} n=4352 -1160.652710 + -0.4539 0.5461 1.5461..., 4348.5459 4349.5459 4350.5459 + + +ffn_up-12 (view) (cont){4352, 7, 1, 1} n=4352 -177.249023 + -0.1763 0.8237 1.8237..., 4348.8237 4349.8237 4350.8237 +SILU == +node_554{4352, 7, 1, 1} n=4352 -177.249023 + -0.1763 0.8237 1.8237..., 4348.8237 4349.8237 4350.8237 + + +ffn_silu-12{8704, 7, 1, 1} n=8704 -1166.662354 + -0.4539 0.5461 1.5461..., 8700.5459 8701.5459 8702.5459 +VIEW == +ffn_up-12 (view){4352, 7, 1, 1} n=8704 -6.007252 + 0.2662 1.2662 2.2662..., 4349.2661 4350.2661 4351.2661 + + +ffn_up-12 (view){4352, 7, 1, 1} n=4352 -6.007252 + 0.2662 1.2662 2.2662..., 4349.2661 4350.2661 4351.2661 +CONT == +ffn_up-12 (view) (cont){4352, 7, 1, 1} n=4352 -6.007252 + 0.2662 1.2662 2.2662..., 4349.2661 4350.2661 4351.2661 + + +node_554{4352, 7, 1, 1} n=4352 -6.169259 + -0.0469 0.9531 1.9531..., 4348.9531 4349.9531 4350.9531 +ffn_up-12 (view) (cont){4352, 7, 1, 1} n=4352 -6.007252 + 0.2662 1.2662 2.2662..., 4349.2661 4350.2661 4351.2661 +MUL == +ffn_mul-12{4352, 7, 1, 1} n=4352 -6.169259 + -0.0469 0.9531 1.9531..., 4348.9531 4349.9531 4350.9531 + + +blk.12.ffn_down.weight{4352, 1280, 1, 1} n=4352 0.000003 + -0.0699 -0.0700 -0.0701..., -1.3662 -1.3672 -1.3682 +ffn_mul-12{4352, 7, 1, 1} n=4352 -6.169259 + -0.0469 
0.9531 1.9531..., 4348.9531 4349.9531 4350.9531 +MUL_MAT == +ffn_out-12{1280, 7, 1, 1} n=4352 4.605227 + 0.0831 1.0831 2.0831..., 1277.0831 1278.0831 1279.0831 + + +node_546{1280, 7, 1, 1} n=1280 -1438.913452 + -4.3938 -3.3938 -2.3938..., 1272.6062 1273.6062 1274.6062 +ffn_out-12{1280, 7, 1, 1} n=1280 4.605227 + 0.0831 1.0831 2.0831..., 1277.0831 1278.0831 1279.0831 +ADD == +l_out-12{1280, 7, 1, 1} n=1280 -1438.913452 + -4.3938 -3.3938 -2.3938..., 1272.6062 1273.6062 1274.6062 + + +l_out-12{1280, 7, 1, 1} n=1280 -1438.913452 + -4.3938 -3.3938 -2.3938..., 1272.6062 1273.6062 1274.6062 +RMS_NORM == +norm-13{1280, 7, 1, 1} n=1280 -34.321598 + -0.1048 0.8952 1.8952..., 1276.8951 1277.8951 1278.8951 + + +norm-13{1280, 7, 1, 1} n=1280 -16.511385 + -0.4195 0.5805 1.5805..., 1276.5804 1277.5804 1278.5804 +blk.13.attn_norm.weight{1280, 1, 1, 1} n=1280 2264.341064 + 4.0028 5.0028 6.0028..., 1281.0028 1282.0028 1283.0028 +MUL == +attn_norm-13{1280, 7, 1, 1} n=1280 -16.511385 + -0.4195 0.5805 1.5805..., 1276.5804 1277.5804 1278.5804 + + +blk.13.attn_qkv.weight{1280, 1920, 1, 1} n=1280 0.000000 + 0.2111 0.2112 0.2113..., 0.4839 0.4841 0.4844 +attn_norm-13{1280, 7, 1, 1} n=1280 -16.511385 + -0.4195 0.5805 1.5805..., 1276.5804 1277.5804 1278.5804 +MUL_MAT == +wqkv-13{1920, 7, 1, 1} n=1280 46.203876 + -2.8563 -1.8563 -0.8563..., 1914.1437 1915.1437 1916.1437 + + +wqkv-13{1920, 7, 1, 1} n=1920 46.203876 + -2.8563 -1.8563 -0.8563..., 1914.1437 1915.1437 1916.1437 +VIEW == +wqkv-13 (view){64, 7, 20, 1} n=1920 -3.039977 + -2.8563 -1.8563 -0.8563..., 58.1437 59.1437 60.1437 + + +wqkv-13 (view){64, 7, 20, 1} n=64 -3.039977 + -2.8563 -1.8563 -0.8563..., 58.1437 59.1437 60.1437 +CONT == +Qcur-13{64, 7, 20, 1} n=64 -3.039977 + -2.8563 -1.8563 -0.8563..., 58.1437 59.1437 60.1437 + + +Qcur-13{64, 7, 20, 1} n=64 -2.152671 + -2.0226 -1.0226 -0.0226..., 58.9774 59.9774 60.9774 +RMS_NORM == +norm-13{64, 7, 20, 1} n=64 -2.152671 + -2.0226 -1.0226 -0.0226..., 58.9774 59.9774 60.9774 + + 
+norm-13{64, 7, 20, 1} n=64 7.710611 + -0.0407 0.9593 1.9593..., 60.9593 61.9593 62.9593 +blk.13.attn_q_norm.weight{64, 1, 1, 1} n=64 104.134888 + 0.0201 1.0201 2.0201..., 61.0201 62.0201 63.0201 +MUL == +Qcur-13{64, 7, 20, 1} n=64 7.710611 + -0.0407 0.9593 1.9593..., 60.9593 61.9593 62.9593 + + +Qcur-13{64, 7, 20, 1} n=64 7.710611 + -0.0407 0.9593 1.9593..., 60.9593 61.9593 62.9593 +RESHAPE == +Qcur-13 (reshaped){64, 20, 7, 1} n=64 7.710611 + -0.0407 0.9593 1.9593..., 60.9593 61.9593 62.9593 + + +Qcur-13 (reshaped){64, 20, 7, 1} n=64 7.710611 + -0.0407 0.9593 1.9593..., 60.9593 61.9593 62.9593 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-13{64, 20, 7, 1} n=64 7.710611 + -0.0407 0.9593 1.9593..., 60.9593 61.9593 62.9593 + + +Qcur-13{64, 20, 7, 1} n=64 0.963826 + -0.0051 0.9949 1.9949..., 60.9949 61.9949 62.9949 +SCALE == +Qcur-13{64, 20, 7, 1} n=64 0.963826 + -0.0051 0.9949 1.9949..., 60.9949 61.9949 62.9949 + + +wqkv-13{1920, 7, 1, 1} n=1920 46.203876 + -2.8563 -1.8563 -0.8563..., 1914.1437 1915.1437 1916.1437 +VIEW == +wqkv-13 (view){64, 7, 5, 1} n=1920 16.552410 + 4.1546 5.1546 6.1546..., 65.1546 66.1546 67.1546 + + +wqkv-13 (view){64, 7, 5, 1} n=64 16.552410 + 4.1546 5.1546 6.1546..., 65.1546 66.1546 67.1546 +CONT == +Kcur-13{64, 7, 5, 1} n=64 16.552410 + 4.1546 5.1546 6.1546..., 65.1546 66.1546 67.1546 + + +Kcur-13{64, 7, 5, 1} n=64 14.764525 + 3.7058 4.7058 5.7058..., 64.7058 65.7058 66.7058 +RMS_NORM == +norm-13{64, 7, 5, 1} n=64 14.764525 + 3.7058 4.7058 5.7058..., 64.7058 65.7058 66.7058 + + +norm-13{64, 7, 5, 1} n=64 18.933956 + -0.0480 0.9520 1.9520..., 60.9520 61.9520 62.9520 +blk.13.attn_k_norm.weight{64, 1, 1, 1} n=64 101.292580 + -0.0129 0.9871 1.9871..., 60.9871 61.9871 62.9871 +MUL == +Kcur-13{64, 7, 5, 1} n=64 18.933956 + -0.0480 0.9520 1.9520..., 60.9520 61.9520 62.9520 + + +Kcur-13{64, 7, 5, 1} n=64 18.933956 + -0.0480 0.9520 1.9520..., 60.9520 61.9520 62.9520 +RESHAPE == +Kcur-13 
(reshaped){64, 5, 7, 1} n=64 18.933956 + -0.0480 0.9520 1.9520..., 60.9520 61.9520 62.9520 + + +Kcur-13 (reshaped){64, 5, 7, 1} n=64 18.933956 + -0.0480 0.9520 1.9520..., 60.9520 61.9520 62.9520 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_575{64, 5, 7, 1} n=64 18.933956 + -0.0480 0.9520 1.9520..., 60.9520 61.9520 62.9520 + + +node_575{64, 5, 7, 1} n=64 18.933956 + -0.0480 0.9520 1.9520..., 60.9520 61.9520 62.9520 +REPEAT == +node_576{64, 5, 28, 1} n=64 18.933956 + -0.0480 0.9520 1.9520..., 60.9520 61.9520 62.9520 + + +wqkv-13{1920, 7, 1, 1} n=1920 46.203876 + -2.8563 -1.8563 -0.8563..., 1914.1437 1915.1437 1916.1437 +VIEW == +wqkv-13 (view){64, 7, 5, 1} n=1920 -116827927160593221305253265724669952.000000 +-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000..., -332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000 + + +wqkv-13 (view){64, 7, 5, 1} n=64 -116827927160593221305253265724669952.000000 +-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000..., -332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000 +CONT == +Vcur-13{64, 7, 5, 1} n=64 -116827927160593221305253265724669952.000000 +-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000..., -332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000 + + +Vcur-13{64, 7, 5, 1} n=64 -116827927160593221305253265724669952.000000 +-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000..., 
-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000 +REPEAT == +node_579{64, 7, 20, 1} n=64 -116827927160593221305253265724669952.000000 +-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000..., -332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000-332309494633348167552585961704521728.0000 + + +cache_v_l13{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-13{32, 64, 10, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l13{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-13{64, 32, 10, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-13{64, 20, 7, 1} n=64 0.963826 + -0.0051 0.9949 1.9949..., 60.9949 61.9949 62.9949 +PERMUTE == +q-13{64, 7, 20, 1} n=64 0.963826 + -0.0051 0.9949 1.9949..., 60.9949 61.9949 62.9949 + + +k-13{64, 32, 10, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-13{64, 7, 20, 1} n=64 0.963826 + -0.0051 0.9949 1.9949..., 60.9949 61.9949 62.9949 +MUL_MAT == +kq-13{32, 7, 20, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-13{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-13{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-13{32, 64, 10, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-13{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-13{64, 7, 20, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-13{64, 7, 20, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == 
+kqv_merged-13{64, 20, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-13{64, 20, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-13{1280, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +blk.13.attn_output.weight{1280, 1280, 1, 1} n=1280 0.000146 + -0.0258 -0.0258 -0.0259..., -0.0594 -0.0594 -0.0594 +kqv_merged_cont-13{1280, 7, 1, 1} n=1280 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +MUL_MAT == +kqv_out-13{1280, 7, 1, 1} n=1280 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-13{1280, 7, 1, 1} n=1280 -1438.913452 + -4.3938 -3.3938 -2.3938..., 1272.6062 1273.6062 1274.6062 +l_out-12{1280, 7, 1, 1} n=1280 -1438.913452 + -4.3938 -3.3938 -2.3938..., 1272.6062 1273.6062 1274.6062 +ADD == +node_589{1280, 7, 1, 1} n=1280 -1438.913452 + -4.3938 -3.3938 -2.3938..., 1272.6062 1273.6062 1274.6062 + + +node_589{1280, 7, 1, 1} n=1280 -1438.913452 + -4.3938 -3.3938 -2.3938..., 1272.6062 1273.6062 1274.6062 +RMS_NORM == +norm-13{1280, 7, 1, 1} n=1280 -34.321598 + -0.1048 0.8952 1.8952..., 1276.8951 1277.8951 1278.8951 + + +norm-13{1280, 7, 1, 1} n=1280 -14.130329 + -0.0413 0.9587 1.9587..., 1276.9587 1277.9587 1278.9587 +blk.13.ffn_norm.weight{1280, 1, 1, 1} n=1280 974.911194 + 0.3938 1.3938 2.3938..., 1277.3938 1278.3938 1279.3938 +MUL == +ffn_norm-13{1280, 7, 1, 1} n=1280 -14.130329 + -0.0413 0.9587 1.9587..., 1276.9587 1277.9587 1278.9587 + + +ffn_norm-13{1280, 7, 1, 1} n=1280 -33.838680 + -0.0988 0.9012 1.9012..., 1276.9011 1277.9011 1278.9011 +RMS_NORM == +norm-13{1280, 7, 1, 1} n=1280 -33.838680 + -0.0988 0.9012 1.9012..., 1276.9011 1277.9011 1278.9011 + + +norm-13{1280, 7, 1, 1} n=1280 -14.019225 + -0.0389 0.9611 1.9611..., 1276.9611 1277.9611 1278.9611 +blk.13.ffn_norm.weight{1280, 1, 1, 1} n=1280 974.911194 + 0.3938 1.3938 2.3938..., 1277.3938 1278.3938 1279.3938 +MUL == +ffn_norm-13{1280, 7, 1, 
1} n=1280 -14.019225 + -0.0389 0.9611 1.9611..., 1276.9611 1277.9611 1278.9611 + + +blk.13.ffn_up.weight{1280, 9216, 1, 1} n=1280 0.000000 + -0.0619 -0.0620 -0.0620..., -0.1536 -0.1537 -0.1538 +ffn_norm-13{1280, 7, 1, 1} n=1280 -14.019225 + -0.0389 0.9611 1.9611..., 1276.9611 1277.9611 1278.9611 +MUL_MAT == +ffn_silu-13{9216, 7, 1, 1} n=1280 -883.181763 + 0.3391 1.3391 2.3391..., 9213.3389 9214.3389 9215.3389 + + +ffn_silu-13{9216, 7, 1, 1} n=9216 -883.181763 + 0.3391 1.3391 2.3391..., 9213.3389 9214.3389 9215.3389 +VIEW == +ffn_up-13 (view){4608, 7, 1, 1} n=9216 -912.154785 + 0.3391 1.3391 2.3391..., 4605.3389 4606.3389 4607.3389 + + +ffn_up-13 (view){4608, 7, 1, 1} n=4608 -912.154785 + 0.3391 1.3391 2.3391..., 4605.3389 4606.3389 4607.3389 +CONT == +ffn_up-13 (view) (cont){4608, 7, 1, 1} n=4608 -912.154785 + 0.3391 1.3391 2.3391..., 4605.3389 4606.3389 4607.3389 + + +ffn_up-13 (view) (cont){4608, 7, 1, 1} n=4608 13.798156 + 0.1980 1.1980 2.1980..., 4605.1982 4606.1982 4607.1982 +SILU == +node_597{4608, 7, 1, 1} n=4608 13.798156 + 0.1980 1.1980 2.1980..., 4605.1982 4606.1982 4607.1982 + + +ffn_silu-13{9216, 7, 1, 1} n=9216 -883.181763 + 0.3391 1.3391 2.3391..., 9213.3389 9214.3389 9215.3389 +VIEW == +ffn_up-13 (view){4608, 7, 1, 1} n=9216 28.973198 + -0.6168 0.3832 1.3832..., 4604.3833 4605.3833 4606.3833 + + +ffn_up-13 (view){4608, 7, 1, 1} n=4608 28.973198 + -0.6168 0.3832 1.3832..., 4604.3833 4605.3833 4606.3833 +CONT == +ffn_up-13 (view) (cont){4608, 7, 1, 1} n=4608 28.973198 + -0.6168 0.3832 1.3832..., 4604.3833 4605.3833 4606.3833 + + +node_597{4608, 7, 1, 1} n=4608 -3.144649 + -0.1221 0.8779 1.8779..., 4604.8779 4605.8779 4606.8779 +ffn_up-13 (view) (cont){4608, 7, 1, 1} n=4608 28.973198 + -0.6168 0.3832 1.3832..., 4604.3833 4605.3833 4606.3833 +MUL == +ffn_mul-13{4608, 7, 1, 1} n=4608 -3.144649 + -0.1221 0.8779 1.8779..., 4604.8779 4605.8779 4606.8779 + + +blk.13.ffn_down.weight{4608, 1280, 1, 1} n=4608 -0.000001 + -0.0022 -0.0022 -0.0022..., -0.0512 
-0.0513 -0.0513 +ffn_mul-13{4608, 7, 1, 1} n=4608 -3.144649 + -0.1221 0.8779 1.8779..., 4604.8779 4605.8779 4606.8779 +MUL_MAT == +ffn_out-13{1280, 7, 1, 1} n=4608 13.666017 + 0.6968 1.6968 2.6968..., 1277.6968 1278.6968 1279.6968 + + +node_589{1280, 7, 1, 1} n=1280 -1425.250854 + -3.6971 -2.6971 -1.6971..., 1273.3030 1274.3030 1275.3030 +ffn_out-13{1280, 7, 1, 1} n=1280 13.666017 + 0.6968 1.6968 2.6968..., 1277.6968 1278.6968 1279.6968 +ADD == +l_out-13{1280, 7, 1, 1} n=1280 -1425.250854 + -3.6971 -2.6971 -1.6971..., 1273.3030 1274.3030 1275.3030 + + +l_out-13{1280, 7, 1, 1} n=1280 -1425.250854 + -3.6971 -2.6971 -1.6971..., 1273.3030 1274.3030 1275.3030 +RMS_NORM == +norm-14{1280, 7, 1, 1} n=1280 -33.955437 + -0.0881 0.9119 1.9119..., 1276.9119 1277.9119 1278.9119 + + +norm-14{1280, 7, 1, 1} n=1280 -15.127797 + -0.5406 0.4594 1.4594..., 1276.4595 1277.4595 1278.4595 +blk.14.attn_norm.weight{1280, 1, 1, 1} n=1280 2845.149658 + 6.1373 7.1373 8.1373..., 1283.1373 1284.1373 1285.1373 +MUL == +attn_norm-14{1280, 7, 1, 1} n=1280 -15.127797 + -0.5406 0.4594 1.4594..., 1276.4595 1277.4595 1278.4595 + + +blk.14.attn_qkv.weight{1280, 1920, 1, 1} n=1280 0.000004 + -0.0238 -0.0238 -0.0238..., -0.0552 -0.0553 -0.0553 +attn_norm-14{1280, 7, 1, 1} n=1280 -15.127797 + -0.5406 0.4594 1.4594..., 1276.4595 1277.4595 1278.4595 +MUL_MAT == +wqkv-14{1920, 7, 1, 1} n=1280 102.708748 + -0.9884 0.0116 1.0116..., 1916.0116 1917.0116 1918.0116 + + +wqkv-14{1920, 7, 1, 1} n=1920 102.708748 + -0.9884 0.0116 1.0116..., 1916.0116 1917.0116 1918.0116 +VIEW == +wqkv-14 (view){64, 7, 20, 1} n=1920 17.370770 + -0.9884 0.0116 1.0116..., 60.0116 61.0116 62.0116 + + +wqkv-14 (view){64, 7, 20, 1} n=64 17.370770 + -0.9884 0.0116 1.0116..., 60.0116 61.0116 62.0116 +CONT == +Qcur-14{64, 7, 20, 1} n=64 17.370770 + -0.9884 0.0116 1.0116..., 60.0116 61.0116 62.0116 + + +Qcur-14{64, 7, 20, 1} n=64 9.009232 + -0.5126 0.4874 1.4874..., 60.4874 61.4874 62.4874 +RMS_NORM == +norm-14{64, 7, 20, 1} n=64 9.009232 + 
-0.5126 0.4874 1.4874..., 60.4874 61.4874 62.4874 + + +norm-14{64, 7, 20, 1} n=64 -0.750684 + -0.6686 0.3314 1.3314..., 60.3314 61.3314 62.3314 +blk.14.attn_q_norm.weight{64, 1, 1, 1} n=64 104.558411 + 1.3042 2.3042 3.3042..., 62.3042 63.3042 64.3042 +MUL == +Qcur-14{64, 7, 20, 1} n=64 -0.750684 + -0.6686 0.3314 1.3314..., 60.3314 61.3314 62.3314 + + +Qcur-14{64, 7, 20, 1} n=64 -0.750684 + -0.6686 0.3314 1.3314..., 60.3314 61.3314 62.3314 +RESHAPE == +Qcur-14 (reshaped){64, 20, 7, 1} n=64 -0.750684 + -0.6686 0.3314 1.3314..., 60.3314 61.3314 62.3314 + + +Qcur-14 (reshaped){64, 20, 7, 1} n=64 -0.750684 + -0.6686 0.3314 1.3314..., 60.3314 61.3314 62.3314 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-14{64, 20, 7, 1} n=64 -0.750684 + -0.6686 0.3314 1.3314..., 60.3314 61.3314 62.3314 + + +Qcur-14{64, 20, 7, 1} n=64 -0.093836 + -0.0836 0.9164 1.9164..., 60.9164 61.9164 62.9164 +SCALE == +Qcur-14{64, 20, 7, 1} n=64 -0.093836 + -0.0836 0.9164 1.9164..., 60.9164 61.9164 62.9164 + + +wqkv-14{1920, 7, 1, 1} n=1920 102.708748 + -0.9884 0.0116 1.0116..., 1916.0116 1917.0116 1918.0116 +VIEW == +wqkv-14 (view){64, 7, 5, 1} n=1920 18.503204 + 2.9457 3.9457 4.9457..., 63.9457 64.9457 65.9457 + + +wqkv-14 (view){64, 7, 5, 1} n=64 18.503204 + 2.9457 3.9457 4.9457..., 63.9457 64.9457 65.9457 +CONT == +Kcur-14{64, 7, 5, 1} n=64 18.503204 + 2.9457 3.9457 4.9457..., 63.9457 64.9457 65.9457 + + +Kcur-14{64, 7, 5, 1} n=64 11.114650 + 1.7695 2.7695 3.7695..., 62.7695 63.7695 64.7695 +RMS_NORM == +norm-14{64, 7, 5, 1} n=64 11.114650 + 1.7695 2.7695 3.7695..., 62.7695 63.7695 64.7695 + + +norm-14{64, 7, 5, 1} n=64 19.317211 + 1.8815 2.8815 3.8815..., 62.8815 63.8815 64.8815 +blk.14.attn_k_norm.weight{64, 1, 1, 1} n=64 100.335892 + 1.0633 2.0633 3.0633..., 62.0633 63.0633 64.0633 +MUL == +Kcur-14{64, 7, 5, 1} n=64 19.317211 + 1.8815 2.8815 3.8815..., 62.8815 63.8815 64.8815 + + +Kcur-14{64, 7, 5, 1} n=64 19.317211 + 1.8815 2.8815 3.8815..., 
62.8815 63.8815 64.8815 +RESHAPE == +Kcur-14 (reshaped){64, 5, 7, 1} n=64 19.317211 + 1.8815 2.8815 3.8815..., 62.8815 63.8815 64.8815 + + +Kcur-14 (reshaped){64, 5, 7, 1} n=64 19.317211 + 1.8815 2.8815 3.8815..., 62.8815 63.8815 64.8815 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_618{64, 5, 7, 1} n=64 19.317211 + 1.8815 2.8815 3.8815..., 62.8815 63.8815 64.8815 + + +node_618{64, 5, 7, 1} n=64 19.317211 + 1.8815 2.8815 3.8815..., 62.8815 63.8815 64.8815 +REPEAT == +node_619{64, 5, 28, 1} n=64 19.317211 + 1.8815 2.8815 3.8815..., 62.8815 63.8815 64.8815 + + +wqkv-14{1920, 7, 1, 1} n=1920 102.708748 + -0.9884 0.0116 1.0116..., 1916.0116 1917.0116 1918.0116 +VIEW == +wqkv-14 (view){64, 7, 5, 1} n=1920 -2361222349430717428662273139776946176.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +wqkv-14 (view){64, 7, 5, 1} n=64 -2361222349430717428662273139776946176.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +Vcur-14{64, 7, 5, 1} n=64 -2361222349430717428662273139776946176.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +Vcur-14{64, 7, 5, 1} n=64 -2361222349430717428662273139776946176.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +REPEAT == +node_622{64, 7, 20, 1} n=64 -2361222349430717428662273139776946176.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +cache_v_l14{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-14{32, 64, 10, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l14{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-14{64, 32, 10, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-14{64, 20, 7, 1} n=64 -0.093836 + -0.0836 0.9164 1.9164..., 60.9164 61.9164 62.9164 +PERMUTE == +q-14{64, 7, 20, 1} n=64 -0.093836 + -0.0836 0.9164 1.9164..., 60.9164 61.9164 62.9164 + + +k-14{64, 32, 10, 
1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-14{64, 7, 20, 1} n=64 -0.093836 + -0.0836 0.9164 1.9164..., 60.9164 61.9164 62.9164 +MUL_MAT == +kq-14{32, 7, 20, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-14{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-14{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-14{32, 64, 10, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-14{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-14{64, 7, 20, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-14{64, 7, 20, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-14{64, 20, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-14{64, 20, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-14{1280, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +blk.14.attn_output.weight{1280, 1280, 1, 1} n=1280 0.000001 + 0.0002 0.0002 0.0002..., 0.0006 0.0006 0.0006 +kqv_merged_cont-14{1280, 7, 1, 1} n=1280 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +MUL_MAT == +kqv_out-14{1280, 7, 1, 1} n=1280 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-14{1280, 7, 1, 1} n=1280 -1425.250854 + -3.6971 -2.6971 -1.6971..., 1273.3030 1274.3030 1275.3030 +l_out-13{1280, 7, 1, 1} n=1280 -1425.250854 + -3.6971 -2.6971 -1.6971..., 1273.3030 1274.3030 1275.3030 +ADD == +node_632{1280, 7, 1, 1} n=1280 -1425.250854 + -3.6971 -2.6971 -1.6971..., 1273.3030 1274.3030 1275.3030 + + +node_632{1280, 7, 1, 1} n=1280 -1425.250854 + -3.6971 -2.6971 -1.6971..., 1273.3030 1274.3030 
1275.3030 +RMS_NORM == +norm-14{1280, 7, 1, 1} n=1280 -33.955437 + -0.0881 0.9119 1.9119..., 1276.9119 1277.9119 1278.9119 + + +norm-14{1280, 7, 1, 1} n=1280 -20.253294 + -0.0457 0.9543 1.9543..., 1276.9543 1277.9543 1278.9543 +blk.14.ffn_norm.weight{1280, 1, 1, 1} n=1280 1132.525146 + 0.5188 1.5188 2.5188..., 1277.5188 1278.5188 1279.5188 +MUL == +ffn_norm-14{1280, 7, 1, 1} n=1280 -20.253294 + -0.0457 0.9543 1.9543..., 1276.9543 1277.9543 1278.9543 + + +ffn_norm-14{1280, 7, 1, 1} n=1280 -34.118149 + -0.0770 0.9230 1.9230..., 1276.9230 1277.9230 1278.9230 +RMS_NORM == +norm-14{1280, 7, 1, 1} n=1280 -34.118149 + -0.0770 0.9230 1.9230..., 1276.9230 1277.9230 1278.9230 + + +norm-14{1280, 7, 1, 1} n=1280 -20.053419 + -0.0399 0.9601 1.9601..., 1276.9601 1277.9601 1278.9601 +blk.14.ffn_norm.weight{1280, 1, 1, 1} n=1280 1132.525146 + 0.5188 1.5188 2.5188..., 1277.5188 1278.5188 1279.5188 +MUL == +ffn_norm-14{1280, 7, 1, 1} n=1280 -20.053419 + -0.0399 0.9601 1.9601..., 1276.9601 1277.9601 1278.9601 + + +blk.14.ffn_up.weight{1280, 9728, 1, 1} n=1280 -0.000000 + -0.0421 -0.0421 -0.0421..., -0.0996 -0.0997 -0.0997 +ffn_norm-14{1280, 7, 1, 1} n=1280 -20.053419 + -0.0399 0.9601 1.9601..., 1276.9601 1277.9601 1278.9601 +MUL_MAT == +ffn_silu-14{9728, 7, 1, 1} n=1280 -1493.376587 + -0.6568 0.3432 1.3432..., 9724.3428 9725.3428 9726.3428 + + +ffn_silu-14{9728, 7, 1, 1} n=9728 -1493.376587 + -0.6568 0.3432 1.3432..., 9724.3428 9725.3428 9726.3428 +VIEW == +ffn_up-14 (view){4864, 7, 1, 1} n=9728 -1484.522217 + -0.6568 0.3432 1.3432..., 4860.3433 4861.3433 4862.3433 + + +ffn_up-14 (view){4864, 7, 1, 1} n=4864 -1484.522217 + -0.6568 0.3432 1.3432..., 4860.3433 4861.3433 4862.3433 +CONT == +ffn_up-14 (view) (cont){4864, 7, 1, 1} n=4864 -1484.522217 + -0.6568 0.3432 1.3432..., 4860.3433 4861.3433 4862.3433 + + +ffn_up-14 (view) (cont){4864, 7, 1, 1} n=4864 86.553650 + -0.2242 0.7758 1.7758..., 4860.7759 4861.7759 4862.7759 +SILU == +node_640{4864, 7, 1, 1} n=4864 86.553650 + -0.2242 
0.7758 1.7758..., 4860.7759 4861.7759 4862.7759 + + +ffn_silu-14{9728, 7, 1, 1} n=9728 -1493.376587 + -0.6568 0.3432 1.3432..., 9724.3428 9725.3428 9726.3428 +VIEW == +ffn_up-14 (view){4864, 7, 1, 1} n=9728 -8.857319 + -0.4931 0.5069 1.5069..., 4860.5068 4861.5068 4862.5068 + + +ffn_up-14 (view){4864, 7, 1, 1} n=4864 -8.857319 + -0.4931 0.5069 1.5069..., 4860.5068 4861.5068 4862.5068 +CONT == +ffn_up-14 (view) (cont){4864, 7, 1, 1} n=4864 -8.857319 + -0.4931 0.5069 1.5069..., 4860.5068 4861.5068 4862.5068 + + +node_640{4864, 7, 1, 1} n=4864 8.030127 + 0.1106 1.1106 2.1106..., 4861.1104 4862.1104 4863.1104 +ffn_up-14 (view) (cont){4864, 7, 1, 1} n=4864 -8.857319 + -0.4931 0.5069 1.5069..., 4860.5068 4861.5068 4862.5068 +MUL == +ffn_mul-14{4864, 7, 1, 1} n=4864 8.030127 + 0.1106 1.1106 2.1106..., 4861.1104 4862.1104 4863.1104 + + +blk.14.ffn_down.weight{4864, 1280, 1, 1} n=4864 0.000000 + 0.0042 0.0042 0.0042..., 0.1132 0.1133 0.1133 +ffn_mul-14{4864, 7, 1, 1} n=4864 8.030127 + 0.1106 1.1106 2.1106..., 4861.1104 4862.1104 4863.1104 +MUL_MAT == +ffn_out-14{1280, 7, 1, 1} n=4864 27.503433 + 0.4625 1.4625 2.4625..., 1277.4625 1278.4625 1279.4625 + + +node_632{1280, 7, 1, 1} n=1280 -1397.746338 + -3.2346 -2.2346 -1.2346..., 1273.7654 1274.7654 1275.7654 +ffn_out-14{1280, 7, 1, 1} n=1280 27.503433 + 0.4625 1.4625 2.4625..., 1277.4625 1278.4625 1279.4625 +ADD == +l_out-14{1280, 7, 1, 1} n=1280 -1397.746338 + -3.2346 -2.2346 -1.2346..., 1273.7654 1274.7654 1275.7654 + + +l_out-14{1280, 7, 1, 1} n=1280 -1397.746338 + -3.2346 -2.2346 -1.2346..., 1273.7654 1274.7654 1275.7654 +RMS_NORM == +norm-15{1280, 7, 1, 1} n=1280 -33.309299 + -0.0771 0.9229 1.9229..., 1276.9230 1277.9230 1278.9230 + + +norm-15{1280, 7, 1, 1} n=1280 -41.146503 + -0.7712 0.2288 1.2288..., 1276.2289 1277.2289 1278.2289 +blk.15.attn_norm.weight{1280, 1, 1, 1} n=1280 4081.196533 + 10.0043 11.0043 12.0043..., 1287.0043 1288.0043 1289.0043 +MUL == +attn_norm-15{1280, 7, 1, 1} n=1280 -41.146503 + -0.7712 0.2288 
1.2288..., 1276.2289 1277.2289 1278.2289 + + +blk.15.attn_qkv.weight{1280, 1920, 1, 1} n=1280 -0.000003 + 0.0121 0.0121 0.0121..., 0.0280 0.0280 0.0281 +attn_norm-15{1280, 7, 1, 1} n=1280 -41.146503 + -0.7712 0.2288 1.2288..., 1276.2289 1277.2289 1278.2289 +MUL_MAT == +wqkv-15{1920, 7, 1, 1} n=1280 -160.707581 + -10.9706 -9.9706 -8.9706..., 1906.0293 1907.0293 1908.0293 + + +wqkv-15{1920, 7, 1, 1} n=1920 -160.707581 + -10.9706 -9.9706 -8.9706..., 1906.0293 1907.0293 1908.0293 +VIEW == +wqkv-15 (view){64, 7, 20, 1} n=1920 -29.722378 + -10.9706 -9.9706 -8.9706..., 50.0294 51.0294 52.0294 + + +wqkv-15 (view){64, 7, 20, 1} n=64 -29.722378 + -10.9706 -9.9706 -8.9706..., 50.0294 51.0294 52.0294 +CONT == +Qcur-15{64, 7, 20, 1} n=64 -29.722378 + -10.9706 -9.9706 -8.9706..., 50.0294 51.0294 52.0294 + + +Qcur-15{64, 7, 20, 1} n=64 -8.298569 + -3.0630 -2.0630 -1.0630..., 57.9370 58.9370 59.9370 +RMS_NORM == +norm-15{64, 7, 20, 1} n=64 -8.298569 + -3.0630 -2.0630 -1.0630..., 57.9370 58.9370 59.9370 + + +norm-15{64, 7, 20, 1} n=64 -9.528766 + -0.0702 0.9298 1.9298..., 60.9298 61.9298 62.9298 +blk.15.attn_q_norm.weight{64, 1, 1, 1} n=64 88.068245 + 0.0229 1.0229 2.0229..., 61.0229 62.0229 63.0229 +MUL == +Qcur-15{64, 7, 20, 1} n=64 -9.528766 + -0.0702 0.9298 1.9298..., 60.9298 61.9298 62.9298 + + +Qcur-15{64, 7, 20, 1} n=64 -9.528766 + -0.0702 0.9298 1.9298..., 60.9298 61.9298 62.9298 +RESHAPE == +Qcur-15 (reshaped){64, 20, 7, 1} n=64 -9.528766 + -0.0702 0.9298 1.9298..., 60.9298 61.9298 62.9298 + + +Qcur-15 (reshaped){64, 20, 7, 1} n=64 -9.528766 + -0.0702 0.9298 1.9298..., 60.9298 61.9298 62.9298 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +Qcur-15{64, 20, 7, 1} n=64 -9.528766 + -0.0702 0.9298 1.9298..., 60.9298 61.9298 62.9298 + + +Qcur-15{64, 20, 7, 1} n=64 -1.191096 + -0.0088 0.9912 1.9912..., 60.9912 61.9912 62.9912 +SCALE == +Qcur-15{64, 20, 7, 1} n=64 -1.191096 + -0.0088 0.9912 1.9912..., 60.9912 61.9912 62.9912 + + 
+wqkv-15{1920, 7, 1, 1} n=1920 -160.707581 + -10.9706 -9.9706 -8.9706..., 1906.0293 1907.0293 1908.0293 +VIEW == +wqkv-15 (view){64, 7, 5, 1} n=1920 4.053219 + -4.0447 -3.0447 -2.0447..., 56.9553 57.9553 58.9553 + + +wqkv-15 (view){64, 7, 5, 1} n=64 4.053219 + -4.0447 -3.0447 -2.0447..., 56.9553 57.9553 58.9553 +CONT == +Kcur-15{64, 7, 5, 1} n=64 4.053219 + -4.0447 -3.0447 -2.0447..., 56.9553 57.9553 58.9553 + + +Kcur-15{64, 7, 5, 1} n=64 1.306365 + -1.3036 -0.3036 0.6964..., 59.6964 60.6964 61.6964 +RMS_NORM == +norm-15{64, 7, 5, 1} n=64 1.306365 + -1.3036 -0.3036 0.6964..., 59.6964 60.6964 61.6964 + + +norm-15{64, 7, 5, 1} n=64 9.258093 + 0.0620 1.0620 2.0620..., 61.0620 62.0620 63.0620 +blk.15.attn_k_norm.weight{64, 1, 1, 1} n=64 88.102585 + -0.0475 0.9525 1.9525..., 60.9525 61.9525 62.9525 +MUL == +Kcur-15{64, 7, 5, 1} n=64 9.258093 + 0.0620 1.0620 2.0620..., 61.0620 62.0620 63.0620 + + +Kcur-15{64, 7, 5, 1} n=64 9.258093 + 0.0620 1.0620 2.0620..., 61.0620 62.0620 63.0620 +RESHAPE == +Kcur-15 (reshaped){64, 5, 7, 1} n=64 9.258093 + 0.0620 1.0620 2.0620..., 61.0620 62.0620 63.0620 + + +Kcur-15 (reshaped){64, 5, 7, 1} n=64 9.258093 + 0.0620 1.0620 2.0620..., 61.0620 62.0620 63.0620 +inp_pos{7, 1, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 4.0000 5.0000 6.0000 +ROPE == +node_661{64, 5, 7, 1} n=64 9.258093 + 0.0620 1.0620 2.0620..., 61.0620 62.0620 63.0620 + + +node_661{64, 5, 7, 1} n=64 9.258093 + 0.0620 1.0620 2.0620..., 61.0620 62.0620 63.0620 +REPEAT == +node_662{64, 5, 28, 1} n=64 9.258093 + 0.0620 1.0620 2.0620..., 61.0620 62.0620 63.0620 + + +wqkv-15{1920, 7, 1, 1} n=1920 -160.707581 + -10.9706 -9.9706 -8.9706..., 1906.0293 1907.0293 1908.0293 +VIEW == +wqkv-15 (view){64, 7, 5, 1} n=1920 nan + nan nan nan..., nan nan nan + + +wqkv-15 (view){64, 7, 5, 1} n=64 nan + nan nan nan..., nan nan nan +CONT == +Vcur-15{64, 7, 5, 1} n=64 nan + nan nan nan..., nan nan nan + + +Vcur-15{64, 7, 5, 1} n=64 nan + nan nan nan..., nan nan nan +REPEAT == +node_665{64, 7, 
20, 1} n=64 nan + nan nan nan..., nan nan nan + + +cache_v_l15{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +v-15{32, 64, 10, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +cache_k_l15{40960, 1, 1, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., -0.0078 -0.0078 -0.0078 +VIEW == +k-15{64, 32, 10, 1} n=40960 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 + + +Qcur-15{64, 20, 7, 1} n=64 -1.191096 + -0.0088 0.9912 1.9912..., 60.9912 61.9912 62.9912 +PERMUTE == +q-15{64, 7, 20, 1} n=64 -1.191096 + -0.0088 0.9912 1.9912..., 60.9912 61.9912 62.9912 + + +k-15{64, 32, 10, 1} n=64 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +q-15{64, 7, 20, 1} n=64 -1.191096 + -0.0088 0.9912 1.9912..., 60.9912 61.9912 62.9912 +MUL_MAT == +kq-15{32, 7, 20, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 + + +kq-15{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +KQ_mask{32, 7, 1, 1} n=32 -inf + 0.0000 1.0000 2.0000..., 29.0000 30.0000 31.0000 +SOFT_MAX == +kq_soft_max_ext-15{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 + + +v-15{32, 64, 10, 1} n=32 0.000000 + 0.0000 0.0000 0.0000..., 0.0000 0.0000 0.0000 +kq_soft_max_ext-15{32, 7, 20, 1} n=32 1.000000 + 1.0000 2.0000 3.0000..., 30.0000 31.0000 32.0000 +MUL_MAT == +kqv-15{64, 7, 20, 1} n=32 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv-15{64, 7, 20, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +PERMUTE == +kqv_merged-15{64, 20, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 + + +kqv_merged-15{64, 20, 7, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 61.0000 62.0000 63.0000 +CONT == +kqv_merged_cont-15{1280, 7, 1, 1} n=64 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +blk.15.attn_output.weight{1280, 1280, 1, 1} n=1280 0.000018 + -0.1281 -0.1282 -0.1283..., -0.3179 -0.3181 
-0.3184 +kqv_merged_cont-15{1280, 7, 1, 1} n=1280 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +MUL_MAT == +kqv_out-15{1280, 7, 1, 1} n=1280 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +kqv_out-15{1280, 7, 1, 1} n=1280 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 +inp_out_ids{1, 1, 1, 1} n=1280 0.000000 + 6.0000 +GET_ROWS == +node_675{1280, 1, 1, 1} n=1280 0.000000 + 0.0000 1.0000 2.0000..., 1277.0000 1278.0000 1279.0000 + + +l_out-14{1280, 7, 1, 1} n=1280 -1397.746338 + -3.2346 -2.2346 -1.2346..., 1273.7654 1274.7654 1275.7654 +inp_out_ids{1, 1, 1, 1} n=1280 0.000000 + 6.0000 +GET_ROWS == +node_676{1280, 1, 1, 1} n=1280 -152.433640 + -12.3000 -11.3000 -10.3000..., 1264.7000 1265.7000 1266.7000 + + +node_675{1280, 1, 1, 1} n=1280 -152.433640 + -12.3000 -11.3000 -10.3000..., 1264.7000 1265.7000 1266.7000 +node_676{1280, 1, 1, 1} n=1280 -152.433640 + -12.3000 -11.3000 -10.3000..., 1264.7000 1265.7000 1266.7000 +ADD == +node_677{1280, 1, 1, 1} n=1280 -152.433640 + -12.3000 -11.3000 -10.3000..., 1264.7000 1265.7000 1266.7000 + + +node_677{1280, 1, 1, 1} n=1280 -152.433640 + -12.3000 -11.3000 -10.3000..., 1264.7000 1265.7000 1266.7000 +RMS_NORM == +norm-15{1280, 1, 1, 1} n=1280 -12.154848 + -0.9808 0.0192 1.0192..., 1276.0192 1277.0192 1278.0192 + + +norm-15{1280, 1, 1, 1} n=1280 15.259125 + -0.6830 0.3170 1.3170..., 1276.3170 1277.3170 1278.3170 +blk.15.ffn_norm.weight{1280, 1, 1, 1} n=1280 1395.995605 + 0.6963 1.6963 2.6963..., 1277.6963 1278.6963 1279.6963 +MUL == +ffn_norm-15{1280, 1, 1, 1} n=1280 15.259125 + -0.6830 0.3170 1.3170..., 1276.3170 1277.3170 1278.3170 + + +ffn_norm-15{1280, 1, 1, 1} n=1280 19.340216 + -0.8656 0.1344 1.1344..., 1276.1344 1277.1344 1278.1344 +RMS_NORM == +norm-15{1280, 1, 1, 1} n=1280 19.340216 + -0.8656 0.1344 1.1344..., 1276.1344 1277.1344 1278.1344 + + +norm-15{1280, 1, 1, 1} n=1280 11.352558 + -0.6028 0.3972 1.3972..., 1276.3972 1277.3972 1278.3972 
+blk.15.ffn_norm.weight{1280, 1, 1, 1} n=1280 1395.995605 + 0.6963 1.6963 2.6963..., 1277.6963 1278.6963 1279.6963 +MUL == +ffn_norm-15{1280, 1, 1, 1} n=1280 11.352558 + -0.6028 0.3972 1.3972..., 1276.3972 1277.3972 1278.3972 + + +blk.15.ffn_up.weight{1280, 10240, 1, 1} n=1280 -0.000000 + 0.0294 0.0294 0.0294..., 0.0706 0.0707 0.0707 +ffn_norm-15{1280, 1, 1, 1} n=1280 11.352558 + -0.6028 0.3972 1.3972..., 1276.3972 1277.3972 1278.3972 +MUL_MAT == +ffn_silu-15{10240, 1, 1, 1} n=1280 -3317.586914 + -1.3093 -0.3093 0.6907..., 10235.6904 10236.6904 10237.6904 + + +ffn_silu-15{10240, 1, 1, 1} n=10240 -3317.586914 + -1.3093 -0.3093 0.6907..., 10235.6904 10236.6904 10237.6904 +VIEW == +ffn_up-15 (view){5120, 1, 1, 1} n=10240 -3056.639160 + -1.3093 -0.3093 0.6907..., 5115.6904 5116.6904 5117.6904 + + +ffn_up-15 (view){5120, 1, 1, 1} n=5120 -3056.639160 + -1.3093 -0.3093 0.6907..., 5115.6904 5116.6904 5117.6904 +CONT == +ffn_up-15 (view) (cont){5120, 1, 1, 1} n=5120 -3056.639160 + -1.3093 -0.3093 0.6907..., 5115.6904 5116.6904 5117.6904 + + +ffn_up-15 (view) (cont){5120, 1, 1, 1} n=5120 1981.302979 + -0.2783 0.7217 1.7217..., 5116.7217 5117.7217 5118.7217 +SILU == +node_685{5120, 1, 1, 1} n=5120 1981.302979 + -0.2783 0.7217 1.7217..., 5116.7217 5117.7217 5118.7217 + + +ffn_silu-15{10240, 1, 1, 1} n=10240 -3317.586914 + -1.3093 -0.3093 0.6907..., 10235.6904 10236.6904 10237.6904 +VIEW == +ffn_up-15 (view){5120, 1, 1, 1} n=10240 -260.945343 + -1.3088 -0.3088 0.6912..., 5115.6914 5116.6914 5117.6914 + + +ffn_up-15 (view){5120, 1, 1, 1} n=5120 -260.945343 + -1.3088 -0.3088 0.6912..., 5115.6914 5116.6914 5117.6914 +CONT == +ffn_up-15 (view) (cont){5120, 1, 1, 1} n=5120 -260.945343 + -1.3088 -0.3088 0.6912..., 5115.6914 5116.6914 5117.6914 + + +node_685{5120, 1, 1, 1} n=5120 -56.715477 + 0.3643 1.3643 2.3643..., 5117.3643 5118.3643 5119.3643 +ffn_up-15 (view) (cont){5120, 1, 1, 1} n=5120 -260.945343 + -1.3088 -0.3088 0.6912..., 5115.6914 5116.6914 5117.6914 +MUL == 
+ffn_mul-15{5120, 1, 1, 1} n=5120 -56.715477 + 0.3643 1.3643 2.3643..., 5117.3643 5118.3643 5119.3643 + + +blk.15.ffn_down.weight{5120, 1280, 1, 1} n=5120 0.000000 + 0.0399 0.0399 0.0400..., 1.2744 1.2754 1.2764 +ffn_mul-15{5120, 1, 1, 1} n=5120 -56.715477 + 0.3643 1.3643 2.3643..., 5117.3643 5118.3643 5119.3643 +MUL_MAT == +ffn_out-15{1280, 1, 1, 1} n=5120 -964.760681 + -7.7104 -6.7104 -5.7104..., 1269.2896 1270.2896 1271.2896 + + +node_677{1280, 1, 1, 1} n=1280 -1117.194092 + -20.0105 -19.0105 -18.0105..., 1256.9895 1257.9895 1258.9895 +ffn_out-15{1280, 1, 1, 1} n=1280 -964.760681 + -7.7104 -6.7104 -5.7104..., 1269.2896 1270.2896 1271.2896 +ADD == +l_out-15{1280, 1, 1, 1} n=1280 -1117.194092 + -20.0105 -19.0105 -18.0105..., 1256.9895 1257.9895 1258.9895 + + +l_out-15{1280, 1, 1, 1} n=1280 -29.900330 + -0.5356 0.4644 1.4644..., 1276.4645 1277.4645 1278.4645 +RMS_NORM == +norm{1280, 1, 1, 1} n=1280 -29.900330 + -0.5356 0.4644 1.4644..., 1276.4645 1277.4645 1278.4645 + + +norm{1280, 1, 1, 1} n=1280 24.229685 + -0.5576 0.4424 1.4424..., 1276.4424 1277.4424 1278.4424 +output_norm.weight{1280, 1, 1, 1} n=1280 2148.671387 + 1.0412 2.0412 3.0412..., 1278.0413 1279.0413 1280.0413 +MUL == +result_norm{1280, 1, 1, 1} n=1280 24.229685 + -0.5576 0.4424 1.4424..., 1276.4424 1277.4424 1278.4424 + + +token_embd.weight{1280, 32000, 1, 1} n=1280 0.002177 + -0.0189 -0.0190 -0.0190..., -0.0456 -0.0456 -0.0457 +result_norm{1280, 1, 1, 1} n=1280 24.229685 + -0.5576 0.4424 1.4424..., 1276.4424 1277.4424 1278.4424 +MUL_MAT == +result_output{32000, 1, 1, 1} n=1280 -91464.156250 + -4.6149 -3.6149 -2.6149..., 31992.3848 31993.3848 31994.3848 + + diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 29b5f3b3c12c8..614242de7bbd7 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -27,54 +27,63 @@ static std::string ggml_ne_string(const ggml_tensor * t) { return str; } -static void 
ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { - GGML_ASSERT(n > 0); +static std::string ggml_nb_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string((t->nb[i]/ggml_element_size(t))); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +void print_tensor(const ggml_tensor * src0) { float sum = 0; - for (int64_t i3 = 0; i3 < ne[3]; i3++) { - printf(" [\n"); - for (int64_t i2 = 0; i2 < ne[2]; i2++) { - if (i2 == n && ne[2] > 2*n) { - printf(" ..., \n"); - i2 = ne[2] - n; - } - printf(" [\n"); - for (int64_t i1 = 0; i1 < ne[1]; i1++) { - if (i1 == n && ne[1] > 2*n) { - printf(" ..., \n"); - i1 = ne[1] - n; - } - printf(" ["); - for (int64_t i0 = 0; i0 < ne[0]; i0++) { - if (i0 == n && ne[0] > 2*n) { - printf("..., "); - i0 = ne[0] - n; - } - size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; - float v; - if (type == GGML_TYPE_F16) { - v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i); - } else if (type == GGML_TYPE_F32) { - v = *(float *) data + i; - } else if (type == GGML_TYPE_I32) { - v = (float) *(int32_t *) data + i; - } else if (type == GGML_TYPE_I16) { - v = (float) *(int16_t *) data + i; - } else if (type == GGML_TYPE_I8) { - v = (float) *(int8_t *) data + i; - } else { - GGML_ASSERT(false); - } - printf("%12.4f", v); - sum += v; - if (i0 < ne[0] - 1) printf(", "); - } - printf("],\n"); - } - printf(" ],\n"); + + const int64_t * ne = src0->ne; + int64_t n = 3; + ggml_type type = src0->type; + void * data = src0->data; + + + char *buf = static_cast(malloc(sizeof(char)*ne[0]*8)); + + char *buf2 = buf; + + for (int64_t i = 0; i < 1; i++) { + if (i == n) { + buf2 += sprintf(buf2, "..., "); + } + int64_t offset = i; + float v; + if (type == GGML_TYPE_F16) { + v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + offset); + } else if (type == GGML_TYPE_F32) { + v = *((float *) data + offset); + } else if (type == 
GGML_TYPE_I32) { + v = (float) *((int32_t *) data + offset); + } else if (type == GGML_TYPE_I16) { + v = (float) *(int16_t *) data + offset; + } else if (type == GGML_TYPE_I8) { + v = (float) *(int8_t *) data + offset; + } else { + GGML_ASSERT(false); + } + if (i < n) { + buf2 += sprintf(buf2, "%12.4f", v); } - printf(" ]\n"); - printf(" sum = %f\n", sum); + sum += v; } + int max_name_length = 15; + int max_dim_length = 15; + int max_str_length = 15; + printf("%-*.15s [0]=%.15g dim={%-*.15s} str={%-*.15s} [addr]=%lu\n", + max_name_length, src0->name, + sum, + max_dim_length, ggml_ne_string(src0).c_str(), + max_str_length, ggml_nb_string(src0).c_str(), + src0->data); } /** @@ -96,18 +105,18 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { if (ask) { return true; // Always retrieve data } - char src1_str[128] = {0}; + if (src0) { + print_tensor(src0); + } if (src1) { - sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); + print_tensor(src1); } - - printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); - + printf("%s ==\n", ggml_op_desc(t)); + if (t) { + print_tensor(t); + } + printf("\n\n"); // copy the data from the GPU memory if needed const bool is_host = ggml_backend_buffer_is_host(t->buffer); @@ -120,7 +129,6 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { if (!ggml_is_quantized(t->type)) { uint8_t * data = is_host ? 
(uint8_t *) t->data : cb_data->data.data(); - ggml_print_tensor(data, t->type, t->ne, t->nb, 3); } return true; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6d597bfd9d621..b6442bdb17d52 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -138,6 +138,7 @@ class MODEL_ARCH(IntEnum): COMMAND_R = auto() DBRX = auto() OLMO = auto() + OPENELM = auto() class MODEL_TENSOR(IntEnum): @@ -215,6 +216,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OPENELM: "openelm", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -725,6 +727,18 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.OPENELM: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_NORM, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index e5750d4191f6b..a83eb6a78289a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -24,6 +24,7 @@ class TensorNameMap: "backbone.embedding", # mamba "backbone.embeddings", # mamba-hf "transformer.in_out_embed", # Grok + "transformer.token_embeddings.weight", # openelm ), # Token type embeddings @@ -36,6 +37,7 @@ class TensorNameMap: "word_embeddings_layernorm", # bloom "embeddings.LayerNorm", # bert "emb_ln", # nomic-bert + "transformer.norm.weight", # openelm ), # Position embeddings @@ -68,6 +70,7 @@ class TensorNameMap: "model.norm_f", # mamba-qbert "backbone.norm_f", # mamba "transformer.rms_norm", # Grok + "transformer.norm.weight" # openelm ), # Rope frequencies @@ -97,6 +100,7 @@ class TensorNameMap: "backbone.layers.{bid}.norm", # mamba "transformer.decoder_layer.{bid}.rms_norm", # Grok 
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx + "transformer.layers.{bid}.attn_norm.weight" # openelm ), # Attention norm 2 @@ -117,7 +121,8 @@ class TensorNameMap: "h.{bid}.attn.c_attn", # gpt2 "transformer.h.{bid}.mixer.Wqkv", # phi2 "encoder.layers.{bid}.attn.Wqkv", # nomic-bert - "model.layers.{bid}.self_attn.qkv_proj" # phi3 + "model.layers.{bid}.self_attn.qkv_proj", # phi3 + "transformer.layers.{bid}.attn.qkv_proj.weight" # openelm ), # Attention query @@ -128,7 +133,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.q_proj", # gpt-j "model.layers.layers.{bid}.self_attn.q_proj", # plamo "model.layers.{bid}.attention.wq", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok + "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok ), # Attention key @@ -139,7 +144,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.k_proj", # gpt-j "model.layers.layers.{bid}.self_attn.k_proj", # plamo "model.layers.{bid}.attention.wk", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok + "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok ), # Attention value @@ -150,7 +155,8 @@ class TensorNameMap: "transformer.h.{bid}.attn.v_proj", # gpt-j "model.layers.layers.{bid}.self_attn.v_proj", # plamo "model.layers.{bid}.attention.wv", # internlm2 - "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok + "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok + ), # Attention output @@ -173,6 +179,7 @@ class TensorNameMap: "encoder.layers.{bid}.attn.out_proj", # nomic-bert "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx + "transformer.layers.{bid}.attn.out_proj.weight" # openelm ), # Attention output norm @@ -204,6 +211,7 @@ class TensorNameMap: "h.{bid}.ln_2", # gpt2 "model.layers.{bid}.ffn_norm", # internlm2 "transformer.decoder_layer.{bid}.rms_norm_2", # Grok + 
"transformer.layers.{bid}.ffn_norm.weight", # openelm ), MODEL_TENSOR.FFN_GATE_INP: ( @@ -240,6 +248,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.w3", # internlm2 "encoder.layers.{bid}.mlp.fc11", # nomic-bert "model.layers.{bid}.mlp.c_fc", # starcoder2 + "transformer.layers.{bid}.ffn.proj_1.weight" # openelm ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -299,6 +308,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.w2", # internlm2 "encoder.layers.{bid}.mlp.fc2", # nomic-bert "model.layers.{bid}.mlp.c_proj", # starcoder2 + "transformer.layers.{bid}.ffn.proj_2.weight" # openelm ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -317,6 +327,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.q_layernorm", # persimmon "model.layers.{bid}.self_attn.q_norm", # cohere "transformer.blocks.{bid}.attn.q_ln", # sea-lion + "transformer.layers.{bid}.attn.q_norm.weight" # openelm ), MODEL_TENSOR.ATTN_K_NORM: ( @@ -324,6 +335,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.k_layernorm", # persimmon "model.layers.{bid}.self_attn.k_norm", # cohere "transformer.blocks.{bid}.attn.k_ln", # sea-lion + "transformer.layers.{bid}.attn.k_norm.weight" ), MODEL_TENSOR.ROPE_FREQS: ( diff --git a/llama.cpp b/llama.cpp index 72c10ffc202fc..30af4cdbeba5f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -225,6 +225,7 @@ enum llm_arch { LLM_ARCH_COMMAND_R, LLM_ARCH_DBRX, LLM_ARCH_OLMO, + LLM_ARCH_OPENELM, LLM_ARCH_UNKNOWN, }; @@ -261,6 +262,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1028,6 +1030,21 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_OPENELM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_Q_NORM, 
"blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1775,8 +1792,10 @@ enum e_model { MODEL_22M, MODEL_33M, MODEL_109M, + MODEL_270M, MODEL_137M, MODEL_335M, + MODEL_450M, MODEL_0_5B, MODEL_1B, MODEL_2B, @@ -2416,8 +2435,8 @@ static bool llama_kv_cache_init( for (int i = 0; i < (int) n_layer; i++) { struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size*10); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size*10); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); @@ -4188,6 +4207,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_OPENELM: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 16: model.type = e_model::MODEL_270M; break; + case 20: model.type = e_model::MODEL_450M; break; + case 28: model.type = e_model::MODEL_1B; break; + case 36: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -4675,6 +4705,22 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); } } +float make_divisible( + double v, + int divisor = 8, + float min_value = 0.0 +) { + if (min_value == 0.0) { + min_value = divisor; + } + float rounded_v 
= int((v + divisor / 2) / divisor) * divisor; + float new_v = (min_value > rounded_v)? min_value : rounded_v; + if (new_v < 0.9 * v) { + new_v += divisor; + } + return new_v; +} + // Returns false if cancelled by progress_callback static bool llm_load_tensors( llama_model_loader & ml, @@ -5924,6 +5970,50 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_OPENELM: + { + std::vector num_kv_heads = {3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5}; + std::vector num_query_heads = {12, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20}; + std::vector ffn_multipliers = {0.5, 0.73, 0.97, 1.2, 1.43, 1.67, 1.9, 2.13, 2.37, 2.6, 2.83, 3.07, 3.3, 3.53, 3.77, 4.0}; + llama_hparams modified_hparams(hparams); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }); + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + for (int i = 0; i < n_layer; ++i) { + const int64_t n_head_k = num_kv_heads[i]; + const int64_t n_head_v = num_kv_heads[i]; + const int64_t n_head_kv = n_head_k + n_head_v; + const int64_t n_head = n_head_kv + num_query_heads[i]; + // const int64_t n_kv = (num_kv_heads[i]+num_kv_heads[i])*n_embd_head; + modified_hparams.n_head = n_head; + modified_hparams.n_embd_head_v = 64; + modified_hparams.n_embd_head_k = 64; + int64_t n_embd_head = modified_hparams.n_embd_head_v; + + modified_hparams.n_head_kv = n_head_kv; + const int64_t n_embd_gqa = n_embd_head * n_head; + const int64_t n_embd_k_gqa = modified_hparams.n_embd_k_gqa(); + const int64_t n_embd_v_gqa = modified_hparams.n_embd_v_gqa(); + const int64_t ffn_inter = make_divisible(n_embd*ffn_multipliers[i], 256); + + 
+ ggml_context* ctx_layer = ctx_for_layer(i); + ggml_context* ctx_split = ctx_for_layer_split(i); + auto& layer = model.layers[i]; + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head }); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head }); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd_head*n_head }); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head_kv*n_embd_head*2, n_embd }); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * ffn_inter }); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_inter, n_embd }); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -6154,6 +6244,7 @@ using llm_build_cb = std::functionne[0] / 2; + struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0)); + struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + + x0 = ggml_silu(ctx, x0); + cb(cur, "ffn_silu", il); + + cur = ggml_mul(ctx, x0, x1); + cb(cur, "ffn_mul", il); + } break; case LLM_FFN_GELU: { cur = ggml_gelu(ctx, cur); @@ -6545,6 +6649,8 @@ static struct ggml_tensor * llm_build_kqv( 0); cb(v, "v", il); + // assert(n_kv <= n_tokens); + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); cb(kqv, "kqv", il); @@ -10610,6 +10716,151 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_openelm() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const int64_t n_embd_head = 64; + // TODO: get this 
from config + std::vector num_kv_heads = {3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5}; + std::vector num_query_heads = {12, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 20, 20, 20, 20}; + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + struct ggml_tensor * inp_pos = build_inp_pos(); + llama_hparams modified_hparams(hparams); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + for (int il = 0; il < n_layer; ++il) { + auto residual = inpL; + // TODO: Want the offsets to be calculated with the num heads at layer level + // This doesn't work at the moment, comment out to test + const int64_t n_head_k = num_kv_heads[il]; + const int64_t n_head_v = num_kv_heads[il]; + const int64_t n_head_q = num_query_heads[il]; + int64_t n_head_kv = n_head_k+n_head_v; + const int64_t n_head = n_head_kv+ num_query_heads[il]; + // const int64_t n_kv = (num_kv_heads[il]+num_kv_heads[il])*n_embd_head; // This makes asserts fail + modified_hparams.n_head = n_head; + modified_hparams.n_head = 4*n_head_k; // somehow this works. Some places expect this to be groups*n_head_kv insteal of n_head. maybe this is the defintiion somewhere. 
+ modified_hparams.n_head_kv = num_query_heads[il]; + modified_hparams.n_embd_head_v = 64; + modified_hparams.n_embd_head_k = 64; + modified_hparams.n_embd = 64*n_head; + n_head_kv = modified_hparams.n_head_kv; + const int64_t n_embd_gqa = n_embd_head * n_head; + struct ggml_tensor * attn_q_norm = model.layers[il].attn_q_norm; + cb(attn_q_norm, "attn_q_norm", il); + struct ggml_tensor * attn_k_norm = model.layers[il].attn_k_norm; + cb(attn_k_norm, "attn_k_norm", il); + + // self-attention + { + + cb(model.layers[il].attn_norm, "attn_norm.weight", il); + struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, modified_hparams, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, cb, il); + cb(attn_norm_output, "attn_norm", il); + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + cb(model.layers[il].wqkv, "qkv_proj_weight", il); + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); // model.layers[il].wqkv -> might not be all 3 qkv + cb(cur, "qkv", il); + cur = ggml_reshape_3d(ctx0, cur, n_embd_head, n_tokens, n_head); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, num_query_heads[il], cur->nb[1], cur->nb[2]*num_query_heads[il], 0)); + cb(Qcur, "queries", il); + Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, n_head_k, cur->nb[1], cur->nb[2]*n_head_k, cur->nb[2]*num_query_heads[il])); + cb(Kcur, "keys", il); + Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, n_head_v, cur->nb[1], cur->nb[2]*n_head_v, cur->nb[2]*(num_query_heads[il]+n_head_k))); + cb(Vcur, "values", il); + // Q/K Layernorm + Qcur = llm_build_norm(ctx0, Qcur, modified_hparams, + model.layers[il].attn_q_norm, + NULL, + LLM_NORM_RMS, cb, il); + Kcur = llm_build_norm(ctx0, Kcur, modified_hparams, + model.layers[il].attn_k_norm, + NULL, + LLM_NORM_RMS, cb, il); + cb(Kcur, "keys", il); + Qcur = 
ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3)); + Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + Qcur = ggml_rope_custom( + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "queries", il); + Kcur = ggml_rope_custom( + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + + int64_t nev[GGML_MAX_DIMS] = {Vcur->ne[0], Vcur->ne[1], 4*Vcur->ne[2], Vcur->ne[3]}; + struct ggml_tensor * Vcur2 = ggml_new_tensor(ctx0, Vcur->type, GGML_MAX_DIMS, nev); + Vcur = ggml_repeat(ctx0, Vcur, Vcur2); + // Vcur = Vcur2; + cb(Vcur, "values", il); + + int64_t nek[GGML_MAX_DIMS] = {Kcur->ne[0], Kcur->ne[1], 4*Kcur->ne[2], Kcur->ne[3]}; + struct ggml_tensor * Kcur2 = ggml_new_tensor(ctx0, Kcur->type, GGML_MAX_DIMS, nek); + Kcur = ggml_repeat(ctx0, Kcur, Kcur2); + // Kcur = Kcur2; + cb(Kcur, "keys", il); + + Vcur = ggml_reshape_2d(ctx0, Vcur, 4*modified_hparams.n_embd_head_v*n_head_v, n_tokens); + Qcur = ggml_reshape_3d(ctx0, Qcur, modified_hparams.n_embd_head_v, n_head_q, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, 4*modified_hparams.n_embd_head_v, n_head_k, n_tokens); + cur = llm_build_kv(ctx0, model, modified_hparams, kv_self, gf, + model.layers[il].wo, nullptr, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, n_head_kv, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor* inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + cur = ggml_add(ctx0, cur, residual); + residual = cur; + { + + cur = llm_build_norm(ctx0, cur, modified_hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // Need to figure this out now + + cur = llm_build_ffn(ctx0, cur, + 
model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU2, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + } + cur = ggml_add(ctx0, residual, cur); + cb(cur, "l_out", il); + inpL = cur; + } + + cur = llm_build_norm(ctx0, cur, modified_hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -10823,6 +11074,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_olmo(); } break; + case LLM_ARCH_OPENELM: + { + result = llm.build_openelm(); + } break; default: GGML_ASSERT(false); } @@ -15166,7 +15421,7 @@ struct llama_context_params llama_context_default_params() { /*.type_v =*/ GGML_TYPE_F16, /*.logits_all =*/ false, /*.embeddings =*/ false, - /*.offload_kqv =*/ true, + /*.offload_kqv =*/ false, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, }; @@ -15332,7 +15587,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.defrag_thold = params.defrag_thold; cparams.embeddings = params.embeddings; - cparams.offload_kqv = params.offload_kqv; + cparams.offload_kqv = false; cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? 
hparams.n_ctx_train : params.n_ctx; @@ -15679,6 +15934,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_XVERSE: case LLM_ARCH_COMMAND_R: case LLM_ARCH_OLMO: + case LLM_ARCH_OPENELM: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 diff --git a/split_test.cpp b/split_test.cpp new file mode 100644 index 0000000000000..58a6ce0779166 --- /dev/null +++ b/split_test.cpp @@ -0,0 +1,40 @@ +#include +#include "ggml.h" + +int main() { + printf("split_test\n"); + // Initialization + struct ggml_init_params params = ggml_init_params{1024}; // Assuming this initializes memory + ggml_context *ctx = ggml_init(params); + + // Tensor Creation (Analogous to the PyTorch code) + int64_t size = 18 * 7 * 64; + int64_t dims[4] = {1, 18, 7, 64}; + ggml_tensor *tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, dims); + + // Initialize tensor data (Note: Simplified for this example) + float* tensor_data = (float*) tensor->data; + for (int i = 0; i < size; i++) { + tensor_data[i] = (float) i; + printf("%f", tensor_data[i]); + } + printf("\n"); + + // Reshaping and Transpose + // ... 
(You'll need ggml equivalents of reshape and transpose) + + // Splitting (We'll focus on this part) + int64_t num_q_heads = 12; + int64_t num_k_heads = 3; + int64_t num_v_heads = 3; + + ggml_tensor *a = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6); + ggml_tensor *b = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6); + ggml_tensor *c = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6); + + // Accessing elements (assuming ggml provides similar access) + float *a_data = (float*) a->data; + std::cout << a_data[0] << std::endl; + + return 0; +} diff --git a/tests/test-split.cpp b/tests/test-split.cpp new file mode 100644 index 0000000000000..5122757cf47f5 --- /dev/null +++ b/tests/test-split.cpp @@ -0,0 +1,271 @@ +#include "llama.h" +#include "common.h" +#include "console.h" + +#include +#include +#include +#include +#include + +//static const std::map> & k_tests() { +// static std::map> _k_tests = { +// { "" , { }, }, +// { " " , { 220, }, }, +// { " " , { 256, }, }, +// { " " , { 262, }, }, +// { "\t" , { 197, }, }, +// { "\n" , { 198, }, }, +// { "\n\n" , { 271, }, }, +// { "\n\n\n" , { 1432, }, }, +// { "\t\n" , { 1602, }, }, +// { "Hello world" , { 9906, 1917, }, }, +// { " Hello world" , { 22691, 1917, }, }, +// { "Hello World" , { 9906, 4435, }, }, +// { " Hello World" , { 22691, 4435, }, }, +// { " Hello World!" , { 22691, 4435, 0, }, }, +// { "Hello, world!" , { 9906, 11, 1917, 0, }, }, +// { " Hello, world!" 
, { 22691, 11, 1917, 0, }, },
+//        { " this is 🦙.cpp"        , { 420, 374, 11410, 99, 247, 13, 11055, }, },
+//        { "w048 7tuijk dsdfhu"    , { 86, 23904, 220, 22, 83, 2005, 42908, 11729, 3013, 17156, }, },
+//        { "нещо на Български"     , { 79862, 102118, 13373, 64571, 34694, 3114, 112203, 80112, }, },
+//        { "កាន់តែពិសេសអាចខលចេញ"   , { 21549, 222, 98629, 241, 45358, 233, 21549, 237, 45358, 224, 21549, 244, 21549, 115, 21549, 253, 45358, 223, 21549, 253, 21549, 95, 98629, 227, 21549, 223, 21549, 249, 21549, 227, 45358, 223, 21549, 231, }, },
+//        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 9468, 248, 222, 320, 8416, 8, 27623, 114, 102470, 9468, 234, 104, 31643, 320, 36773, 100166, 98634, 8, 26602, 227, 320, 3323, 43465, 430, 706, 1202, 1866, 4037, 8, }, },
+//        { "Hello"                 , { 9906, }, },
+//        { " Hello"                , { 22691, }, },
+//        { "  Hello"               , { 220, 22691, }, },
+//        { "   Hello"              , { 256, 22691, }, },
+//        { "    Hello"             , { 262, 22691, }, },
+//        { "    Hello\n    Hello"  , { 262, 22691, 198, 262, 22691, }, },
+//        { " ("                    , { 320, }, },
+//        { "\n ="                  , { 198, 284, }, },
+//        { "' era"                 , { 6, 11639, }, },
+//        { "Hello, y'all! 
How are you 😁 ?ζˆ‘ζƒ³εœ¨appleε·₯作1314151倩~", { 9906, 11, 379, 65948, 0, 2650, 527, 499, 27623, 223, 949, 37046, 101067, 19000, 23182, 102301, 9263, 18136, 16, 36827, 21909, }, }, +// { "3" , { 18, }, }, +// { "33" , { 1644, }, }, +// { "333" , { 8765, }, }, +// { "3333" , { 8765, 18, }, }, +// { "33333" , { 8765, 1644, }, }, +// { "333333" , { 8765, 8765, }, }, +// { "3333333" , { 8765, 8765, 18, }, }, +// { "33333333" , { 8765, 8765, 1644, }, }, +// { "333333333" , { 8765, 8765, 8765, }, }, +// }; +// +// return _k_tests; +//} + +static std::map> read_tests(const std::string & fname_inp, const std::string & fname_out) { + std::map> tests; + + std::ifstream ifs_inp(fname_inp); + if (!ifs_inp) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str()); + return tests; + } + + std::string sraw((std::istreambuf_iterator(ifs_inp)), std::istreambuf_iterator()); + + std::ifstream ifs_out(fname_out); + if (!ifs_out) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return tests; + } + + std::vector sout; + for (std::string line; std::getline(ifs_out, line);) { + sout.push_back(line); + } + + const std::string sep = "\n__ggml_vocab_test__\n"; + + std::vector sinp; + + size_t pos = 0; + while (pos < sraw.size()) { + const size_t next = sraw.find(sep, pos); + if (next == std::string::npos) { + sinp.push_back(sraw.substr(pos)); + break; + } + sinp.push_back(sraw.substr(pos, next - pos)); + pos = next + sep.size(); + } + + if (sinp.size() != sout.size()) { + fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__); + return tests; + } + + for (size_t i = 0; i < sinp.size(); ++i) { + const std::string & s = sinp[i]; + const std::string & o = string_strip(sout[i]); + + std::vector toks; + + size_t pos = 0; + while (pos < o.size()) { + size_t next = o.find(' ', pos); + if (next == std::string::npos) { + next = o.size(); + } + const std::string stok = o.substr(pos, 
next - pos); + toks.push_back(std::stoi(stok)); + pos = next + 1; + } + + tests[s] = toks; + } + + return tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + const std::string fname_inp = fname + ".inp"; + const std::string fname_out = fname + ".out"; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(); + + // load the vocab + { + auto mparams = llama_model_default_params(); + + mparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), mparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + auto cparams = llama_context_default_params(); + + ctx = llama_new_context_with_model(model, cparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + +#ifdef _WIN32 + // We need this for unicode console support + console::init(false, false); + atexit([]() { console::cleanup(); }); +#endif + + bool success = true; + + const auto k_tests = read_tests(fname_inp, fname_out); + + if (k_tests.empty()) { + fprintf(stderr, "%s : error: no tests found\n", __func__); + return 1; + } + + const bool add_special = false; + + for (const auto & test_kv : k_tests) { + const std::vector res = llama_tokenize(ctx, test_kv.first, add_special); + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str()); + printf("tok: "); + for (const auto & tok : res) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res.size() == test_kv.second.size(); + for (int i = 0; i < (int) res.size() && correct; ++i) { 
+ if (test_kv.second[i] != res[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize_bpe(ctx, res).c_str(), + llama_detokenize_bpe(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res) { + fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, add_special); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector{tok})) << "'" << std::endl; + } + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + printf("\n"); + printf("Tests %s\n", success ? "passed" : "failed"); + + return success ? 0 : 3; +}