
add solar pro support #9541

Closed
wants to merge 1 commit
19 changes: 19 additions & 0 deletions convert_hf_to_gguf.py
@@ -4079,6 +4079,25 @@ def prepare_tensors(self):

        super().prepare_tensors()

@Model.register("SolarForCausalLM")
class SolarModel(LlamaModel):
    model_arch = gguf.MODEL_ARCH.SOLAR

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        for i, bskcn in enumerate(self.hparams[k] for k in self.hparams.keys() if k.startswith("bskcn_") and k != 'bskcn_tv'):
            # store the skip connections as a layer index where a non-zero value indicates a skip connection
            # this approach simplifies lookup at inference time
            self.gguf_writer.add_block_skip_connection(i, [1 if n in bskcn else 0 for n in range(self.block_count)])
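For illustration only (hypothetical values, not taken from the actual Solar Pro config.json): a hyperparameter such as bskcn_1 = [7, 19] with block_count = 32 would be written as a 32-entry array that is 1 at indices 7 and 19 and 0 everywhere else, roughly:

    # hypothetical: bskcn_1 = [7, 19], block_count = 32
    self.gguf_writer.add_block_skip_connection(0, [0] * 7 + [1] + [0] * 11 + [1] + [0] * 12)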
Comment on lines +4089 to +4092
Collaborator


This assumes bskcn_{n} are in the correct order in config.json. Why not instead iterate them by their names?
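A minimal sketch of that alternative (not code from this PR; it assumes every relevant hyperparameter is literally named bskcn_<number> and orders them by the numeric suffix instead of relying on the key order of config.json):

    bskcn_keys = sorted(
        (k for k in self.hparams if k.startswith("bskcn_") and k != "bskcn_tv"),
        key=lambda k: int(k.rsplit("_", 1)[1]),  # sort by the numeric suffix, not lexicographically
    )
    for i, key in enumerate(bskcn_keys):
        self.gguf_writer.add_block_skip_connection(
            i, [1 if n in self.hparams[key] else 0 for n in range(self.block_count)]
        )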


    def prepare_tensors(self):
        if bskcn_tv := self.find_hparam(['bskcn_tv'], optional=True):
            # use bskcn_tv[1] for inference since bskcn_tv[0] is for training
            self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.BSKCN_TV), np.array([bskcn_tv[1], 1 - bskcn_tv[1]], dtype=np.float32))

        super().prepare_tensors()
Comment on lines +4094 to +4099
Collaborator

@compilade Oct 6, 2024


I think this should override generate_extra_tensors instead of prepare_tensors. Otherwise LoRA conversion will not work properly, at least since #9396.
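A rough sketch of what that could look like (not code from this PR; it assumes generate_extra_tensors yields (name, tensor) pairs as the other converters in convert_hf_to_gguf.py do, reuses the file's existing torch/gguf/Iterable imports, and swaps the np.array above for a torch tensor so the base class can convert it):

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        if bskcn_tv := self.find_hparam(['bskcn_tv'], optional=True):
            # use bskcn_tv[1] for inference since bskcn_tv[0] is for training
            yield (
                self.format_tensor_name(gguf.MODEL_TENSOR.BSKCN_TV),
                torch.tensor([bskcn_tv[1], 1 - bskcn_tv[1]], dtype=torch.float32),
            )
        # keep any extra tensors the parent class (e.g. rope factors) would emit
        yield from super().generate_extra_tensors()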



@Model.register("GraniteForCausalLM")
class GraniteModel(LlamaModel):
49 changes: 35 additions & 14 deletions gguf-py/gguf/constants.py
@@ -101,20 +101,21 @@ class LLM:
        EMBEDDING_SCALE = "{arch}.embedding_scale"

    class Attention:
-        HEAD_COUNT        = "{arch}.attention.head_count"
-        HEAD_COUNT_KV     = "{arch}.attention.head_count_kv"
-        MAX_ALIBI_BIAS    = "{arch}.attention.max_alibi_bias"
-        CLAMP_KQV         = "{arch}.attention.clamp_kqv"
-        KEY_LENGTH        = "{arch}.attention.key_length"
-        VALUE_LENGTH      = "{arch}.attention.value_length"
-        LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
-        LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
-        CAUSAL            = "{arch}.attention.causal"
-        Q_LORA_RANK       = "{arch}.attention.q_lora_rank"
-        KV_LORA_RANK      = "{arch}.attention.kv_lora_rank"
-        REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
-        SLIDING_WINDOW    = "{arch}.attention.sliding_window"
-        SCALE             = "{arch}.attention.scale"
+        HEAD_COUNT            = "{arch}.attention.head_count"
+        HEAD_COUNT_KV         = "{arch}.attention.head_count_kv"
+        MAX_ALIBI_BIAS        = "{arch}.attention.max_alibi_bias"
+        CLAMP_KQV             = "{arch}.attention.clamp_kqv"
+        KEY_LENGTH            = "{arch}.attention.key_length"
+        VALUE_LENGTH          = "{arch}.attention.value_length"
+        LAYERNORM_EPS         = "{arch}.attention.layer_norm_epsilon"
+        LAYERNORM_RMS_EPS     = "{arch}.attention.layer_norm_rms_epsilon"
+        CAUSAL                = "{arch}.attention.causal"
+        Q_LORA_RANK           = "{arch}.attention.q_lora_rank"
+        KV_LORA_RANK          = "{arch}.attention.kv_lora_rank"
+        REL_BUCKETS_COUNT     = "{arch}.attention.relative_buckets_count"
+        SLIDING_WINDOW        = "{arch}.attention.sliding_window"
+        SCALE                 = "{arch}.attention.scale"
+        BLOCK_SKIP_CONNECTION = "{arch}.attention.block_skip_connection.{n}"

    class Rope:
        DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -235,6 +236,7 @@ class MODEL_ARCH(IntEnum):
    NEMOTRON = auto()
    EXAONE = auto()
    GRANITE = auto()
    SOLAR = auto()


class MODEL_TENSOR(IntEnum):
@@ -342,6 +344,7 @@ class MODEL_TENSOR(IntEnum):
    ENC_FFN_DOWN = auto()
    ENC_FFN_UP = auto()
    ENC_OUTPUT_NORM = auto()
    BSKCN_TV = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -392,6 +395,7 @@ class MODEL_TENSOR(IntEnum):
    MODEL_ARCH.NEMOTRON: "nemotron",
    MODEL_ARCH.EXAONE: "exaone",
    MODEL_ARCH.GRANITE: "granite",
    MODEL_ARCH.SOLAR: "solar",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -499,6 +503,7 @@ class MODEL_TENSOR(IntEnum):
    MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
    MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
    MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
    MODEL_TENSOR.BSKCN_TV: "bskcn_tv",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -521,6 +526,7 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.FFN_GATE_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
        MODEL_TENSOR.BSKCN_TV,
    ],
    MODEL_ARCH.GROK: [
        MODEL_TENSOR.TOKEN_EMBD,
@@ -1242,6 +1248,21 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.SOLAR: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.BSKCN_TV,
    ],
    # TODO
}

3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
@@ -712,6 +712,9 @@ def add_sliding_window(self, value: int) -> None:
    def add_attention_scale(self, value: float) -> None:
        self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)

    def add_block_skip_connection(self, n: int, value: list[int]) -> None:
        self.add_array(Keys.Attention.BLOCK_SKIP_CONNECTION.format(arch=self.arch, n=n), value)

    def add_pooling_type(self, value: PoolingType) -> None:
        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
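For reference, combined with the "{arch}.attention.block_skip_connection.{n}" key template added in constants.py and the "solar" architecture name, a call like the following hypothetical example would end up under the metadata key solar.attention.block_skip_connection.0:

    # hypothetical 4-layer example; writer is a GGUFWriter created with arch="solar"
    writer.add_block_skip_connection(0, [0, 1, 0, 0])  # -> "solar.attention.block_skip_connection.0"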
