diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 4b4c129b0..5b84f6753 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -76,6 +76,72 @@ jobs:
name: llava-bin-linux-${{ matrix.build }}-x64.so
if-no-files-found: error
+ compile-musl:
+ name: Compile (musl)
+ strategy:
+ fail-fast: true
+ matrix:
+ include:
+ - build: 'noavx'
+ defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
+ - build: 'avx2'
+ defines: ''
+ - build: 'avx'
+ defines: '-DGGML_AVX2=OFF'
+ - build: 'avx512'
+ defines: '-DGGML_AVX512=ON'
+ runs-on: ubuntu-20.04
+ container:
+ image: alpine:latest
+ steps:
+ - name: Install dependencies
+ run: |
+ apk update && apk add --no-cache \
+ build-base \
+ cmake \
+ git \
+ linux-headers \
+ g++
+ - uses: actions/checkout@v4
+ with:
+ repository: ggerganov/llama.cpp
+ fetch-depth: 0
+ ref: '${{ github.event.inputs.llama_cpp_commit }}'
+ - name: Build
+ id: cmake_build_musl
+ run: |
+ mkdir build
+ cd build
+ cmake .. ${{ env.COMMON_DEFINE }} ${{ matrix.defines }}
+ cmake --build . --config Release -j $(nproc)
+ ls -R
+ - uses: actions/upload-artifact@v4
+ with:
+ path: ./build/bin/libllama.so
+ name: llama-bin-musl-${{ matrix.build }}-x64.so
+ if-no-files-found: error
+ - uses: actions/upload-artifact@v4
+ with:
+ path: ./build/bin/libggml.so
+ name: ggml-bin-musl-${{ matrix.build }}-x64.so
+ if-no-files-found: error
+ - uses: actions/upload-artifact@v4
+ with:
+ path: ./build/bin/libggml-base.so
+ name: ggml-base-bin-musl-${{ matrix.build }}-x64.so
+ if-no-files-found: error
+ - uses: actions/upload-artifact@v4
+ with:
+ path: ./build/bin/libggml-cpu.so
+ name: ggml-cpu-bin-musl-${{ matrix.build }}-x64.so
+ if-no-files-found: error
+ - name: Upload Llava
+ uses: actions/upload-artifact@v4
+ with:
+ path: ./build/bin/libllava_shared.so
+ name: llava-bin-musl-${{ matrix.build }}-x64.so
+ if-no-files-found: error
+
compile-windows:
name: Compile (Windows)
strategy:
@@ -519,6 +585,7 @@ jobs:
if: ${{ always() }}
needs: [
"compile-linux",
+ "compile-musl",
"compile-windows",
"compile-vulkan",
"compile-cublas",
@@ -534,7 +601,7 @@ jobs:
- name: Rearrange Files
run: |
# Make all directories at once
- mkdir --parents deps/{noavx,avx,avx2,avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64}
+ mkdir --parents deps/{noavx,avx,avx2,avx512,musl-noavx,musl-avx,musl-avx2,musl-avx512,osx-arm64,osx-x64,osx-x64-rosetta2,cu11.7.1,cu12.2.0,vulkan,android-arm64-v8a,android-x86,android-x86_64}
# Linux
cp artifacts/ggml-bin-linux-noavx-x64.so/libggml.so deps/noavx/libggml.so
@@ -561,6 +628,31 @@ jobs:
cp artifacts/llama-bin-linux-avx512-x64.so/libllama.so deps/avx512/libllama.so
cp artifacts/llava-bin-linux-avx512-x64.so/libllava_shared.so deps/avx512/libllava_shared.so
+ # Musl
+ cp artifacts/ggml-bin-musl-noavx-x64.so/libggml.so deps/musl-noavx/libggml.so
+ cp artifacts/ggml-base-bin-musl-noavx-x64.so/libggml-base.so deps/musl-noavx/libggml-base.so
+ cp artifacts/ggml-cpu-bin-musl-noavx-x64.so/libggml-cpu.so deps/musl-noavx/libggml-cpu.so
+ cp artifacts/llama-bin-musl-noavx-x64.so/libllama.so deps/musl-noavx/libllama.so
+ cp artifacts/llava-bin-musl-noavx-x64.so/libllava_shared.so deps/musl-noavx/libllava_shared.so
+
+ cp artifacts/ggml-bin-musl-avx-x64.so/libggml.so deps/musl-avx/libggml.so
+ cp artifacts/ggml-base-bin-musl-avx-x64.so/libggml-base.so deps/musl-avx/libggml-base.so
+ cp artifacts/ggml-cpu-bin-musl-avx-x64.so/libggml-cpu.so deps/musl-avx/libggml-cpu.so
+ cp artifacts/llama-bin-musl-avx-x64.so/libllama.so deps/musl-avx/libllama.so
+ cp artifacts/llava-bin-musl-avx-x64.so/libllava_shared.so deps/musl-avx/libllava_shared.so
+
+ cp artifacts/ggml-bin-musl-avx2-x64.so/libggml.so deps/musl-avx2/libggml.so
+ cp artifacts/ggml-base-bin-musl-avx2-x64.so/libggml-base.so deps/musl-avx2/libggml-base.so
+ cp artifacts/ggml-cpu-bin-musl-avx2-x64.so/libggml-cpu.so deps/musl-avx2/libggml-cpu.so
+ cp artifacts/llama-bin-musl-avx2-x64.so/libllama.so deps/musl-avx2/libllama.so
+ cp artifacts/llava-bin-musl-avx2-x64.so/libllava_shared.so deps/musl-avx2/libllava_shared.so
+
+ cp artifacts/ggml-bin-musl-avx512-x64.so/libggml.so deps/musl-avx512/libggml.so
+ cp artifacts/ggml-base-bin-musl-avx512-x64.so/libggml-base.so deps/musl-avx512/libggml-base.so
+ cp artifacts/ggml-cpu-bin-musl-avx512-x64.so/libggml-cpu.so deps/musl-avx512/libggml-cpu.so
+ cp artifacts/llama-bin-musl-avx512-x64.so/libllama.so deps/musl-avx512/libllama.so
+ cp artifacts/llava-bin-musl-avx512-x64.so/libllava_shared.so deps/musl-avx512/libllava_shared.so
+
# Windows
cp artifacts/ggml-bin-win-noavx-x64.dll/ggml.dll deps/noavx/ggml.dll
cp artifacts/ggml-base-bin-win-noavx-x64.dll/ggml-base.dll deps/noavx/ggml-base.dll
diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
index 6efd44f7b..041a2cf88 100644
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -33,9 +33,10 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)
{
ContextSize = config.ContextSize,
GpuLayerCount = config.GpuLayerCount ?? 20,
- Embeddings = true,
+
PoolingType = LLamaPoolingType.Mean,
};
+
_weights = LLamaWeights.LoadFromFile(@params);
_embedder = new LLamaEmbedder(_weights, @params);
_ownsWeights = true;
diff --git a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
index a12ad04ee..5273215aa 100644
--- a/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
+++ b/LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
@@ -22,7 +22,7 @@ public ITextTokenizerTests(ITestOutputHelper testOutputHelper)
_testOutputHelper = testOutputHelper;
_infParams = new() { AntiPrompts = ["\n\n"] };
- _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams };
+ _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams, ContextSize = 512 };
testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}");
}
diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs
index 82157a17f..e28b55ce0 100644
--- a/LLama.Unittest/LLamaContextTests.cs
+++ b/LLama.Unittest/LLamaContextTests.cs
@@ -14,6 +14,10 @@ public LLamaContextTests()
var @params = new ModelParams(Constants.GenerativeModelPath2)
{
ContextSize = 128,
+ BatchSize = 8,
+ UBatchSize = 8,
+ SeqMax = 1,
+ VocabOnly = false,
GpuLayerCount = Constants.CIGpuLayerCount,
};
_weights = LLamaWeights.LoadFromFile(@params);
@@ -84,6 +88,11 @@ public void TokenizeEmpty()
[Fact]
public void SaveLoadState()
{
+ // Make sure there's something in the context worth saving
+ var batch = new LLamaBatch();
+ batch.Add(17, 0, LLamaSeqId.Zero, true);
+ _context.Decode(batch);
+
using var state1 = _context.GetState();
var stream = new MemoryStream();
@@ -99,6 +108,11 @@ public void SaveLoadState()
[Fact]
public async Task SaveLoadStateAsync()
{
+ // Make sure there's something in the context worth saving
+ var batch = new LLamaBatch();
+ batch.Add(17, 0, LLamaSeqId.Zero, true);
+ _context.Decode(batch);
+
using var state1 = _context.GetState();
var stream = new MemoryStream();
diff --git a/LLama/Batched/Conversation.cs b/LLama/Batched/Conversation.cs
index c9a374549..8b7ff2694 100644
--- a/LLama/Batched/Conversation.cs
+++ b/LLama/Batched/Conversation.cs
@@ -128,7 +128,7 @@ public Conversation Fork()
_forked = true;
// Assign tokens to the new sequence
- NativeApi.llama_kv_cache_seq_cp(Executor.Context.NativeHandle, ConversationId, c.ConversationId, 0, _end);
+ Executor.Context.NativeHandle.KvCacheSequenceCopy(ConversationId, c.ConversationId, 0, _end);
return c;
}
diff --git a/LLama/LLamaExecutorBase.cs b/LLama/LLamaExecutorBase.cs
index 48060dbaf..995cb3e4e 100644
--- a/LLama/LLamaExecutorBase.cs
+++ b/LLama/LLamaExecutorBase.cs
@@ -193,8 +193,8 @@ protected virtual void HandleRunOutOfContext(int tokensToKeep)
int n_left = _pastTokensCount - tokensToKeep;
int n_discard = n_left / 2;
- NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
- NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard);
+ NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
+ NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensToKeep + n_discard, _pastTokensCount, -n_discard);
_pastTokensCount -= n_discard;
// stop saving session if we run out of context
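
For context (not part of the patch): the renamed low-level calls above also have managed wrappers on `SafeLLamaContextHandle`, which this patch updates further down. A minimal sketch of the same context-shift logic expressed through those wrappers, assuming `handle`, `tokensToKeep` and `pastTokensCount` come from the surrounding executor:

```csharp
// Sketch only: equivalent of the shift above via the SafeLLamaContextHandle wrappers.
void ShiftContext(SafeLLamaContextHandle handle, int tokensToKeep, ref int pastTokensCount)
{
    var n_left = pastTokensCount - tokensToKeep;
    var n_discard = n_left / 2;

    // Drop the oldest half of the overflowing tokens...
    handle.KvCacheRemove(LLamaSeqId.Zero, tokensToKeep, tokensToKeep + n_discard);
    // ...then slide the remaining tokens back to close the gap (throws if the cache cannot shift).
    handle.KvCacheSequenceAdd(LLamaSeqId.Zero, tokensToKeep + n_discard, pastTokensCount, -n_discard);

    pastTokensCount -= n_discard;
}
```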
diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index 76292aaf5..22a3e04e1 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -200,6 +200,7 @@
PreserveNewest
runtimes/linux-x64/native/avx512/libggml-cpu.so
+
PreserveNewest
@@ -253,6 +254,75 @@
PreserveNewest
runtimes/linux-x64/native/vulkan/libggml-vulkan.so
+
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/noavx/libllama.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/noavx/libggml.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/noavx/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/noavx/libggml-cpu.so
+
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx/libllama.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx/libggml.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx/libggml-cpu.so
+
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx2/libllama.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx2/libggml.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx2/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx2/libggml-cpu.so
+
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx512/libllama.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx512/libggml.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx512/libggml-base.so
+
+
+ PreserveNewest
+ runtimes/linux-musl-x64/native/avx512/libggml-cpu.so
+
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
index e960a414e..618a902e6 100644
--- a/LLama/LLamaSharp.csproj
+++ b/LLama/LLamaSharp.csproj
@@ -56,7 +56,7 @@
- 5783575c9d99
+ be7c3034108473be
diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs
index 43b20b15d..8aa705062 100644
--- a/LLama/LLamaStatelessExecutor.cs
+++ b/LLama/LLamaStatelessExecutor.cs
@@ -155,8 +155,8 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams
var n_left = n_past - tokensKeep;
var n_discard = n_left / 2;
- NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard);
- NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);
+ NativeApi.llama_kv_self_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard);
+ NativeApi.llama_kv_self_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard);
n_past -= n_discard;
}
diff --git a/LLama/Native/LLamaKvCache.cs b/LLama/Native/LLamaKvCache.cs
new file mode 100644
index 000000000..4a402f9ed
--- /dev/null
+++ b/LLama/Native/LLamaKvCache.cs
@@ -0,0 +1,10 @@
+namespace LLama.Native;
+
+/// <summary>
+/// C# representation of llama_kv_cache
+/// </summary>
+/// <remarks>llama_kv_cache</remarks>
+internal struct LLamaKvCacheNative
+{
+
+}
\ No newline at end of file
diff --git a/LLama/Native/LLamaVocabPreType.cs b/LLama/Native/LLamaVocabPreType.cs
index 7b84783df..384ba0391 100644
--- a/LLama/Native/LLamaVocabPreType.cs
+++ b/LLama/Native/LLamaVocabPreType.cs
@@ -4,6 +4,7 @@ namespace LLama.Native;
///
///
/// llama_vocab_pre_type
+// ReSharper disable InconsistentNaming
internal enum LLamaVocabPreType
{
Default = 0,
@@ -36,4 +37,6 @@ internal enum LLamaVocabPreType
CHAMELEON = 26,
MINERVA = 27,
DEEPSEEK3_LLM = 28,
-}
\ No newline at end of file
+ GPT4O = 29,
+}
+// ReSharper restore InconsistentNaming
\ No newline at end of file
diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs
index 13e68be4d..b0e8a792a 100644
--- a/LLama/Native/Load/NativeLibraryUtils.cs
+++ b/LLama/Native/Load/NativeLibraryUtils.cs
@@ -218,10 +218,22 @@ public static void GetPlatformPathParts(OSPlatform platform, out string os, out
if (platform == OSPlatform.Linux)
{
- os = "linux-x64";
- fileExtension = ".so";
- libPrefix = "lib";
- return;
+ if (RuntimeInformation.RuntimeIdentifier.ToLower().StartsWith("alpine"))
+ {
+ // alpine linux distro
+ os = "linux-musl-x64";
+ fileExtension = ".so";
+ libPrefix = "lib";
+ return;
+ }
+ else
+ {
+ // other linux distro
+ os = "linux-x64";
+ fileExtension = ".so";
+ libPrefix = "lib";
+ return;
+ }
}
if (platform == OSPlatform.OSX)
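
A minimal sketch (not part of the patch) of what the new branch resolves to, assuming it runs in an Alpine container where `RuntimeInformation.RuntimeIdentifier` starts with "alpine":

```csharp
// Sketch only: expected outputs of GetPlatformPathParts on Linux after this change.
NativeLibraryUtils.GetPlatformPathParts(OSPlatform.Linux, out var os, out var ext, out var prefix);
// On Alpine (musl):       os == "linux-musl-x64", ext == ".so", prefix == "lib"
// On glibc distributions: os == "linux-x64",      ext == ".so", prefix == "lib"
```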
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index f17a1bd5e..4c788b7a0 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -273,7 +273,7 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern int llama_get_kv_cache_token_count(SafeLLamaContextHandle ctx);
+ internal static extern int llama_kv_self_n_tokens(SafeLLamaContextHandle ctx);
///
/// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
@@ -281,14 +281,14 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern int llama_get_kv_cache_used_cells(SafeLLamaContextHandle ctx);
+ internal static extern int llama_kv_self_used_cells(SafeLLamaContextHandle ctx);
///
/// Clear the KV cache. Both cell info is erased and KV data is zeroed
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern void llama_kv_cache_clear(SafeLLamaContextHandle ctx);
+ internal static extern void llama_kv_self_clear(SafeLLamaContextHandle ctx);
///
/// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
@@ -300,7 +300,7 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
/// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.U1)]
- public static extern bool llama_kv_cache_seq_rm(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1);
+ public static extern bool llama_kv_self_seq_rm(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1);
///
/// Copy all tokens that belong to the specified sequence to another sequence
@@ -312,7 +312,7 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern void llama_kv_cache_seq_cp(SafeLLamaContextHandle ctx, LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1);
+ internal static extern void llama_kv_self_seq_cp(SafeLLamaContextHandle ctx, LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1);
///
/// Removes all tokens that do not belong to the specified sequence
@@ -320,13 +320,13 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern void llama_kv_cache_seq_keep(SafeLLamaContextHandle ctx, LLamaSeqId seq);
+ internal static extern void llama_kv_self_seq_keep(SafeLLamaContextHandle ctx, LLamaSeqId seq);
///
/// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
/// If the KV cache is RoPEd, the KV data is updated accordingly:
/// - lazily on next llama_decode()
- /// - explicitly with llama_kv_cache_update()
+ /// - explicitly with llama_kv_self_update()
///
///
///
@@ -334,13 +334,13 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern void llama_kv_cache_seq_add(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta);
+ internal static extern void llama_kv_self_seq_add(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta);
///
/// Integer division of the positions by factor of `d > 1`
/// If the KV cache is RoPEd, the KV data is updated accordingly:
/// - lazily on next llama_decode()
- /// - explicitly with llama_kv_cache_update()
+ /// - explicitly with llama_kv_self_update()
///
/// p0 < 0 : [0, p1]
///
@@ -352,7 +352,7 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern void llama_kv_cache_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d);
+ internal static extern void llama_kv_self_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d);
///
/// Returns the largest position present in the KV cache for the specified sequence
@@ -361,7 +361,7 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern LLamaPos llama_kv_cache_seq_pos_max(SafeLLamaContextHandle ctx, LLamaSeqId seq);
+ internal static extern LLamaPos llama_kv_self_seq_pos_max(SafeLLamaContextHandle ctx, LLamaSeqId seq);
///
/// Allocates a batch of tokens on the heap
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index f472a1943..faa390f76 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -313,19 +313,19 @@ static SafeLLamaContextHandle()
///
/// Defragment the KV cache. This will be applied:
/// - lazily on next llama_decode()
- /// - explicitly with llama_kv_cache_update()
+ /// - explicitly with llama_kv_self_update()
///
///
///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern void llama_kv_cache_defrag(SafeLLamaContextHandle ctx);
+ private static extern void llama_kv_self_defrag(SafeLLamaContextHandle ctx);
///
/// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
///
///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern void llama_kv_cache_update(SafeLLamaContextHandle ctx);
+ private static extern void llama_kv_self_update(SafeLLamaContextHandle ctx);
///
/// Check if the context supports KV cache shifting
@@ -333,7 +333,7 @@ static SafeLLamaContextHandle()
///
///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern bool llama_kv_cache_can_shift(SafeLLamaContextHandle ctx);
+ private static extern bool llama_kv_self_can_shift(SafeLLamaContextHandle ctx);
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern LLamaPerfContextTimings llama_perf_context(SafeLLamaContextHandle ctx);
@@ -386,6 +386,9 @@ static SafeLLamaContextHandle()
/// A pointer to the first float in an embedding, length = ctx.EmbeddingSize
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern unsafe float* llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i);
+
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ private static extern LLamaKvCacheNative llama_get_kv_self(SafeLLamaContextHandle ctx);
#endregion
#region LoRA
@@ -751,25 +754,25 @@ public void ResetTimings()
///
/// Check if the context supports KV cache shifting
///
- public bool KvCacheCanShift => llama_kv_cache_can_shift(this);
+ public bool KvCacheCanShift => llama_kv_self_can_shift(this);
///
/// Apply KV cache updates (such as K-shifts, defragmentation, etc.)
///
public void KvCacheUpdate()
{
- llama_kv_cache_update(this);
+ llama_kv_self_update(this);
}
///
/// Defragment the KV cache. This will be applied:
/// - lazily on next llama_decode()
- /// - explicitly with llama_kv_cache_update()
+ /// - explicitly with llama_kv_self_update()
///
///
public void KvCacheDefrag()
{
- llama_kv_cache_defrag(this);
+ llama_kv_self_defrag(this);
}
///
@@ -788,7 +791,7 @@ public LLamaKvCacheViewSafeHandle KvCacheGetDebugView(int maxSequences = 4)
///
public int KvCacheCountCells()
{
- return NativeApi.llama_get_kv_cache_used_cells(this);
+ return NativeApi.llama_kv_self_used_cells(this);
}
///
@@ -798,7 +801,7 @@ public int KvCacheCountCells()
///
public int KvCacheCountTokens()
{
- return NativeApi.llama_get_kv_cache_token_count(this);
+ return NativeApi.llama_kv_self_n_tokens(this);
}
///
@@ -806,7 +809,7 @@ public int KvCacheCountTokens()
///
public void KvCacheClear()
{
- NativeApi.llama_kv_cache_clear(this);
+ NativeApi.llama_kv_self_clear(this);
}
///
@@ -817,7 +820,7 @@ public void KvCacheClear()
///
public void KvCacheRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1)
{
- NativeApi.llama_kv_cache_seq_rm(this, seq, p0, p1);
+ NativeApi.llama_kv_self_seq_rm(this, seq, p0, p1);
}
///
@@ -831,7 +834,7 @@ public void KvCacheRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1)
///
public void KvCacheSequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1)
{
- NativeApi.llama_kv_cache_seq_cp(this, src, dest, p0, p1);
+ NativeApi.llama_kv_self_seq_cp(this, src, dest, p0, p1);
}
///
@@ -840,7 +843,7 @@ public void KvCacheSequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LL
///
public void KvCacheSequenceKeep(LLamaSeqId seq)
{
- NativeApi.llama_kv_cache_seq_keep(this, seq);
+ NativeApi.llama_kv_self_seq_keep(this, seq);
}
///
@@ -854,7 +857,10 @@ public void KvCacheSequenceKeep(LLamaSeqId seq)
///
public void KvCacheSequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta)
{
- NativeApi.llama_kv_cache_seq_add(this, seq, p0, p1, delta);
+ if (!KvCacheCanShift)
+ throw new InvalidOperationException("Cannot shift KV cache (KvCacheCanShift=False)");
+
+ NativeApi.llama_kv_self_seq_add(this, seq, p0, p1, delta);
}
///
@@ -869,7 +875,10 @@ public void KvCacheSequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int del
///
public void KvCacheSequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int divisor)
{
- NativeApi.llama_kv_cache_seq_div(this, seq, p0, p1, divisor);
+ if (!KvCacheCanShift)
+ throw new InvalidOperationException("Cannot shift KV cache (KvCacheCanShift=False)");
+
+ NativeApi.llama_kv_self_seq_div(this, seq, p0, p1, divisor);
}
///
@@ -879,7 +888,7 @@ public void KvCacheSequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int
///
public LLamaPos KvCacheMaxPosition(LLamaSeqId seq)
{
- return NativeApi.llama_kv_cache_seq_pos_max(this, seq);
+ return NativeApi.llama_kv_self_seq_pos_max(this, seq);
}
#endregion
}
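
Since `KvCacheSequenceAdd` and `KvCacheSequenceDivide` now throw when the cache cannot be shifted, callers are expected to guard these calls. A hedged sketch, with `keep`, `discard` and `past` standing in for whatever positions the caller tracks:

```csharp
// Sketch only: guard position-shifting calls with the KvCacheCanShift check added above.
if (context.NativeHandle.KvCacheCanShift)
{
    context.NativeHandle.KvCacheSequenceAdd(LLamaSeqId.Zero, keep + discard, past, -discard);
}
else
{
    // Fallback: clear the cache and re-evaluate the prompt instead of shifting positions.
    context.NativeHandle.KvCacheClear();
}
```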
diff --git a/LLama/Native/SafeLLamaSamplerHandle.cs b/LLama/Native/SafeLLamaSamplerHandle.cs
index 8f6c21ad1..8d6cd3015 100644
--- a/LLama/Native/SafeLLamaSamplerHandle.cs
+++ b/LLama/Native/SafeLLamaSamplerHandle.cs
@@ -279,6 +279,18 @@ public void AddTopK(int k)
static extern IntPtr llama_sampler_init_top_k(int k);
}
+ ///
+ /// Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
+ ///
+ ///
+ public void AddTopNSigma(float n)
+ {
+ llama_sampler_chain_add(this, llama_sampler_init_top_n_sigma(n));
+
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ static extern IntPtr llama_sampler_init_top_n_sigma(float n);
+ }
+
///
/// Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
///
@@ -421,53 +433,37 @@ public void AddGrammar(SafeLlamaModelHandle model, string grammar, string root)
///
/// Grammar in GBNF form
/// Root rule of the grammar
- /// A list of tokens that will trigger the grammar sampler.
- /// A list of words that will trigger the grammar sampler.
+ /// A list of patterns that will trigger the grammar sampler. Patterns are matched from the start of the generation output, and the grammar sampler is fed content starting from its first match group.
+ /// A list of tokens that will trigger the grammar sampler. The grammar sampler is fed content starting from (and including) the trigger token.
///
public void AddLazyGrammar(
SafeLlamaModelHandle model,
string grammar, string root,
- ReadOnlySpan<string> triggerWords,
+ ReadOnlySpan<string> patterns,
 ReadOnlySpan<LLamaToken> triggerTokens)
{
unsafe
{
- // Convert strings, fix memory in place, build array of pointers
- var handles = new List<MemoryHandle>();
- var triggerWordsPtrs = stackalloc byte*[triggerWords.Length];
- for (var i = 0; i < triggerWords.Length; i++)
- {
- var chars = Encoding.Default.GetBytes(triggerWords[i]);
- handles.Add(chars.AsMemory().Pin());
-
- triggerWordsPtrs[i] = (byte*)handles[i].Pointer;
- }
-
- fixed (LLamaToken* triggerTokensPtr = triggerTokens)
- {
- llama_sampler_chain_add(
- this,
- llama_sampler_init_grammar_lazy(
- model.Vocab.VocabNative,
- grammar, root,
- triggerWordsPtrs, (nuint)triggerWords.Length,
- triggerTokensPtr, (nuint)triggerTokens.Length
- )
- );
- }
-
- // Clear up all the handles fixing the memory in place
- for (var i = 0; i < handles.Count; i++)
- handles[i].Dispose();
+ llama_sampler_chain_add(
+ this,
+ llama_sampler_init_grammar_lazy_patterns(
+ model.Vocab.VocabNative,
+ grammar, root,
+ patterns.ToArray(), (nuint)patterns.Length,
+ triggerTokens.ToArray(), (nuint)triggerTokens.Length
+ )
+ );
}
// ReSharper disable InconsistentNaming
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- static extern unsafe IntPtr llama_sampler_init_grammar_lazy(
+ static extern unsafe IntPtr llama_sampler_init_grammar_lazy_patterns(
LLamaVocabNative* model,
string grammar_str, string grammar_root,
- byte** trigger_words, nuint num_trigger_words,
- LLamaToken* trigger_tokens, nuint num_trigger_tokens);
+ string[] triggerPatterns, nuint num_trigger_patterns,
+ LLamaToken[] triggerTokens,
+ nuint num_trigger_tokens
+ );
// ReSharper restore InconsistentNaming
}
@@ -590,9 +586,12 @@ static extern unsafe IntPtr llama_sampler_init_logit_bias(
#endregion
#region Native API
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ internal static extern unsafe LLamaSamplerNative* llama_sampler_init(LLamaSamplerINative* iface, IntPtr ctx);
+
// ReSharper disable InconsistentNaming
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern void llama_sampler_free(IntPtr model);
+ private static extern void llama_sampler_free(IntPtr /* llama_sampler* */ sampler);
// important: this takes ownership of the sampler object and will free it when llama_sampler_free is called on the chain
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
@@ -757,8 +756,6 @@ private CustomSamplerHandle(ICustomSampler sampler)
public static CustomSamplerHandle Create(ICustomSampler sampler)
{
- var nameArr = Encoding.UTF8.GetBytes(sampler.Name + '\0');
-
var handle = new CustomSamplerHandle(sampler);
handle._gcHandle = GCHandle.Alloc(handle);
@@ -773,12 +770,14 @@ public static CustomSamplerHandle Create(ICustomSampler sampler)
handle._samplerNativeInterfacePtr->Clone = (delegate*)Marshal.GetFunctionPointerForDelegate(Clone);
handle._samplerNativeInterfacePtr->Free = (delegate*)Marshal.GetFunctionPointerForDelegate(Free);
- // Allocate space for a `LLamaSamplerNative` struct. So we can pass pointers to it.
- handle._samplerNativePtr = (LLamaSamplerNative*)Marshal.AllocHGlobal(sizeof(LLamaSamplerNative));
- handle._samplerNativePtr->Context = (IntPtr)handle._gcHandle;
- handle._samplerNativePtr->Interface = handle._samplerNativeInterfacePtr;
+ // Allocate `LLamaSamplerNative` struct.
+ handle._samplerNativePtr = SafeLLamaSamplerChainHandle.llama_sampler_init(
+ handle._samplerNativeInterfacePtr,
+ (IntPtr)handle._gcHandle
+ );
// Allocate space for the name string
+ var nameArr = Encoding.UTF8.GetBytes(sampler.Name + '\0');
handle._samplerNamePtr = (byte*)Marshal.AllocHGlobal(nameArr.Length);
+ nameArr.AsSpan().CopyTo(new Span<byte>(handle._samplerNamePtr, nameArr.Length));
}
@@ -832,12 +831,6 @@ private static unsafe void Free(ref LLamaSamplerNative smpl)
{
var sampler = GetSampler(ref smpl);
- if (sampler._samplerNativePtr != null)
- {
- Marshal.FreeHGlobal((IntPtr)sampler._samplerNativePtr);
- sampler._samplerNativePtr = null;
- }
-
if (sampler._samplerNativeInterfacePtr != null)
{
Marshal.FreeHGlobal((IntPtr)sampler._samplerNativeInterfacePtr);
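
A short usage sketch (not part of the patch) for the new `AddTopNSigma` sampler; `chain` is assumed to be a `SafeLLamaSamplerChainHandle` created elsewhere:

```csharp
// Sketch only: top-n-sigma keeps only logits within n standard deviations of the maximum logit,
// applied here after a top-k cut.
chain.AddTopK(40);
chain.AddTopNSigma(1.0f);
```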
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index 0fd39176b..e0205da54 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -54,6 +54,11 @@ public sealed class SafeLlamaModelHandle
///
public int HeadCount => llama_model_n_head(this);
+ ///
+ /// Get the number of KV heads in this model
+ ///
+ public int KVHeadCount => llama_model_n_head_kv(this);
+
///
/// Returns true if the model contains an encoder that requires llama_encode() call
///
@@ -310,6 +315,14 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern int llama_model_n_head(SafeLlamaModelHandle model);
+ ///
+ /// Get the number of KV heads in this model
+ ///
+ ///
+ ///
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ private static extern int llama_model_n_head_kv(SafeLlamaModelHandle model);
+
///
/// Get a string describing the model type
///
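
With both head counts exposed, callers can derive the grouped-query-attention ratio of a loaded model. A sketch, assuming `weights` is a loaded `LLamaWeights`:

```csharp
// Sketch only: query heads per KV head (GQA group size), e.g. 32 / 8 = 4.
var model = weights.NativeHandle;
var gqaGroups = model.HeadCount / model.KVHeadCount;
```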
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
index debc99506..7c69534da 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
@@ -65,7 +65,31 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+