diff --git a/Cargo.lock b/Cargo.lock index 5d6c3bf8..d3101648 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -495,7 +495,7 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" [[package]] name = "candle-core" version = "0.8.4" -source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e" +source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539" dependencies = [ "accelerate-src", "byteorder", @@ -534,7 +534,7 @@ dependencies = [ [[package]] name = "candle-flash-attn" version = "0.8.4" -source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e" +source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539" dependencies = [ "anyhow", "bindgen_cuda", @@ -557,7 +557,7 @@ dependencies = [ [[package]] name = "candle-kernels" version = "0.8.4" -source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e" +source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539" dependencies = [ "bindgen_cuda", ] @@ -576,7 +576,7 @@ dependencies = [ [[package]] name = "candle-metal-kernels" version = "0.8.4" -source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e" +source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539" dependencies = [ "metal 0.27.0", "once_cell", @@ -587,7 +587,7 @@ dependencies = [ [[package]] name = "candle-nn" version = "0.8.4" -source = 
"git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e" +source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539" dependencies = [ "accelerate-src", "candle-core", @@ -615,7 +615,7 @@ dependencies = [ [[package]] name = "candle-transformers" version = "0.8.4" -source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e" +source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539" dependencies = [ "byteorder", "candle-core", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "cudarc" version = "0.13.5" -source = "git+https://github.com/Narsil/cudarc?rev=1956436aeddea1da04fc3226282bc07c07eeaa35#1956436aeddea1da04fc3226282bc07c07eeaa35" +source = "git+https://github.com/Narsil/cudarc?rev=18ae111a4e8779c11377636b9cc3379f686e99c6#18ae111a4e8779c11377636b9cc3379f686e99c6" dependencies = [ "half", "libloading", @@ -4550,6 +4550,7 @@ dependencies = [ "http 1.3.1", "init-tracing-opentelemetry", "insta", + "intel-mkl-src", "is_close", "libc", "metrics", diff --git a/Cargo.toml b/Cargo.toml index 8b07d18b..8001617c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,7 @@ thiserror = "1.0" rand = "0.9" serial_test = "2.0.0" cudarc = { version = "0.13" , features =["cuda-12020"]} +intel-mkl-src = { version = "0.8" } candle = { version = "0.8", package = "candle-core" } candle-nn = { version = "0.8", package = "candle-nn" } candle-transformers = { version = "0.8", package = "candle-transformers" } @@ -54,11 +55,14 @@ candle-flash-attn = { version = "0.8", package = "candle-flash-attn" } half = { version = "2.3.1", features = ["num-traits"] } [patch.crates-io] -cudarc = { git = "https://github.com/Narsil/cudarc" , rev = "1956436aeddea1da04fc3226282bc07c07eeaa35"} -candle = { 
git = "https://github.com/Narsil/candle", rev = "2e273ddf31b1b796d3cfcd181ccb98deaa48466e", package = "candle-core" } -candle-nn = { git = "https://github.com/Narsil/candle", rev = "2e273ddf31b1b796d3cfcd181ccb98deaa48466e", package = "candle-nn" } -candle-transformers = { git = "https://github.com/Narsil/candle", rev = "2e273ddf31b1b796d3cfcd181ccb98deaa48466e", package = "candle-transformers" } -candle-flash-attn = { git = "https://github.com/Narsil/candle", rev = "2e273ddf31b1b796d3cfcd181ccb98deaa48466e", package = "candle-flash-attn" } +cudarc = { git = "https://github.com/Narsil/cudarc" , rev = "18ae111a4e8779c11377636b9cc3379f686e99c6"} +candle = { git = "https://github.com/huggingface/candle", rev = "ec6d7ca7738f4052b6613edc8f4d2bb6866a7539", package = "candle-core" } +candle-nn = { git = "https://github.com/huggingface/candle", rev = "ec6d7ca7738f4052b6613edc8f4d2bb6866a7539", package = "candle-nn" } +candle-transformers = { git = "https://github.com/huggingface/candle", rev = "ec6d7ca7738f4052b6613edc8f4d2bb6866a7539", package = "candle-transformers" } +candle-flash-attn = { git = "https://github.com/huggingface/candle", rev = "ec6d7ca7738f4052b6613edc8f4d2bb6866a7539", package = "candle-flash-attn" } +# candle = { path = "../candle/candle-core", package = "candle-core" } +# candle-nn = { path = "../candle/candle-nn" } +# candle-flash-attn = { path = "../candle/candle-flash-attn" } [profile.release] debug = 0 diff --git a/Dockerfile b/Dockerfile index 7023d409..978fcbc0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ COPY --from=planner /usr/src/recipe.json recipe.json RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - cargo chef cook --release --features ort --features candle --features mkl-dynamic --no-default-features --recipe-path recipe.json && sccache -s + cargo chef cook --release --features ort,candle,mkl --no-default-features --recipe-path 
recipe.json && sccache -s COPY backends backends COPY core core @@ -56,7 +56,7 @@ FROM builder AS http-builder RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - cargo build --release --bin text-embeddings-router -F ort -F candle -F mkl-dynamic -F http --no-default-features && sccache -s + cargo build --release --bin text-embeddings-router --features ort,candle,mkl,http --no-default-features && sccache -s FROM builder AS grpc-builder @@ -70,7 +70,7 @@ COPY proto proto RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - cargo build --release --bin text-embeddings-router -F grpc -F ort -F candle -F mkl-dynamic --no-default-features && sccache -s + cargo build --release --bin text-embeddings-router --features ort,candle,mkl,grpc --no-default-features && sccache -s FROM debian:bookworm-slim AS base diff --git a/backends/Cargo.toml b/backends/Cargo.toml index 13b07fae..f9f020ff 100644 --- a/backends/Cargo.toml +++ b/backends/Cargo.toml @@ -25,7 +25,6 @@ candle = ["dep:text-embeddings-backend-candle"] cuda = ["text-embeddings-backend-candle?/cuda"] metal = ["text-embeddings-backend-candle?/metal"] mkl = ["text-embeddings-backend-candle?/mkl"] -mkl-dynamic = ["text-embeddings-backend-candle?/mkl-dynamic"] accelerate = ["text-embeddings-backend-candle?/accelerate"] flash-attn = ["text-embeddings-backend-candle?/flash-attn"] flash-attn-v1 = ["text-embeddings-backend-candle?/flash-attn-v1"] diff --git a/backends/candle/Cargo.toml b/backends/candle/Cargo.toml index 68941547..88068d00 100644 --- a/backends/candle/Cargo.toml +++ b/backends/candle/Cargo.toml @@ -8,7 +8,7 @@ homepage.workspace = true [dependencies] anyhow = { workspace = true } accelerate-src = { version = "0.3.2", optional = true } -intel-mkl-src = { version = "0.8.1", optional = true } +intel-mkl-src = { workspace = true, 
optional = true } candle = { workspace = true } candle-nn = { workspace = true } candle-transformers = { workspace = true } @@ -40,8 +40,7 @@ anyhow = { version = "1", features = ["backtrace"] } [features] accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate"] metal = ["candle/metal", "candle-nn/metal"] -mkl = ["dep:intel-mkl-src", "intel-mkl-src/mkl-static-lp64-iomp", "candle/mkl"] -mkl-dynamic = ["dep:intel-mkl-src", "intel-mkl-src/mkl-dynamic-lp64-iomp", "candle/mkl"] -cuda = ["candle/cuda", "candle-nn/cuda", "dep:candle-cublaslt", "dep:candle-layer-norm", "dep:candle-rotary"] +mkl = ["dep:intel-mkl-src", "candle/_mkl"] +cuda = ["candle/_cuda", "candle-nn/_cuda", "dep:candle-cublaslt", "dep:candle-layer-norm", "dep:candle-rotary"] flash-attn-v1 = ["dep:candle-flash-attn-v1", "cuda"] flash-attn = ["dep:candle-flash-attn", "cuda"] diff --git a/backends/candle/src/models/mod.rs b/backends/candle/src/models/mod.rs index 9c67ae8b..f2593736 100644 --- a/backends/candle/src/models/mod.rs +++ b/backends/candle/src/models/mod.rs @@ -1,4 +1,4 @@ -#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))] +#[cfg(feature = "mkl")] extern crate intel_mkl_src; #[cfg(feature = "accelerate")] diff --git a/backends/src/dtype.rs b/backends/src/dtype.rs index 3b08e92a..80292be7 100644 --- a/backends/src/dtype.rs +++ b/backends/src/dtype.rs @@ -38,19 +38,13 @@ impl fmt::Display for DType { #[allow(clippy::derivable_impls)] impl Default for DType { fn default() -> Self { - #[cfg(any( - feature = "accelerate", - feature = "mkl", - feature = "mkl-dynamic", - feature = "ort" - ))] + #[cfg(any(feature = "accelerate", feature = "mkl", feature = "ort"))] { DType::Float32 } #[cfg(not(any( feature = "accelerate", feature = "mkl", - feature = "mkl-dynamic", feature = "ort", feature = "python" )))] diff --git a/candle-extensions/candle-cublaslt/Cargo.toml b/candle-extensions/candle-cublaslt/Cargo.toml index 48c6b345..500eac77 100644 --- 
a/candle-extensions/candle-cublaslt/Cargo.toml +++ b/candle-extensions/candle-cublaslt/Cargo.toml @@ -6,6 +6,6 @@ edition = "2021" description = "CUBLASLt gemm for the candle ML framework." [dependencies] -candle = { workspace=true, features = ["cuda"]} +candle = { workspace=true, features = ["_cuda"]} cudarc = { workspace = true, features = [ "cublaslt", "f16" ]} half = { workspace = true} diff --git a/candle-extensions/candle-flash-attn-v1/build.rs b/candle-extensions/candle-flash-attn-v1/build.rs index 2722045a..b22fd74e 100644 --- a/candle-extensions/candle-flash-attn-v1/build.rs +++ b/candle-extensions/candle-flash-attn-v1/build.rs @@ -3,15 +3,39 @@ // variable in order to cache the compiled artifacts and avoid recompiling too often. use anyhow::{Context, Result}; use rayon::prelude::*; +use std::fs; use std::path::PathBuf; use std::str::FromStr; -const KERNEL_FILES: [&str; 4] = [ - "flash_api.cu", - "fmha_fwd_hdim32.cu", - "fmha_fwd_hdim64.cu", - "fmha_fwd_hdim128.cu", -]; +// const KERNEL_FILES: [&str; 4] = [ +// "flash_api.cu", +// "fmha_fwd_hdim32.cu", +// "fmha_fwd_hdim64.cu", +// "fmha_fwd_hdim128.cu", +// ]; + +/// Recursively reads the filenames in a directory and stores them in a Vec. +fn _read_dir_recursively(dir_path: &PathBuf, paths: &mut Vec<PathBuf>) -> std::io::Result<()> { + for entry in fs::read_dir(dir_path)? { + let entry = entry?; + let path = entry.path(); + + if path.is_dir() { + _read_dir_recursively(&path, paths)?; + } else { + paths.push(path); + } + } + + Ok(()) +} + +/// Recursively reads the filenames in a directory and stores them in a Vec. 
+fn read_dir_recursively(dir_path: &PathBuf) -> std::io::Result<Vec<PathBuf>> { + let mut paths = Vec::new(); + _read_dir_recursively(dir_path, &mut paths)?; + Ok(paths) +} fn main() -> Result<()> { let num_cpus = std::env::var("RAYON_NUM_THREADS").map_or_else( @@ -25,12 +49,11 @@ fn main() -> Result<()> { .unwrap(); println!("cargo:rerun-if-changed=build.rs"); - for kernel_file in KERNEL_FILES.iter() { - println!("cargo:rerun-if-changed=kernels/{kernel_file}"); + + let paths = read_dir_recursively(&PathBuf::from_str("kernels")?)?; + for file in paths.iter() { + println!("cargo:rerun-if-changed={}", file.display()); } - println!("cargo:rerun-if-changed=kernels/**.h"); - println!("cargo:rerun-if-changed=kernels/**.cuh"); - println!("cargo:rerun-if-changed=kernels/fmha/**.h"); let out_dir = PathBuf::from(std::env::var("OUT_DIR").context("OUT_DIR not set")?); let build_dir = match std::env::var("CANDLE_FLASH_ATTN_BUILD_DIR") { Err(_) => @@ -57,12 +80,17 @@ fn main() -> Result<()> { let out_file = build_dir.join("libflashattentionv1.a"); let kernel_dir = PathBuf::from("kernels"); - let cu_files: Vec<_> = KERNEL_FILES + let kernels: Vec<_> = paths + .iter() + .filter(|f| f.extension().map(|ext| ext == "cu").unwrap_or_default()) + .collect(); + let cu_files: Vec<_> = kernels .iter() .map(|f| { let mut obj_file = out_dir.join(f); + fs::create_dir_all(obj_file.parent().unwrap()).unwrap(); obj_file.set_extension("o"); - (kernel_dir.join(f), obj_file) + (f, obj_file) }) .collect(); let out_modified: Result<_, _> = out_file.metadata().and_then(|m| m.modified()); diff --git a/candle-extensions/candle-layer-norm/Cargo.toml b/candle-extensions/candle-layer-norm/Cargo.toml index 8ab1b6d3..934fb264 100644 --- a/candle-extensions/candle-layer-norm/Cargo.toml +++ b/candle-extensions/candle-layer-norm/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" description = "Layer Norm layer for the candle ML framework." 
[dependencies] -candle = { workspace = true, features = ["cuda"] } +candle = { workspace = true, features = ["_cuda"] } half = { workspace = true } [build-dependencies] diff --git a/candle-extensions/candle-layer-norm/build.rs b/candle-extensions/candle-layer-norm/build.rs index d7a78038..9a37f339 100644 --- a/candle-extensions/candle-layer-norm/build.rs +++ b/candle-extensions/candle-layer-norm/build.rs @@ -23,8 +23,8 @@ fn main() -> Result<()> { for kernel_file in KERNEL_FILES.iter() { println!("cargo:rerun-if-changed=kernels/{kernel_file}"); } - println!("cargo:rerun-if-changed=kernels/**.cu"); println!("cargo:rerun-if-changed=kernels/ln_fwd_kernels.cuh"); + println!("cargo:rerun-if-changed=kernels/ln.h"); println!("cargo:rerun-if-changed=kernels/ln_kernel_traits.h"); println!("cargo:rerun-if-changed=kernels/ln_utils.cuh"); println!("cargo:rerun-if-changed=kernels/static_switch.h"); @@ -176,6 +176,8 @@ fn set_cuda_include_dir() -> Result<()> { .chain(roots) .find(|path| path.join("include").join("cuda.h").is_file()) .context("cannot find include/cuda.h")?; + println!("cargo:rustc-link-search={}", root.join("lib").display()); + println!("cargo:rustc-link-search={}", root.join("lib64").display()); println!( "cargo:rustc-env=CUDA_INCLUDE_DIR={}", root.join("include").display() diff --git a/candle-extensions/candle-rotary/Cargo.toml b/candle-extensions/candle-rotary/Cargo.toml index 4713e438..f387e90f 100644 --- a/candle-extensions/candle-rotary/Cargo.toml +++ b/candle-extensions/candle-rotary/Cargo.toml @@ -10,7 +10,7 @@ license = "MIT OR Apache-2.0" readme = "README.md" [dependencies] -candle = { workspace = true, features = ["cuda"]} +candle = { workspace = true, features = ["_cuda"]} half = { workspace = true } [build-dependencies] diff --git a/router/Cargo.toml b/router/Cargo.toml index 9dfa9aa6..4da04e66 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -59,6 +59,7 @@ tokio-stream = { version = "0.1.14", optional = true } # Optional cudarc = { 
workspace = true, optional = true } +intel-mkl-src = { workspace = true, optional = true } # Malloc trim hack for linux [target.'cfg(target_os = "linux")'.dependencies] @@ -78,12 +79,11 @@ vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] } tonic-build = { version = "0.11.0", optional = true } [features] -default = ["candle", "http"] +default = ["candle", "http", "dynamic-linking"] http = ["dep:axum", "dep:axum-tracing-opentelemetry", "dep:base64", "dep:tower-http", "dep:utoipa", "dep:utoipa-swagger-ui"] grpc = ["metrics-exporter-prometheus/http-listener", "dep:prost", "dep:tonic", "dep:tonic-health", "dep:tonic-reflection", "dep:tonic-build", "dep:async-stream", "dep:tokio-stream"] metal = ["text-embeddings-backend/metal"] mkl = ["text-embeddings-backend/mkl"] -mkl-dynamic = ["text-embeddings-backend/mkl-dynamic"] accelerate = ["text-embeddings-backend/accelerate"] python = ["text-embeddings-backend/python"] ort = ["text-embeddings-backend/ort"] @@ -91,5 +91,6 @@ candle = ["text-embeddings-backend/candle"] candle-cuda = ["candle", "text-embeddings-backend/flash-attn"] candle-cuda-turing = ["candle", "text-embeddings-backend/flash-attn-v1"] candle-cuda-volta = ["candle", "text-embeddings-backend/cuda"] -static-linking = ["cudarc/static-linking"] +static-linking = ["cudarc?/static-linking", "intel-mkl-src?/mkl-static-lp64-iomp"] +dynamic-linking = ["cudarc?/dynamic-linking", "intel-mkl-src?/mkl-dynamic-lp64-iomp"] google = []