diff --git a/Cargo.lock b/Cargo.lock
index 5d6c3bf8..d3101648 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -495,7 +495,7 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
 [[package]]
 name = "candle-core"
 version = "0.8.4"
-source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e"
+source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539"
 dependencies = [
  "accelerate-src",
  "byteorder",
@@ -534,7 +534,7 @@ dependencies = [
 [[package]]
 name = "candle-flash-attn"
 version = "0.8.4"
-source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e"
+source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539"
 dependencies = [
  "anyhow",
  "bindgen_cuda",
@@ -557,7 +557,7 @@ dependencies = [
 [[package]]
 name = "candle-kernels"
 version = "0.8.4"
-source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e"
+source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539"
 dependencies = [
  "bindgen_cuda",
 ]
@@ -576,7 +576,7 @@ dependencies = [
 [[package]]
 name = "candle-metal-kernels"
 version = "0.8.4"
-source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e"
+source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539"
 dependencies = [
  "metal 0.27.0",
  "once_cell",
@@ -587,7 +587,7 @@ dependencies = [
 [[package]]
 name = "candle-nn"
 version = "0.8.4"
-source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e"
+source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539"
 dependencies = [
  "accelerate-src",
  "candle-core",
@@ -615,7 +615,7 @@ dependencies = [
 [[package]]
 name = "candle-transformers"
 version = "0.8.4"
-source = "git+https://github.com/Narsil/candle?rev=2e273ddf31b1b796d3cfcd181ccb98deaa48466e#2e273ddf31b1b796d3cfcd181ccb98deaa48466e"
+source = "git+https://github.com/huggingface/candle?rev=ec6d7ca7738f4052b6613edc8f4d2bb6866a7539#ec6d7ca7738f4052b6613edc8f4d2bb6866a7539"
 dependencies = [
  "byteorder",
  "candle-core",
@@ -872,7 +872,7 @@ dependencies = [
 [[package]]
 name = "cudarc"
 version = "0.13.5"
-source = "git+https://github.com/Narsil/cudarc?rev=1956436aeddea1da04fc3226282bc07c07eeaa35#1956436aeddea1da04fc3226282bc07c07eeaa35"
+source = "git+https://github.com/Narsil/cudarc?rev=18ae111a4e8779c11377636b9cc3379f686e99c6#18ae111a4e8779c11377636b9cc3379f686e99c6"
 dependencies = [
  "half",
  "libloading",
@@ -4550,6 +4550,7 @@ dependencies = [
  "http 1.3.1",
  "init-tracing-opentelemetry",
  "insta",
+ "intel-mkl-src",
  "is_close",
  "libc",
  "metrics",
diff --git a/Cargo.toml b/Cargo.toml
index 8b07d18b..8001617c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -47,6 +47,7 @@ thiserror = "1.0"
 rand = "0.9"
 serial_test = "2.0.0"
 cudarc = { version = "0.13" , features =["cuda-12020"]}
+intel-mkl-src = { version = "0.8" }
 candle = { version = "0.8", package = "candle-core" }
 candle-nn = { version = "0.8", package = "candle-nn" }
 candle-transformers = { version = "0.8", package = "candle-transformers" }
@@ -54,11 +55,14 @@ candle-flash-attn = { version = "0.8", package = "candle-flash-attn" }
 half = { version = "2.3.1", features = ["num-traits"] }
 
 [patch.crates-io]
-cudarc = { git = "https://github.com/Narsil/cudarc" , rev = "1956436aeddea1da04fc3226282bc07c07eeaa35"}
-candle = { git = "https://github.com/Narsil/candle", rev = "2e273ddf31b1b796d3cfcd181ccb98deaa48466e", package = "candle-core" }
-candle-nn = { git = "https://github.com/Narsil/candle", rev = "2e273ddf31b1b796d3cfcd181ccb98deaa48466e", package = "candle-nn" }
-candle-transformers = { git = "https://github.com/Narsil/candle", rev = "2e273ddf31b1b796d3cfcd181ccb98deaa48466e", package = "candle-transformers" }
-candle-flash-attn = { git = "https://github.com/Narsil/candle", rev = "2e273ddf31b1b796d3cfcd181ccb98deaa48466e", package = "candle-flash-attn" }
+cudarc = { git = "https://github.com/Narsil/cudarc" , rev = "18ae111a4e8779c11377636b9cc3379f686e99c6"}
+candle = { git = "https://github.com/huggingface/candle", rev = "ec6d7ca7738f4052b6613edc8f4d2bb6866a7539", package = "candle-core" }
+candle-nn = { git = "https://github.com/huggingface/candle", rev = "ec6d7ca7738f4052b6613edc8f4d2bb6866a7539", package = "candle-nn" }
+candle-transformers = { git = "https://github.com/huggingface/candle", rev = "ec6d7ca7738f4052b6613edc8f4d2bb6866a7539", package = "candle-transformers" }
+candle-flash-attn = { git = "https://github.com/huggingface/candle", rev = "ec6d7ca7738f4052b6613edc8f4d2bb6866a7539", package = "candle-flash-attn" }
+# candle = { path = "../candle/candle-core", package = "candle-core" }
+# candle-nn = { path = "../candle/candle-nn" }
+# candle-flash-attn = { path = "../candle/candle-flash-attn" }
 
 [profile.release]
 debug = 0
diff --git a/Dockerfile b/Dockerfile
index 7023d409..978fcbc0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,7 +44,7 @@ COPY --from=planner /usr/src/recipe.json recipe.json
 
 RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-     cargo chef cook --release --features ort --features candle --features mkl-dynamic --no-default-features --recipe-path recipe.json && sccache -s
+     cargo chef cook --release --features ort,candle,mkl --no-default-features --recipe-path recipe.json && sccache -s
 
 COPY backends backends
 COPY core core
@@ -56,7 +56,7 @@ FROM builder AS http-builder
 
 RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    cargo build --release --bin text-embeddings-router -F ort -F candle -F mkl-dynamic -F http --no-default-features && sccache -s
+    cargo build --release --bin text-embeddings-router --features ort,candle,mkl,http --no-default-features && sccache -s
 
 FROM builder AS grpc-builder
 
@@ -70,7 +70,7 @@ COPY proto proto
 
 RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    cargo build --release --bin text-embeddings-router -F grpc -F ort -F candle -F mkl-dynamic --no-default-features && sccache -s
+    cargo build --release --bin text-embeddings-router --features ort,candle,mkl,grpc --no-default-features && sccache -s
 
 FROM debian:bookworm-slim AS base
 
diff --git a/backends/Cargo.toml b/backends/Cargo.toml
index 13b07fae..f9f020ff 100644
--- a/backends/Cargo.toml
+++ b/backends/Cargo.toml
@@ -25,7 +25,6 @@ candle = ["dep:text-embeddings-backend-candle"]
 cuda = ["text-embeddings-backend-candle?/cuda"]
 metal = ["text-embeddings-backend-candle?/metal"]
 mkl = ["text-embeddings-backend-candle?/mkl"]
-mkl-dynamic = ["text-embeddings-backend-candle?/mkl-dynamic"]
 accelerate = ["text-embeddings-backend-candle?/accelerate"]
 flash-attn = ["text-embeddings-backend-candle?/flash-attn"]
 flash-attn-v1 = ["text-embeddings-backend-candle?/flash-attn-v1"]
diff --git a/backends/candle/Cargo.toml b/backends/candle/Cargo.toml
index 68941547..88068d00 100644
--- a/backends/candle/Cargo.toml
+++ b/backends/candle/Cargo.toml
@@ -8,7 +8,7 @@ homepage.workspace = true
 [dependencies]
 anyhow = { workspace = true }
 accelerate-src = { version = "0.3.2", optional = true }
-intel-mkl-src = { version = "0.8.1", optional = true  }
+intel-mkl-src = { workspace = true, optional = true  }
 candle = { workspace = true }
 candle-nn = { workspace = true }
 candle-transformers = { workspace = true }
@@ -40,8 +40,7 @@ anyhow = { version = "1", features = ["backtrace"] }
 [features]
 accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate"]
 metal = ["candle/metal", "candle-nn/metal"]
-mkl = ["dep:intel-mkl-src", "intel-mkl-src/mkl-static-lp64-iomp", "candle/mkl"]
-mkl-dynamic = ["dep:intel-mkl-src", "intel-mkl-src/mkl-dynamic-lp64-iomp", "candle/mkl"]
-cuda = ["candle/cuda", "candle-nn/cuda", "dep:candle-cublaslt", "dep:candle-layer-norm", "dep:candle-rotary"]
+mkl = ["dep:intel-mkl-src", "candle/_mkl"]
+cuda = ["candle/_cuda", "candle-nn/_cuda", "dep:candle-cublaslt", "dep:candle-layer-norm", "dep:candle-rotary"]
 flash-attn-v1 = ["dep:candle-flash-attn-v1", "cuda"]
 flash-attn = ["dep:candle-flash-attn", "cuda"]
diff --git a/backends/candle/src/models/mod.rs b/backends/candle/src/models/mod.rs
index 9c67ae8b..f2593736 100644
--- a/backends/candle/src/models/mod.rs
+++ b/backends/candle/src/models/mod.rs
@@ -1,4 +1,4 @@
-#[cfg(any(feature = "mkl", feature = "mkl-dynamic"))]
+#[cfg(feature = "mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/backends/src/dtype.rs b/backends/src/dtype.rs
index 3b08e92a..80292be7 100644
--- a/backends/src/dtype.rs
+++ b/backends/src/dtype.rs
@@ -38,19 +38,13 @@ impl fmt::Display for DType {
 #[allow(clippy::derivable_impls)]
 impl Default for DType {
     fn default() -> Self {
-        #[cfg(any(
-            feature = "accelerate",
-            feature = "mkl",
-            feature = "mkl-dynamic",
-            feature = "ort"
-        ))]
+        #[cfg(any(feature = "accelerate", feature = "mkl", feature = "ort"))]
         {
             DType::Float32
         }
         #[cfg(not(any(
             feature = "accelerate",
             feature = "mkl",
-            feature = "mkl-dynamic",
             feature = "ort",
             feature = "python"
         )))]
diff --git a/candle-extensions/candle-cublaslt/Cargo.toml b/candle-extensions/candle-cublaslt/Cargo.toml
index 48c6b345..500eac77 100644
--- a/candle-extensions/candle-cublaslt/Cargo.toml
+++ b/candle-extensions/candle-cublaslt/Cargo.toml
@@ -6,6 +6,6 @@ edition = "2021"
 description = "CUBLASLt gemm for the candle ML framework."
 
 [dependencies]
-candle = { workspace=true, features = ["cuda"]}
+candle = { workspace=true, features = ["_cuda"]}
 cudarc = { workspace = true, features = [ "cublaslt", "f16" ]}
 half = { workspace = true}
diff --git a/candle-extensions/candle-flash-attn-v1/build.rs b/candle-extensions/candle-flash-attn-v1/build.rs
index 2722045a..b22fd74e 100644
--- a/candle-extensions/candle-flash-attn-v1/build.rs
+++ b/candle-extensions/candle-flash-attn-v1/build.rs
@@ -3,15 +3,39 @@
 // variable in order to cache the compiled artifacts and avoid recompiling too often.
 use anyhow::{Context, Result};
 use rayon::prelude::*;
+use std::fs;
 use std::path::PathBuf;
 use std::str::FromStr;
 
-const KERNEL_FILES: [&str; 4] = [
-    "flash_api.cu",
-    "fmha_fwd_hdim32.cu",
-    "fmha_fwd_hdim64.cu",
-    "fmha_fwd_hdim128.cu",
-];
+// const KERNEL_FILES: [&str; 4] = [
+//     "flash_api.cu",
+//     "fmha_fwd_hdim32.cu",
+//     "fmha_fwd_hdim64.cu",
+//     "fmha_fwd_hdim128.cu",
+// ];
+
+/// Walks `dir_path` depth-first, appending the path of every non-directory entry to `paths`; I/O errors are propagated.
+fn _read_dir_recursively(dir_path: &PathBuf, paths: &mut Vec<PathBuf>) -> std::io::Result<()> {
+    for entry in fs::read_dir(dir_path)? {
+        let entry = entry?;
+        let path = entry.path();
+
+        if path.is_dir() {
+            _read_dir_recursively(&path, paths)?;
+        } else {
+            paths.push(path);
+        }
+    }
+
+    Ok(())
+}
+
+/// Returns the paths of all non-directory entries under `dir_path`, including those in nested subdirectories.
+fn read_dir_recursively(dir_path: &PathBuf) -> std::io::Result<Vec<PathBuf>> {
+    let mut paths = Vec::new();
+    _read_dir_recursively(dir_path, &mut paths)?;
+    Ok(paths)
+}
 
 fn main() -> Result<()> {
     let num_cpus = std::env::var("RAYON_NUM_THREADS").map_or_else(
@@ -25,12 +49,11 @@ fn main() -> Result<()> {
         .unwrap();
 
     println!("cargo:rerun-if-changed=build.rs");
-    for kernel_file in KERNEL_FILES.iter() {
-        println!("cargo:rerun-if-changed=kernels/{kernel_file}");
+
+    let paths = read_dir_recursively(&PathBuf::from_str("kernels")?)?;
+    for file in paths.iter() {
+        println!("cargo:rerun-if-changed={}", file.display());
     }
-    println!("cargo:rerun-if-changed=kernels/**.h");
-    println!("cargo:rerun-if-changed=kernels/**.cuh");
-    println!("cargo:rerun-if-changed=kernels/fmha/**.h");
     let out_dir = PathBuf::from(std::env::var("OUT_DIR").context("OUT_DIR not set")?);
     let build_dir = match std::env::var("CANDLE_FLASH_ATTN_BUILD_DIR") {
         Err(_) =>
@@ -57,12 +80,17 @@ fn main() -> Result<()> {
     let out_file = build_dir.join("libflashattentionv1.a");
 
     let kernel_dir = PathBuf::from("kernels");
-    let cu_files: Vec<_> = KERNEL_FILES
+    let kernels: Vec<_> = paths
+        .iter()
+        .filter(|f| f.extension().map(|ext| ext == "cu").unwrap_or_default())
+        .collect();
+    let cu_files: Vec<_> = kernels
         .iter()
         .map(|f| {
             let mut obj_file = out_dir.join(f);
+            fs::create_dir_all(obj_file.parent().unwrap()).unwrap();
             obj_file.set_extension("o");
-            (kernel_dir.join(f), obj_file)
+            (f, obj_file)
         })
         .collect();
     let out_modified: Result<_, _> = out_file.metadata().and_then(|m| m.modified());
diff --git a/candle-extensions/candle-layer-norm/Cargo.toml b/candle-extensions/candle-layer-norm/Cargo.toml
index 8ab1b6d3..934fb264 100644
--- a/candle-extensions/candle-layer-norm/Cargo.toml
+++ b/candle-extensions/candle-layer-norm/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
 description = "Layer Norm layer for the candle ML framework."
 
 [dependencies]
-candle = { workspace = true, features = ["cuda"] }
+candle = { workspace = true, features = ["_cuda"] }
 half = { workspace = true }
 
 [build-dependencies]
diff --git a/candle-extensions/candle-layer-norm/build.rs b/candle-extensions/candle-layer-norm/build.rs
index d7a78038..9a37f339 100644
--- a/candle-extensions/candle-layer-norm/build.rs
+++ b/candle-extensions/candle-layer-norm/build.rs
@@ -23,8 +23,8 @@ fn main() -> Result<()> {
     for kernel_file in KERNEL_FILES.iter() {
         println!("cargo:rerun-if-changed=kernels/{kernel_file}");
     }
-    println!("cargo:rerun-if-changed=kernels/**.cu");
     println!("cargo:rerun-if-changed=kernels/ln_fwd_kernels.cuh");
+    println!("cargo:rerun-if-changed=kernels/ln.h");
     println!("cargo:rerun-if-changed=kernels/ln_kernel_traits.h");
     println!("cargo:rerun-if-changed=kernels/ln_utils.cuh");
     println!("cargo:rerun-if-changed=kernels/static_switch.h");
@@ -176,6 +176,8 @@ fn set_cuda_include_dir() -> Result<()> {
         .chain(roots)
         .find(|path| path.join("include").join("cuda.h").is_file())
         .context("cannot find include/cuda.h")?;
+    println!("cargo:rustc-link-search={}", root.join("lib").display());
+    println!("cargo:rustc-link-search={}", root.join("lib64").display());
     println!(
         "cargo:rustc-env=CUDA_INCLUDE_DIR={}",
         root.join("include").display()
diff --git a/candle-extensions/candle-rotary/Cargo.toml b/candle-extensions/candle-rotary/Cargo.toml
index 4713e438..f387e90f 100644
--- a/candle-extensions/candle-rotary/Cargo.toml
+++ b/candle-extensions/candle-rotary/Cargo.toml
@@ -10,7 +10,7 @@ license = "MIT OR Apache-2.0"
 readme = "README.md"
 
 [dependencies]
-candle = { workspace = true, features = ["cuda"]}
+candle = { workspace = true, features = ["_cuda"]}
 half = { workspace = true }
 
 [build-dependencies]
diff --git a/router/Cargo.toml b/router/Cargo.toml
index 9dfa9aa6..4da04e66 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -59,6 +59,7 @@ tokio-stream = { version = "0.1.14", optional = true }
 
 # Optional
 cudarc = { workspace = true, optional = true }
+intel-mkl-src = { workspace = true, optional = true }
 
 # Malloc trim hack for linux
 [target.'cfg(target_os = "linux")'.dependencies]
@@ -78,12 +79,11 @@ vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }
 tonic-build = { version = "0.11.0", optional = true }
 
 [features]
-default = ["candle", "http"]
+default = ["candle", "http", "dynamic-linking"]
 http = ["dep:axum", "dep:axum-tracing-opentelemetry", "dep:base64", "dep:tower-http", "dep:utoipa", "dep:utoipa-swagger-ui"]
 grpc = ["metrics-exporter-prometheus/http-listener", "dep:prost", "dep:tonic", "dep:tonic-health", "dep:tonic-reflection", "dep:tonic-build", "dep:async-stream", "dep:tokio-stream"]
 metal = ["text-embeddings-backend/metal"]
 mkl = ["text-embeddings-backend/mkl"]
-mkl-dynamic = ["text-embeddings-backend/mkl-dynamic"]
 accelerate = ["text-embeddings-backend/accelerate"]
 python = ["text-embeddings-backend/python"]
 ort = ["text-embeddings-backend/ort"]
@@ -91,5 +91,6 @@ candle = ["text-embeddings-backend/candle"]
 candle-cuda = ["candle", "text-embeddings-backend/flash-attn"]
 candle-cuda-turing = ["candle", "text-embeddings-backend/flash-attn-v1"]
 candle-cuda-volta = ["candle", "text-embeddings-backend/cuda"]
-static-linking = ["cudarc/static-linking"]
+static-linking = ["cudarc?/static-linking", "intel-mkl-src?/mkl-static-lp64-iomp"]
+dynamic-linking = ["cudarc?/dynamic-linking", "intel-mkl-src?/mkl-dynamic-lp64-iomp"]
 google = []