From 8f2642239f28569ef91a7525fca64f02925507a6 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 31 Jul 2025 12:44:31 +0200 Subject: [PATCH 1/4] fix: Add startupProbe to prevent Superset startup problems --- CHANGELOG.md | 5 +++ .../src/superset_controller.rs | 45 ++++++++++++------- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a1c6edb..6c892534 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ ## [Unreleased] +### Fixed + +- Fix container not starting because Superset was starting too slowly and was killed because of a failing liveness probe. +We now add a proper startup probe, which allows Superset to take longer to start up ([#XXX]). + ## [25.7.0] - 2025-07-23 ## [25.7.0-rc1] - 2025-07-18 diff --git a/rust/operator-binary/src/superset_controller.rs b/rust/operator-binary/src/superset_controller.rs index 8e58aa8a..0da28232 100644 --- a/rust/operator-binary/src/superset_controller.rs +++ b/rust/operator-binary/src/superset_controller.rs @@ -786,21 +786,7 @@ fn build_server_rolegroup_statefulset( create_vector_shutdown_file_command(STACKABLE_LOG_DIR), }]) .resources(merged_config.resources.clone().into()); - let probe = Probe { - http_get: Some(HTTPGetAction { - port: IntOrString::Int(APP_PORT.into()), - path: Some("/health".to_string()), - ..HTTPGetAction::default() - }), - initial_delay_seconds: Some(15), - period_seconds: Some(15), - timeout_seconds: Some(1), - failure_threshold: Some(3), - success_threshold: Some(1), - ..Probe::default() - }; - superset_cb.readiness_probe(probe.clone()); - superset_cb.liveness_probe(probe); + add_superset_container_probes(&mut superset_cb); // listener endpoints will use persistent volumes // so that load balancers can hard-code the target addresses and @@ -932,6 +918,35 @@ fn build_server_rolegroup_statefulset( }) } +fn add_superset_container_probes(superset_cb: &mut ContainerBuilder) { + let probe_action = HTTPGetAction { + port: 
IntOrString::Int(APP_PORT.into()), + path: Some("/health".to_string()), + ..HTTPGetAction::default() + }; + let common_probe = Probe { + http_get: Some(probe_action), + period_seconds: Some(5), + timeout_seconds: Some(5), + success_threshold: Some(1), + ..Probe::default() + }; + superset_cb.startup_probe(Probe { + failure_threshold: Some(10 /* minutes */ * 60 / 5), + ..common_probe.clone() + }); + // Remove it from the Service immediately + superset_cb.readiness_probe(Probe { + failure_threshold: Some(1), + ..common_probe.clone() + }); + // But only restart it after 3 failures + superset_cb.liveness_probe(Probe { + failure_threshold: Some(3), + ..common_probe + }); +} + fn add_authentication_volumes_and_volume_mounts( auth_config: &SupersetClientAuthenticationDetailsResolved, cb: &mut ContainerBuilder, From 91e4392f9e19c6c946210d45df1e7781a6352f42 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 31 Jul 2025 12:47:08 +0200 Subject: [PATCH 2/4] changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c892534..4dcded76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,9 @@ ### Fixed - Fix container not starting because Superset was starting too slowly and was killed because of a failing liveness probe. -We now add a proper startup probe, which allows Superset to take longer to start up ([#XXX]). +We now add a proper startup probe, which allows Superset to take longer to start up ([#654]). 
+ +[#654]: https://github.com/stackabletech/superset-operator/pull/654 ## [25.7.0] - 2025-07-23 From 8edc0a11e80ffd7b990a654e209d1dff7ce1cdad Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Mon, 4 Aug 2025 09:58:51 +0200 Subject: [PATCH 3/4] Update CHANGELOG.md Co-authored-by: Nick <10092581+NickLarsenNZ@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4dcded76..7bfb0aec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ ### Fixed - Fix container not starting because Superset was starting too slowly and was killed because of a failing liveness probe. -We now add a proper startup probe, which allows Superset to take longer to start up ([#654]). + We now add a proper startup probe, which allows Superset to take longer to start up ([#654]). [#654]: https://github.com/stackabletech/superset-operator/pull/654 From c5c5a7ebf03ac2b5298e3f4cb0a9661d28b86fe3 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 21 Aug 2025 15:02:03 +0200 Subject: [PATCH 4/4] Use new ProbeBuilder --- Cargo.lock | 16 +++--- Cargo.toml | 4 +- .../src/superset_controller.rs | 54 ++++++++++--------- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5d8d57b3..59433be5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1459,7 +1459,7 @@ dependencies = [ [[package]] name = "k8s-version" version = "0.1.3" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "darling 0.21.2", "regex", @@ -2623,8 +2623,8 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "stackable-operator" -version = "0.95.0" -source = 
"git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +version = "0.95.1" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "chrono", "clap", @@ -2662,7 +2662,7 @@ dependencies = [ [[package]] name = "stackable-operator-derive" version = "0.3.1" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "darling 0.21.2", "proc-macro2", @@ -2673,7 +2673,7 @@ dependencies = [ [[package]] name = "stackable-shared" version = "0.0.2" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "k8s-openapi", "kube", @@ -2712,7 +2712,7 @@ dependencies = [ [[package]] name = "stackable-telemetry" version = "0.6.1" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "axum", "clap", @@ -2736,7 +2736,7 @@ dependencies = [ [[package]] name = "stackable-versioned" version = "0.8.1" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "schemars", "serde", @@ -2749,7 +2749,7 
@@ dependencies = [ [[package]] name = "stackable-versioned-macros" version = "0.8.1" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "convert_case", "darling 0.21.2", diff --git a/Cargo.toml b/Cargo.toml index 76c40874..621b8063 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,6 @@ strum = { version = "0.27", features = ["derive"] } tokio = { version = "1.40", features = ["full"] } tracing = "0.1" -# [patch."https://github.com/stackabletech/operator-rs"] -# stackable-operator = { git = "https://github.com/stackabletech//operator-rs.git", branch = "main" } +[patch."https://github.com/stackabletech/operator-rs"] +stackable-operator = { git = "https://github.com/stackabletech//operator-rs.git", branch = "fix/probe-builder-clone" } # stackable-operator = { path = "../operator-rs/crates/stackable-operator" } diff --git a/rust/operator-binary/src/superset_controller.rs b/rust/operator-binary/src/superset_controller.rs index 5099afd2..6061593e 100644 --- a/rust/operator-binary/src/superset_controller.rs +++ b/rust/operator-binary/src/superset_controller.rs @@ -22,6 +22,7 @@ use stackable_operator::{ pod::{ PodBuilder, container::ContainerBuilder, + probe::ProbeBuilder, resources::ResourceRequirementsBuilder, security::PodSecurityContextBuilder, volume::{ @@ -40,9 +41,9 @@ use stackable_operator::{ DeepMerge, api::{ apps::v1::{StatefulSet, StatefulSetSpec}, - core::v1::{ConfigMap, EnvVar, HTTPGetAction, Probe}, + core::v1::{ConfigMap, EnvVar}, }, - apimachinery::pkg::{apis::meta::v1::LabelSelector, util::intstr::IntOrString}, + apimachinery::pkg::apis::meta::v1::LabelSelector, }, kube::{ Resource, ResourceExt, @@ -930,32 +931,33 @@ fn build_server_rolegroup_statefulset( } fn add_superset_container_probes(superset_cb: &mut 
ContainerBuilder) { - let probe_action = HTTPGetAction { - port: IntOrString::Int(APP_PORT.into()), - path: Some("/health".to_string()), - ..HTTPGetAction::default() - }; - let common_probe = Probe { - http_get: Some(probe_action), - period_seconds: Some(5), - timeout_seconds: Some(5), - success_threshold: Some(1), - ..Probe::default() - }; - superset_cb.startup_probe(Probe { - failure_threshold: Some(10 /* minutes */ * 60 / 5), - ..common_probe.clone() - }); + let common = + ProbeBuilder::http_get_port_scheme_path(APP_PORT, None, Some("/health".to_owned())) + .with_period(Duration::from_secs(5)); + + superset_cb.startup_probe( + common + .clone() + .with_failure_threshold_duration(Duration::from_minutes_unchecked(10)) + .expect("static period is always non-zero") + .build() + .expect("static durations are not too long"), + ); + // Remove it from the Service immediately - superset_cb.readiness_probe(Probe { - failure_threshold: Some(1), - ..common_probe.clone() - }); + superset_cb.readiness_probe( + common + .clone() + .build() + .expect("static durations are not too long"), + ); // But only restart it after 3 failures - superset_cb.readiness_probe(Probe { - failure_threshold: Some(3), - ..common_probe - }); + superset_cb.liveness_probe(Probe { + failure_threshold: Some(3), + ..common_probe + }); + superset_cb.liveness_probe( + common + .with_failure_threshold(3) + .build() + .expect("static durations are not too long"), + ); } fn add_authentication_volumes_and_volume_mounts(