diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a1c6edb..7bfb0aec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ ## [Unreleased] +### Fixed + +- Fix container not starting because Superset was starting too slowly and was killed because of a failing liveness probe. + We now add a proper startup probe, which allows Superset to take longer to start up ([#654]). + +[#654]: https://github.com/stackabletech/superset-operator/pull/654 + ## [25.7.0] - 2025-07-23 ## [25.7.0-rc1] - 2025-07-18 diff --git a/Cargo.lock b/Cargo.lock index 5d8d57b3..59433be5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1459,7 +1459,7 @@ dependencies = [ [[package]] name = "k8s-version" version = "0.1.3" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "darling 0.21.2", "regex", @@ -2623,8 +2623,8 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "stackable-operator" -version = "0.95.0" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +version = "0.95.1" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "chrono", "clap", @@ -2662,7 +2662,7 @@ dependencies = [ [[package]] name = "stackable-operator-derive" version = "0.3.1" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "darling 0.21.2", "proc-macro2", @@ -2673,7 +2673,7 @@ dependencies = [ [[package]] name = 
"stackable-shared" version = "0.0.2" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "k8s-openapi", "kube", @@ -2712,7 +2712,7 @@ dependencies = [ [[package]] name = "stackable-telemetry" version = "0.6.1" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "axum", "clap", @@ -2736,7 +2736,7 @@ dependencies = [ [[package]] name = "stackable-versioned" version = "0.8.1" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "schemars", "serde", @@ -2749,7 +2749,7 @@ dependencies = [ [[package]] name = "stackable-versioned-macros" version = "0.8.1" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=stackable-operator-0.95.0#20659fe864c643fe48c7ff70ed417f0ed05ccf45" +source = "git+https://github.com/stackabletech//operator-rs.git?branch=fix%2Fprobe-builder-clone#426a44abdfb4baa5a860208e57969f45d8dd483f" dependencies = [ "convert_case", "darling 0.21.2", diff --git a/Cargo.toml b/Cargo.toml index 76c40874..621b8063 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,6 @@ strum = { version = "0.27", features = ["derive"] } tokio = { version = "1.40", features = ["full"] } tracing = "0.1" -# [patch."https://github.com/stackabletech/operator-rs"] -# stackable-operator = { git = "https://github.com/stackabletech//operator-rs.git", branch = 
"main" } +[patch."https://github.com/stackabletech/operator-rs"] +stackable-operator = { git = "https://github.com/stackabletech//operator-rs.git", branch = "fix/probe-builder-clone" } # stackable-operator = { path = "../operator-rs/crates/stackable-operator" } diff --git a/rust/operator-binary/src/superset_controller.rs b/rust/operator-binary/src/superset_controller.rs index 102ba19d..6061593e 100644 --- a/rust/operator-binary/src/superset_controller.rs +++ b/rust/operator-binary/src/superset_controller.rs @@ -22,6 +22,7 @@ use stackable_operator::{ pod::{ PodBuilder, container::ContainerBuilder, + probe::ProbeBuilder, resources::ResourceRequirementsBuilder, security::PodSecurityContextBuilder, volume::{ @@ -40,9 +41,9 @@ use stackable_operator::{ DeepMerge, api::{ apps::v1::{StatefulSet, StatefulSetSpec}, - core::v1::{ConfigMap, EnvVar, HTTPGetAction, Probe}, + core::v1::{ConfigMap, EnvVar}, }, - apimachinery::pkg::{apis::meta::v1::LabelSelector, util::intstr::IntOrString}, + apimachinery::pkg::apis::meta::v1::LabelSelector, }, kube::{ Resource, ResourceExt, @@ -797,21 +798,7 @@ fn build_server_rolegroup_statefulset( create_vector_shutdown_file_command(STACKABLE_LOG_DIR), }]) .resources(merged_config.resources.clone().into()); - let probe = Probe { - http_get: Some(HTTPGetAction { - port: IntOrString::Int(APP_PORT.into()), - path: Some("/health".to_string()), - ..HTTPGetAction::default() - }), - initial_delay_seconds: Some(15), - period_seconds: Some(15), - timeout_seconds: Some(1), - failure_threshold: Some(3), - success_threshold: Some(1), - ..Probe::default() - }; - superset_cb.readiness_probe(probe.clone()); - superset_cb.liveness_probe(probe); + add_superset_container_probes(&mut superset_cb); // listener endpoints will use persistent volumes // so that load balancers can hard-code the target addresses and @@ -943,6 +930,36 @@ fn build_server_rolegroup_statefulset( }) } +/// Adds the startup, readiness and liveness probes to the Superset container. +/// +/// All three probes share one HTTP GET check of `/health` on [`APP_PORT`] with a 5 second period. +/// The startup probe derives its failure threshold from a 10 minute duration, the readiness probe +/// sets no failure threshold of its own, and the liveness probe uses a failure threshold of 3. +/// +/// # Panics +/// +/// Panics if the [`ProbeBuilder`] rejects the hard-coded period or durations (see the `expect` messages). +fn add_superset_container_probes(superset_cb: &mut ContainerBuilder) { + let common = +
ProbeBuilder::http_get_port_scheme_path(APP_PORT, None, Some("/health".to_owned())) + .with_period(Duration::from_secs(5)); + + superset_cb.startup_probe( + common + .clone() + .with_failure_threshold_duration(Duration::from_minutes_unchecked(10)) + .expect("static period is always non-zero") + .build() + .expect("static durations are not too long"), + ); + + // Readiness: remove it from the Service immediately (no failure threshold is set here) + superset_cb.readiness_probe( + common + .clone() + .build() + .expect("static durations are not too long"), + ); + // Liveness: but only restart it after 3 failures + superset_cb.liveness_probe( + common + .with_failure_threshold(3) + .build() + .expect("static durations are not too long"), + ); +} + fn add_authentication_volumes_and_volume_mounts( auth_config: &SupersetClientAuthenticationDetailsResolved, cb: &mut ContainerBuilder,