Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -351,3 +351,255 @@ tests:
- name: "example.com/quux"
request: "1"
expectedError: 'spec.metricsServerConfig.resources: Too many: 11: must have at most 10 items'
- name: Should be able to create a minimal PrometheusOperatorConfig
  # Round-trip: a single optional field (logLevel) is accepted and returned unchanged.
  initial: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        logLevel: "Info"
  expected: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        logLevel: "Info"
- name: Should accept PrometheusOperatorConfig with valid nodeSelector
  # Round-trip: a two-entry nodeSelector map is accepted and returned unchanged.
  initial: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        nodeSelector:
          kubernetes.io/os: linux
          node-role.kubernetes.io/worker: ""
  expected: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        nodeSelector:
          kubernetes.io/os: linux
          node-role.kubernetes.io/worker: ""
- name: Should accept PrometheusOperatorConfig with valid resources
  # Round-trip: cpu and memory entries with both request and limit are accepted unchanged.
  initial: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        resources:
          - name: "cpu"
            request: "100m"
            limit: "500m"
          - name: "memory"
            request: "128Mi"
            limit: "512Mi"
  expected: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        resources:
          - name: "cpu"
            request: "100m"
            limit: "500m"
          - name: "memory"
            request: "128Mi"
            limit: "512Mi"
- name: Should accept PrometheusOperatorConfig with valid tolerations
  # Round-trip: two tolerations are accepted and returned unchanged.
  initial: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        tolerations:
          - key: node-role.kubernetes.io/master
            operator: Exists
            effect: NoSchedule
          - key: node-role.kubernetes.io/control-plane
            operator: Exists
            effect: NoSchedule
  expected: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        tolerations:
          - key: node-role.kubernetes.io/master
            operator: Exists
            effect: NoSchedule
          - key: node-role.kubernetes.io/control-plane
            operator: Exists
            effect: NoSchedule
- name: Should accept PrometheusOperatorConfig with valid topologySpreadConstraints
  # Round-trip: two constraints with distinct topologyKey/whenUnsatisfiable pairs are accepted unchanged.
  initial: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        topologySpreadConstraints:
          - maxSkew: 1
            topologyKey: topology.kubernetes.io/zone
            whenUnsatisfiable: DoNotSchedule
            labelSelector:
              matchLabels:
                app: prometheus-operator
          - maxSkew: 2
            topologyKey: kubernetes.io/hostname
            whenUnsatisfiable: ScheduleAnyway
            labelSelector:
              matchLabels:
                app: prometheus-operator
  expected: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        topologySpreadConstraints:
          - maxSkew: 1
            topologyKey: topology.kubernetes.io/zone
            whenUnsatisfiable: DoNotSchedule
            labelSelector:
              matchLabels:
                app: prometheus-operator
          - maxSkew: 2
            topologyKey: kubernetes.io/hostname
            whenUnsatisfiable: ScheduleAnyway
            labelSelector:
              matchLabels:
                app: prometheus-operator
- name: Should reject PrometheusOperatorConfig with empty object
  # An explicitly empty object has zero properties, which the expected error reports as below the minimum of 1.
  initial: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig: {}
  expectedError: 'spec.prometheusOperatorConfig: Invalid value: 0: spec.prometheusOperatorConfig in body should have at least 1 properties'
- name: Should reject PrometheusOperatorConfig with too many tolerations
  # 11 tolerations: one more than the maximum of 10 items named in the expected error.
  initial: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        tolerations:
          - key: key1
            operator: Exists
            effect: NoSchedule
          - key: key2
            operator: Exists
            effect: NoSchedule
          - key: key3
            operator: Exists
            effect: NoSchedule
          - key: key4
            operator: Exists
            effect: NoSchedule
          - key: key5
            operator: Exists
            effect: NoSchedule
          - key: key6
            operator: Exists
            effect: NoSchedule
          - key: key7
            operator: Exists
            effect: NoSchedule
          - key: key8
            operator: Exists
            effect: NoSchedule
          - key: key9
            operator: Exists
            effect: NoSchedule
          - key: key10
            operator: Exists
            effect: NoSchedule
          - key: key11
            operator: Exists
            effect: NoSchedule
  expectedError: 'spec.prometheusOperatorConfig.tolerations: Too many: 11: must have at most 10 items'
- name: Should reject PrometheusOperatorConfig with empty tolerations array
  # An explicit empty list has 0 items, below the minimum of 1 named in the expected error.
  initial: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        tolerations: []
  expectedError: 'spec.prometheusOperatorConfig.tolerations: Invalid value: 0: spec.prometheusOperatorConfig.tolerations in body should have at least 1 items'
- name: Should reject PrometheusOperatorConfig with empty topologySpreadConstraints array
  # An explicit empty list has 0 items, below the minimum of 1 named in the expected error.
  initial: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        topologySpreadConstraints: []
  expectedError: 'spec.prometheusOperatorConfig.topologySpreadConstraints: Invalid value: 0: spec.prometheusOperatorConfig.topologySpreadConstraints in body should have at least 1 items'
- name: Should reject PrometheusOperatorConfig with too many topologySpreadConstraints
  # 11 constraints with distinct topologyKeys: one more than the maximum of 10 items named in the expected error.
  initial: |
    apiVersion: config.openshift.io/v1alpha1
    kind: ClusterMonitoring
    spec:
      userDefined:
        mode: "Disabled"
      prometheusOperatorConfig:
        topologySpreadConstraints:
          - maxSkew: 1
            topologyKey: "zone1"
            whenUnsatisfiable: DoNotSchedule
          - maxSkew: 1
            topologyKey: "zone2"
            whenUnsatisfiable: DoNotSchedule
          - maxSkew: 1
            topologyKey: "zone3"
            whenUnsatisfiable: DoNotSchedule
          - maxSkew: 1
            topologyKey: "zone4"
            whenUnsatisfiable: DoNotSchedule
          - maxSkew: 1
            topologyKey: "zone5"
            whenUnsatisfiable: DoNotSchedule
          - maxSkew: 1
            topologyKey: "zone6"
            whenUnsatisfiable: DoNotSchedule
          - maxSkew: 1
            topologyKey: "zone7"
            whenUnsatisfiable: DoNotSchedule
          - maxSkew: 1
            topologyKey: "zone8"
            whenUnsatisfiable: DoNotSchedule
          - maxSkew: 1
            topologyKey: "zone9"
            whenUnsatisfiable: DoNotSchedule
          - maxSkew: 1
            topologyKey: "zone10"
            whenUnsatisfiable: DoNotSchedule
          - maxSkew: 1
            topologyKey: "zone11"
            whenUnsatisfiable: DoNotSchedule
  expectedError: 'spec.prometheusOperatorConfig.topologySpreadConstraints: Too many: 11: must have at most 10 items'
87 changes: 87 additions & 0 deletions config/v1alpha1/types_cluster_monitoring.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,11 @@ type ClusterMonitoringSpec struct {
// When omitted, this means no opinion and the platform is left to choose a reasonable default, which is subject to change over time.
// +optional
MetricsServerConfig MetricsServerConfig `json:"metricsServerConfig,omitempty,omitzero"`
// prometheusOperatorConfig is an optional field that can be used to configure the Prometheus Operator component.
// Specifically, it can configure how the Prometheus Operator instance is deployed, pod scheduling, and resource allocation.
// When omitted, this means no opinion and the platform is left to choose a reasonable default, which is subject to change over time.
// +optional
PrometheusOperatorConfig PrometheusOperatorConfig `json:"prometheusOperatorConfig,omitempty,omitzero"`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this configuration relate to the configuration proposed in #2463?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is Prometheus Operator, the other one is Prometheus config. Of course they are related but they have different configs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the Prometheus config used by the PrometheusOperator?

Would it make sense to co-locate the configurations under a top-level prometheus field?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not directly. Prometheus Config is used by Prometheus. PrometheusOperator manages Prometheus instances,
Alertmanager, etc.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So what configures the Prometheus instances created by the Prometheus Operator to use the Prometheus Config?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • Who would be reading and acting on the PrometheusK8sConfig?
    It's CMO, not PrometheusOperator

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What will the CMO do with the PrometheusK8sConfig?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CMO takes the PrometheusK8sConfig configmap and creates a CR.
PrometheusOperator takes that CR and configures Prometheus.

I can understand your idea, but PrometheusOperator manages all these components and I think it's not a good idea to have all fields inside PrometheusOperator.

A core feature of the Prometheus Operator is to monitor the Kubernetes API server for changes to specific objects and ensure that the current Prometheus deployments match these objects. The Operator acts on the following [Custom Resource Definitions (CRDs)](https://kubernetes.io/docs/tasks/access-kubernetes-api/extend-api-custom-resource-definitions/):

    Prometheus, which defines a desired Prometheus deployment.

    PrometheusAgent, which defines a desired Prometheus deployment, but running in Agent mode.

    Alertmanager, which defines a desired Alertmanager deployment.

    ThanosRuler, which defines a desired Thanos Ruler deployment.

    ServiceMonitor, which declaratively specifies how groups of Kubernetes services should be monitored. The Operator automatically generates Prometheus scrape configuration based on the current state of the objects in the API server.

    PodMonitor, which declaratively specifies how groups of pods should be monitored. The Operator automatically generates Prometheus scrape configuration based on the current state of the objects in the API server.

    Probe, which declaratively specifies how groups of ingresses or static targets should be monitored. The Operator automatically generates Prometheus scrape configuration based on the definition.

    ScrapeConfig, which declaratively specifies scrape configurations to be added to Prometheus. This CustomResourceDefinition helps with scraping resources outside the Kubernetes cluster.

    PrometheusRule, which defines a desired set of Prometheus alerting and/or recording rules. The Operator generates a rule file, which can be used by Prometheus instances.

    AlertmanagerConfig, which declaratively specifies subsections of the Alertmanager configuration, allowing routing of alerts to custom receivers, and setting inhibit rules.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So to make sure I am following along, the CMO will:

  • Deploy the PrometheusOperator based on the PrometheusOperatorConfig
  • Create Prometheus CRs using the configurations provided in PrometheusK8sConfig. Does this apply to all Prometheus CRs?

While these are two distinct things, they are both inherently related to how the CMO handles prometheus configuration on the cluster.

I can understand your idea, but PrometheusOperator manages all these components and I think it's not a good idea to have all fields inside PrometheusOperator.

I'm not suggesting that we put all the fields under PrometheusOperatorConfig, I'm suggesting we use a shared parent field named prometheus that can have sibling fields for configuring the Prometheus Operator itself and, separately, configuring the individual Prometheus instance configurations. This way, if you want to add additional configuration options related to prometheus in the future, you don't have to add another Prometheus* field.

Copy link
Contributor Author

@marioferh marioferh Sep 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • Deploy the PrometheusOperator based on the PrometheusOperatorConfig
  • Create Prometheus CRs using the configurations provided in PrometheusK8sConfig. Does this apply to all Prometheus CRs?

Correct

I'm not suggesting that we put all the fields under PrometheusOperatorConfig, I'm suggesting we use a shared parent field named prometheus that can have sibling fields for configuring the Prometheus Operator itself and, separately, configuring the individual Prometheus instance configurations. This way, if you want to add additional configuration options related to prometheus in the future, you don't have to add another Prometheus* field.

But they are different things. They are related, but from my point of view, and given how CMO works, co-locating them makes no sense.
https://github.com/prometheus-operator/prometheus-operator
https://github.com/prometheus/prometheus

@danielmellado @simonpasquier any thoughts?

}

// UserDefinedMonitoring config for user-defined projects.
Expand Down Expand Up @@ -416,6 +421,88 @@ type MetricsServerConfig struct {
TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
}

// PrometheusOperatorConfig provides configuration options for the Prometheus Operator instance.
// Use this configuration to control how the Prometheus Operator instance is deployed, how it logs, and how its pods are scheduled.
// When specified, at least one field must be set.
// +kubebuilder:validation:MinProperties=1
type PrometheusOperatorConfig struct {
// logLevel defines the verbosity of logs emitted by the Prometheus Operator.
// This field allows users to control the amount and severity of logs generated, which can be useful
// for debugging issues or reducing noise in production environments.
// Allowed values are Error, Warn, Info, and Debug.
// When set to Error, only errors will be logged.
// When set to Warn, both warnings and errors will be logged.
// When set to Info, general information, warnings, and errors will all be logged.
// When set to Debug, detailed debugging information will be logged.
// When omitted, this means no opinion and the platform is left to choose a reasonable default, that is subject to change over time.
// The current default value is `Info`.
// +optional
LogLevel LogLevel `json:"logLevel,omitempty"`
// nodeSelector defines the nodes on which the Prometheus Operator Pods are scheduled.
// nodeSelector is optional.
//
// When omitted, this means the user has no opinion and the platform is left
// to choose reasonable defaults. These defaults are subject to change over time.
// The current default value is `kubernetes.io/os: linux`.
// When specified, nodeSelector must contain at least 1 entry and must not contain more than 10 entries.
// +optional
// +kubebuilder:validation:MinProperties=1
// +kubebuilder:validation:MaxProperties=10
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// resources defines the compute resource requests and limits for the Prometheus Operator container.
// This includes CPU, memory and HugePages constraints to help control scheduling and resource usage.
// When not specified, defaults are used by the platform. Requests cannot exceed limits.
// This field is optional.
// More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
// This is a simplified API that maps to Kubernetes ResourceRequirements.
// The current default values are:
//   resources:
//     - name: cpu
//       request: 4m
//       limit: null
//     - name: memory
//       request: 40Mi
//       limit: null
// When specified, resources must contain at least 1 entry and must not contain more than 10 entries.
// Entries are keyed by name, so each resource name may appear at most once.
// +optional
// +listType=map
// +listMapKey=name
// +kubebuilder:validation:MaxItems=10
// +kubebuilder:validation:MinItems=1
Resources []ContainerResource `json:"resources,omitempty"`
// tolerations defines tolerations for the Prometheus Operator pods.
// tolerations is optional.
//
// When omitted, this means the user has no opinion and the platform is left
// to choose reasonable defaults. These defaults are subject to change over time.
// Defaults are empty/unset.
// Maximum length for this list is 10.
// Minimum length for this list is 1.
// +kubebuilder:validation:MaxItems=10
// +kubebuilder:validation:MinItems=1
// +listType=atomic
// +optional
Tolerations []v1.Toleration `json:"tolerations,omitempty"`
// topologySpreadConstraints defines rules for how Prometheus Operator Pods should be distributed
// across topology domains such as zones, nodes, or other user-defined labels.
// topologySpreadConstraints is optional.
// This helps improve high availability and resource efficiency by avoiding placing
// too many replicas in the same failure domain.
//
// When omitted, this means no opinion and the platform is left to choose a default, which is subject to change over time.
// This field maps directly to the `topologySpreadConstraints` field in the Pod spec.
// Default is empty list.
// Maximum length for this list is 10.
// Minimum length for this list is 1.
// Entries must have unique topologyKey and whenUnsatisfiable pairs.
// +kubebuilder:validation:MaxItems=10
// +kubebuilder:validation:MinItems=1
// +listType=map
// +listMapKey=topologyKey
// +listMapKey=whenUnsatisfiable
// +optional
TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
}

// AuditProfile defines the audit log level for the Metrics Server.
// Allowed values are None, Metadata, Request, and RequestResponse.
// +kubebuilder:validation:Enum=None;Metadata;Request;RequestResponse
type AuditProfile string
Expand Down
Loading