Skip to content

Add metrics for managed resources count #4031

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .go-version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.22.8
1.22.11
3 changes: 2 additions & 1 deletion docs/install/iam_policy.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@
"elasticloadbalancing:DescribeTags",
"elasticloadbalancing:DescribeTrustStores",
"elasticloadbalancing:DescribeListenerAttributes",
"elasticloadbalancing:DescribeCapacityReservation"
"elasticloadbalancing:DescribeCapacityReservation",
"tag:GetResources"
],
"Resource": "*"
},
Expand Down
3 changes: 2 additions & 1 deletion docs/install/iam_policy_cn.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@
"elasticloadbalancing:DescribeTags",
"elasticloadbalancing:DescribeTrustStores",
"elasticloadbalancing:DescribeListenerAttributes",
"elasticloadbalancing:DescribeCapacityReservation"
"elasticloadbalancing:DescribeCapacityReservation",
"tag:GetResources"
],
"Resource": "*"
},
Expand Down
3 changes: 2 additions & 1 deletion docs/install/iam_policy_iso.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
"elasticloadbalancing:DescribeTargetGroups",
"elasticloadbalancing:DescribeTargetGroupAttributes",
"elasticloadbalancing:DescribeTargetHealth",
"elasticloadbalancing:DescribeTags"
"elasticloadbalancing:DescribeTags",
"tag:GetResources"
],
"Resource": "*"
},
Expand Down
3 changes: 2 additions & 1 deletion docs/install/iam_policy_isob.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
"elasticloadbalancing:DescribeTargetGroups",
"elasticloadbalancing:DescribeTargetGroupAttributes",
"elasticloadbalancing:DescribeTargetHealth",
"elasticloadbalancing:DescribeTags"
"elasticloadbalancing:DescribeTags",
"tag:GetResources"
],
"Resource": "*"
},
Expand Down
3 changes: 2 additions & 1 deletion docs/install/iam_policy_isoe.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
"elasticloadbalancing:DescribeTargetGroups",
"elasticloadbalancing:DescribeTargetGroupAttributes",
"elasticloadbalancing:DescribeTargetHealth",
"elasticloadbalancing:DescribeTags"
"elasticloadbalancing:DescribeTags",
"tag:GetResources"
],
"Resource": "*"
},
Expand Down
3 changes: 2 additions & 1 deletion docs/install/iam_policy_isof.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
"elasticloadbalancing:DescribeTargetGroups",
"elasticloadbalancing:DescribeTargetGroupAttributes",
"elasticloadbalancing:DescribeTargetHealth",
"elasticloadbalancing:DescribeTags"
"elasticloadbalancing:DescribeTags",
"tag:GetResources"
],
"Resource": "*"
},
Expand Down
3 changes: 2 additions & 1 deletion docs/install/iam_policy_us-gov.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@
"elasticloadbalancing:DescribeTags",
"elasticloadbalancing:DescribeTrustStores",
"elasticloadbalancing:DescribeListenerAttributes",
"elasticloadbalancing:DescribeCapacityReservation"
"elasticloadbalancing:DescribeCapacityReservation",
"tag:GetResources"
],
"Resource": "*"
},
Expand Down
44 changes: 37 additions & 7 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,24 @@ limitations under the License.
package main

import (
"k8s.io/client-go/util/workqueue"
"os"

elbv2deploy "sigs.k8s.io/aws-load-balancer-controller/pkg/deploy/elbv2"

"github.com/go-logr/logr"
"github.com/spf13/pflag"
zapraw "go.uber.org/zap"
k8sruntime "k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"
"os"
elbv2api "sigs.k8s.io/aws-load-balancer-controller/apis/elbv2/v1beta1"
elbv2controller "sigs.k8s.io/aws-load-balancer-controller/controllers/elbv2"
"sigs.k8s.io/aws-load-balancer-controller/controllers/ingress"
"sigs.k8s.io/aws-load-balancer-controller/controllers/service"
"sigs.k8s.io/aws-load-balancer-controller/pkg/aws"
"sigs.k8s.io/aws-load-balancer-controller/pkg/aws/throttle"
"sigs.k8s.io/aws-load-balancer-controller/pkg/config"
elbv2deploy "sigs.k8s.io/aws-load-balancer-controller/pkg/deploy/elbv2"
"sigs.k8s.io/aws-load-balancer-controller/pkg/inject"
"sigs.k8s.io/aws-load-balancer-controller/pkg/k8s"
awsmetrics "sigs.k8s.io/aws-load-balancer-controller/pkg/metrics/aws"
Expand All @@ -52,6 +50,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"time"
// +kubebuilder:scaffold:imports
)

Expand Down Expand Up @@ -84,8 +83,6 @@ func main() {
klog.SetLoggerWithOptions(appLogger, klog.ContextualLogger(true))

var awsMetricsCollector *awsmetrics.Collector
lbcMetricsCollector := lbcmetrics.NewCollector(metrics.Registry)

if metrics.Registry != nil {
awsMetricsCollector = awsmetrics.NewCollector(metrics.Registry)
}
Expand All @@ -107,6 +104,17 @@ func main() {
os.Exit(1)
}

// track the k8s resources with finalizers contains "k8s.aws"
// track the aws resources with cluster tag "elbv2.k8s.aws/cluster=$ClusterName"
lbcMetricsCollector := lbcmetrics.NewCollector(
metrics.Registry,
mgr.GetClient(),
cloud.RGT(),
"k8s.aws",
"elbv2.k8s.aws/cluster",
controllerCFG.ClusterName,
)

clientSet, err := kubernetes.NewForConfig(mgr.GetConfig())
if err != nil {
setupLog.Error(err, "unable to obtain clientSet")
Expand Down Expand Up @@ -202,6 +210,28 @@ func main() {
deferredTGBQueue.Run()
}()

// TODO: we can better improve this to update the metrics per reconcile
go func() {
ticker := time.NewTicker(2 * time.Minute)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is not flexible and we don't be able to get real-time metrics..
i think we should trigger this when there are service/ingress/ingressGroup events happens.
for example, trigger a function in metricsCollector from within ingressGroupController.

e.g. to track the number of ALBs:
(note: just a naive thought, there could be edge cases like manually removed finalizer that shall be handled properly)

inject a lbMetricsCollector within ingressGroupController, when

  • when it reconciles an ingressGroup, it calls lbMetricsCollector.trackIngressGroup(group's name, maybe some other params necessary like active members)
  • after it successfully deleted ingressGroup, it calls lbMetricsCollector.untrackIngressGroup(group's name)

Then lbMetricsCollector shall have a correct view of number of currently managed ingressGroups at realtime.

defer ticker.Stop()

for {
select {
case <-ticker.C:
setupLog.Info("updating managed resource metrics")
if err := lbcMetricsCollector.UpdateManagedK8sResourceMetrics(ctx); err != nil {
setupLog.Error(err, "failed to update managed Kubernetes resource metrics")
}
if err := lbcMetricsCollector.UpdateManagedALBMetrics(ctx); err != nil {
setupLog.Error(err, "failed to update managed ALB metrics")
}
if err := lbcMetricsCollector.UpdateManagedNLBMetrics(ctx); err != nil {
setupLog.Error(err, "failed to update managed NLB metrics")
}
}
}
}()

if err := podInfoRepo.WaitForCacheSync(ctx); err != nil {
setupLog.Error(err, "problem wait for podInfo repo sync")
os.Exit(1)
Expand Down
9 changes: 7 additions & 2 deletions pkg/metrics/aws/instruments.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,13 @@ func newInstruments(registerer prometheus.Registerer) *instruments {
Name: metricAPIRequestDurationSeconds,
Help: "Latency of an individual HTTP request to the service endpoint",
}, []string{labelService, labelOperation})

registerer.MustRegister(apiCallsTotal, apiCallDurationSeconds, apiCallRetries, apiRequestsTotal, apiRequestDurationSecond)
registerer.MustRegister(
apiCallsTotal,
apiCallDurationSeconds,
apiCallRetries,
apiRequestsTotal,
apiRequestDurationSecond,
)
return &instruments{
apiCallsTotal: apiCallsTotal,
apiCallDurationSeconds: apiCallDurationSeconds,
Expand Down
152 changes: 148 additions & 4 deletions pkg/metrics/lbc/collector.go
Original file line number Diff line number Diff line change
@@ -1,33 +1,81 @@
package lbc

import (
"context"
awssdk "github.com/aws/aws-sdk-go-v2/aws"
rgtsdk "github.com/aws/aws-sdk-go-v2/service/resourcegroupstaggingapi"
rgttypes "github.com/aws/aws-sdk-go-v2/service/resourcegroupstaggingapi/types"
"github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1"
elbv2api "sigs.k8s.io/aws-load-balancer-controller/apis/elbv2/v1beta1"
"sigs.k8s.io/aws-load-balancer-controller/pkg/aws/services"
"strings"

"sigs.k8s.io/controller-runtime/pkg/client"
"time"
)

const (
networkLoadBalancerStr = "nlb"
resourceTypeALB = "elasticloadbalancing:loadbalancer/app"
resourceTypeNLB = "elasticloadbalancing:loadbalancer/net"
)

type MetricCollector interface {
// ObservePodReadinessGateReady this metric is useful to determine how fast pods are becoming ready in the load balancer.
// Due to some architectural constraints, we can only emit this metric for pods that are using readiness gates.
ObservePodReadinessGateReady(namespace string, tgbName string, duration time.Duration)

// UpdateManagedK8sResourceMetrics fetches and updates managed k8s resources metrics.
UpdateManagedK8sResourceMetrics(ctx context.Context) error

// UpdateManagedALBMetrics updates managed ALB count metrics
UpdateManagedALBMetrics(ctx context.Context) error

//UpdateManagedNLBMetrics updates managed NLB count metrics
UpdateManagedNLBMetrics(ctx context.Context) error
}

type collector struct {
instruments *instruments
instruments *instruments
runtimeClient client.Client
rgt services.RGT
finalizerKeyWord string
clusterTagKey string
clusterTagVal string
}

type noOpCollector struct{}

func (n *noOpCollector) ObservePodReadinessGateReady(_ string, _ string, _ time.Duration) {
}

func NewCollector(registerer prometheus.Registerer) MetricCollector {
if registerer == nil {
func (n *noOpCollector) UpdateManagedK8sResourceMetrics(_ context.Context) error {
return nil
}

func (n *noOpCollector) UpdateManagedALBMetrics(_ context.Context) error {
return nil
}

func (n *noOpCollector) UpdateManagedNLBMetrics(_ context.Context) error {
return nil
}

func NewCollector(registerer prometheus.Registerer, runtimeClient client.Client, rgt services.RGT, finalizerKeyWord string, clusterTagKey string, clusterTagVal string) MetricCollector {
if registerer == nil || runtimeClient == nil {
return &noOpCollector{}
}

instruments := newInstruments(registerer)
return &collector{
instruments: instruments,
instruments: instruments,
runtimeClient: runtimeClient,
rgt: rgt,
finalizerKeyWord: finalizerKeyWord,
clusterTagKey: clusterTagKey,
clusterTagVal: clusterTagVal,
}
}

Expand All @@ -37,3 +85,99 @@ func (c *collector) ObservePodReadinessGateReady(namespace string, tgbName strin
labelName: tgbName,
}).Observe(duration.Seconds())
}

func (c *collector) UpdateManagedK8sResourceMetrics(ctx context.Context) error {
listOpts := &client.ListOptions{
Namespace: "",
}
ingressCount, serviceCount, tgbCount := 0, 0, 0
// Fetch ingress count
ingressList := &networkingv1.IngressList{}
err := c.runtimeClient.List(ctx, ingressList, listOpts)
if err != nil {
return err
}
for _, ingress := range ingressList.Items {
for _, finalizer := range ingress.Finalizers {
if strings.Contains(finalizer, c.finalizerKeyWord) {
ingressCount++
break
}
}
}
c.instruments.managedIngressCount.Set(float64(ingressCount))

// Fetch service count
serviceList := &corev1.ServiceList{}
err = c.runtimeClient.List(ctx, serviceList, listOpts)
if err != nil {
return err
}
for _, service := range serviceList.Items {
hasMatchingFinalizer := false
for _, finalizer := range service.Finalizers {
if strings.Contains(finalizer, c.finalizerKeyWord) {
hasMatchingFinalizer = true
break
}
}

if hasMatchingFinalizer && service.Spec.LoadBalancerClass != nil && strings.Contains(*service.Spec.LoadBalancerClass, networkLoadBalancerStr) {
serviceCount++
}
}
c.instruments.managedServiceCount.Set(float64(serviceCount))

// Fetch TargetGroupBinding count
tgbList := &elbv2api.TargetGroupBindingList{}
err = c.runtimeClient.List(ctx, tgbList, listOpts)
if err != nil {
return err
}
for _, tgb := range tgbList.Items {
for _, finalizer := range tgb.Finalizers {
if strings.Contains(finalizer, c.finalizerKeyWord) {
tgbCount++
break
}
}
}
c.instruments.managedTGBCount.Set(float64(tgbCount))

return nil
}

func (c *collector) UpdateManagedALBMetrics(ctx context.Context) error {
count, err := c.getManagedAWSResourceMetrics(ctx, resourceTypeALB)
if err != nil {
return err
}
c.instruments.managedALBCount.Set(float64(count))
return nil
}

func (c *collector) UpdateManagedNLBMetrics(ctx context.Context) error {
count, err := c.getManagedAWSResourceMetrics(ctx, resourceTypeNLB)
if err != nil {
return err
}
c.instruments.managedNLBCount.Set(float64(count))
return nil
}

func (c *collector) getManagedAWSResourceMetrics(ctx context.Context, resourceType string) (count int, err error) {
req := &rgtsdk.GetResourcesInput{
ResourceTypeFilters: []string{resourceType},
TagFilters: []rgttypes.TagFilter{
{
Key: awssdk.String(c.clusterTagKey),
Values: []string{c.clusterTagVal},
},
},
}
resources, err := c.rgt.GetResourcesAsList(ctx, req)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should not call AWS APIs to get those counter metrics. This can cause significant performance impact when there are large amount of LBs.
It shall be technical possible to get the number of LBs managed by the controller without using k8s apis.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! I used RGT API call, so it's just 1 api call every 2min. But I do agree it's better to update the metrics per CRUD event. let me double check and get back.

if err != nil {
return 0, err
}
return len(resources), nil
}
Loading
Loading