Skip to content

Commit d277621

Browse files
authored
Add cloudwatch statsd daemonset (#419)
1 parent 474bfac commit d277621

File tree

14 files changed

+188
-63
lines changed

14 files changed

+188
-63
lines changed

Makefile

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -146,6 +146,7 @@ ci-build-images:
146146
@./build/build-image.sh images/onnx-serve-gpu onnx-serve-gpu
147147
@./build/build-image.sh images/operator operator
148148
@./build/build-image.sh images/fluentd fluentd
149+
@./build/build-image.sh images/statsd statsd
149150
@./build/build-image.sh images/cluster-autoscaler cluster-autoscaler
150151
@./build/build-image.sh images/nvidia nvidia
151152
@./build/build-image.sh images/metrics-server metrics-server
@@ -164,6 +165,7 @@ ci-push-images:
164165
@./build/push-image.sh onnx-serve-gpu
165166
@./build/push-image.sh operator
166167
@./build/push-image.sh fluentd
168+
@./build/push-image.sh statsd
167169
@./build/push-image.sh cluster-autoscaler
168170
@./build/push-image.sh nvidia
169171
@./build/push-image.sh metrics-server

cortex.sh

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -168,6 +168,7 @@ export CORTEX_NAMESPACE="${CORTEX_NAMESPACE:-cortex}"
168168

169169
export CORTEX_IMAGE_MANAGER="${CORTEX_IMAGE_MANAGER:-cortexlabs/manager:$CORTEX_VERSION_STABLE}"
170170
export CORTEX_IMAGE_FLUENTD="${CORTEX_IMAGE_FLUENTD:-cortexlabs/fluentd:$CORTEX_VERSION_STABLE}"
171+
export CORTEX_IMAGE_STATSD="${CORTEX_IMAGE_STATSD:-cortexlabs/statsd:$CORTEX_VERSION_STABLE}"
171172
export CORTEX_IMAGE_OPERATOR="${CORTEX_IMAGE_OPERATOR:-cortexlabs/operator:$CORTEX_VERSION_STABLE}"
172173
export CORTEX_IMAGE_TF_SERVE="${CORTEX_IMAGE_TF_SERVE:-cortexlabs/tf-serve:$CORTEX_VERSION_STABLE}"
173174
export CORTEX_IMAGE_TF_API="${CORTEX_IMAGE_TF_API:-cortexlabs/tf-api:$CORTEX_VERSION_STABLE}"
@@ -227,6 +228,7 @@ function install_cortex() {
227228
-e CORTEX_LOG_GROUP=$CORTEX_LOG_GROUP \
228229
-e CORTEX_BUCKET=$CORTEX_BUCKET \
229230
-e CORTEX_IMAGE_FLUENTD=$CORTEX_IMAGE_FLUENTD \
231+
-e CORTEX_IMAGE_STATSD=$CORTEX_IMAGE_STATSD \
230232
-e CORTEX_IMAGE_OPERATOR=$CORTEX_IMAGE_OPERATOR \
231233
-e CORTEX_IMAGE_TF_SERVE=$CORTEX_IMAGE_TF_SERVE \
232234
-e CORTEX_IMAGE_TF_API=$CORTEX_IMAGE_TF_API \

dev/registry.sh

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ function ecr_login() {
3737
function create_registry() {
3838
aws ecr create-repository --repository-name=cortexlabs/manager --region=$REGISTRY_REGION || true
3939
aws ecr create-repository --repository-name=cortexlabs/fluentd --region=$REGISTRY_REGION || true
40+
aws ecr create-repository --repository-name=cortexlabs/statsd --region=$REGISTRY_REGION || true
4041
aws ecr create-repository --repository-name=cortexlabs/istio-citadel --region=$REGISTRY_REGION || true
4142
aws ecr create-repository --repository-name=cortexlabs/istio-pilot --region=$REGISTRY_REGION || true
4243
aws ecr create-repository --repository-name=cortexlabs/istio-galley --region=$REGISTRY_REGION || true
@@ -128,6 +129,7 @@ elif [ "$cmd" = "update" ]; then
128129
build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
129130

130131
build_and_push $ROOT/images/fluentd fluentd latest
132+
build_and_push $ROOT/images/statsd statsd latest
131133
build_and_push $ROOT/images/cluster-autoscaler cluster-autoscaler latest
132134
build_and_push $ROOT/images/nvidia nvidia latest
133135
build_and_push $ROOT/images/metrics-server metrics-server latest

docs/cluster/config.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@ export CORTEX_NAMESPACE="cortex"
4545
# Image paths
4646
export CORTEX_IMAGE_MANAGER="cortexlabs/manager:master"
4747
export CORTEX_IMAGE_FLUENTD="cortexlabs/fluentd:master"
48+
export CORTEX_IMAGE_STATSD="cortexlabs/statsd:master"
4849
export CORTEX_IMAGE_OPERATOR="cortexlabs/operator:master"
4950
export CORTEX_IMAGE_TF_SERVE="cortexlabs/tf-serve:master"
5051
export CORTEX_IMAGE_TF_API="cortexlabs/tf-api:master"

docs/development.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@ export CORTEX_NAMESPACE="cortex"
6363

6464
export CORTEX_IMAGE_MANAGER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/manager:latest"
6565
export CORTEX_IMAGE_FLUENTD="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/fluentd:latest"
66+
export CORTEX_IMAGE_STATSD="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/statsd:latest"
6667
export CORTEX_IMAGE_ONNX_SERVE="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/onnx-serve:latest"
6768
export CORTEX_IMAGE_ONNX_SERVE_GPU="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/onnx-serve-gpu:latest"
6869
export CORTEX_IMAGE_OPERATOR="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/operator:latest"

images/statsd/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
FROM amazon/cloudwatch-agent:1.226589.0

manager/install_cortex.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -200,6 +200,7 @@ envsubst < manifests/fluentd.yaml | kubectl apply -f - >/dev/null
200200
echo "✓ Configured logging"
201201

202202
envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null
203+
envsubst < manifests/statsd.yaml | kubectl apply -f - >/dev/null
203204
echo "✓ Configured metrics"
204205

205206
if [[ "$CORTEX_NODE_TYPE" == p* ]] || [[ "$CORTEX_NODE_TYPE" == g* ]]; then

manager/manifests/statsd.yaml

Lines changed: 96 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,96 @@
1+
# Copyright 2019 Cortex Labs, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v1
16+
data:
17+
cwagentconfig.json: |
18+
{
19+
"agent": {
20+
"omit_hostname": true,
21+
"region": "$CORTEX_REGION"
22+
},
23+
"metrics": {
24+
"namespace": "$CORTEX_LOG_GROUP",
25+
"force_flush_interval": 1,
26+
"metrics_collected": {
27+
"statsd": {
28+
"service_address":":8125",
29+
"metrics_collection_interval": 1,
30+
"metrics_aggregation_interval": 1
31+
}
32+
}
33+
}
34+
}
35+
kind: ConfigMap
36+
metadata:
37+
name: cwagentstatsdconfig
38+
namespace: $CORTEX_NAMESPACE
39+
---
40+
apiVersion: apps/v1
41+
kind: DaemonSet
42+
metadata:
43+
name: cloudwatch-agent-statsd
44+
namespace: $CORTEX_NAMESPACE
45+
spec:
46+
selector:
47+
matchLabels:
48+
name: cloudwatch-agent-statsd
49+
template:
50+
metadata:
51+
labels:
52+
name: cloudwatch-agent-statsd
53+
spec:
54+
containers:
55+
- name: cloudwatch-agent
56+
image: $CORTEX_IMAGE_STATSD
57+
imagePullPolicy: Always
58+
ports:
59+
# containerPort should be consistent with the listen port defined in configmap
60+
- containerPort: 8125
61+
hostPort: 8125
62+
protocol: UDP
63+
resources:
64+
limits:
65+
cpu: 200m
66+
memory: 100Mi
67+
requests:
68+
cpu: 100m
69+
memory: 100Mi
70+
# Please don't change the env
71+
env:
72+
- name: HOST_NAME
73+
valueFrom:
74+
fieldRef:
75+
fieldPath: spec.nodeName
76+
- name: AWS_REGION
77+
value: $CORTEX_REGION
78+
- name: AWS_ACCESS_KEY_ID
79+
valueFrom:
80+
secretKeyRef:
81+
name: aws-credentials
82+
key: AWS_ACCESS_KEY_ID
83+
- name: AWS_SECRET_ACCESS_KEY
84+
valueFrom:
85+
secretKeyRef:
86+
name: aws-credentials
87+
key: AWS_SECRET_ACCESS_KEY
88+
# Please don't change the mountPath
89+
volumeMounts:
90+
- name: cwagentconfig
91+
mountPath: /etc/cwagentconfig
92+
volumes:
93+
- name: cwagentconfig
94+
configMap:
95+
name: cwagentstatsdconfig
96+
terminationGracePeriodSeconds: 60

manager/uninstall_operator.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ echo "Uninstalling the Cortex operator ..."
2525
kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true deployment operator >/dev/null 2>&1
2626

2727
# Pods in DaemonSets cannot be modified
28+
kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset cloudwatch-agent-statsd >/dev/null 2>&1
2829
kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd >/dev/null 2>&1
2930
kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset nvidia-device-plugin-daemonset >/dev/null 2>&1
3031

pkg/operator/workloads/api_workload.go

Lines changed: 22 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -330,7 +330,17 @@ func tfAPISpec(
330330
"--cache-dir=" + consts.ContextCacheDir,
331331
"--project-dir=" + path.Join(consts.EmptyDirMountPath, "project"),
332332
},
333-
Env: k8s.AWSCredentials(),
333+
Env: append(
334+
k8s.AWSCredentials(),
335+
kcore.EnvVar{
336+
Name: "HOST_IP",
337+
ValueFrom: &kcore.EnvVarSource{
338+
FieldRef: &kcore.ObjectFieldSelector{
339+
FieldPath: "status.hostIP",
340+
},
341+
},
342+
},
343+
),
334344
VolumeMounts: k8s.DefaultVolumeMounts(),
335345
ReadinessProbe: &kcore.Probe{
336346
InitialDelaySeconds: 5,
@@ -488,7 +498,17 @@ func onnxAPISpec(
488498
"--cache-dir=" + consts.ContextCacheDir,
489499
"--project-dir=" + path.Join(consts.EmptyDirMountPath, "project"),
490500
},
491-
Env: k8s.AWSCredentials(),
501+
Env: append(
502+
k8s.AWSCredentials(),
503+
kcore.EnvVar{
504+
Name: "HOST_IP",
505+
ValueFrom: &kcore.EnvVarSource{
506+
FieldRef: &kcore.ObjectFieldSelector{
507+
FieldPath: "status.hostIP",
508+
},
509+
},
510+
},
511+
),
492512
VolumeMounts: k8s.DefaultVolumeMounts(),
493513
ReadinessProbe: &kcore.Probe{
494514
InitialDelaySeconds: 5,

0 commit comments

Comments
 (0)