Skip to content

Commit c25eca9

Browse files
authored
fix: add annotation for tf workload, refine CRD schema, add gpu allocation metrics (#181)
* fix: add annotation for tf workload, add autoscaling comments, update data structure
* fix: add new annotations for workload, change auto scale definitions
* feat: add GPU allocation metrics and update todo items
* fix: use k8s node name for gpu allocation metrics
* fix: use `_` for `used-by` label in tf workload custom resource
* chore: lint issue
1 parent b003db7 commit c25eca9

18 files changed

+458
-785
lines changed

api/v1/gpupool_types.go

+3-6
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,6 @@ type GPUPoolSpec struct {
4242
// +optional
4343
ComponentConfig *ComponentConfig `json:"componentConfig,omitempty"`
4444

45-
// +optional
46-
SchedulingConfig *SchedulingConfigTemplateSpec `json:"schedulingConfig,omitempty"`
47-
4845
// +optional
4946
SchedulingConfigTemplate *string `json:"schedulingConfigTemplate,omitempty"`
5047
}
@@ -385,15 +382,15 @@ type GPUPoolStatus struct {
385382
// when the progress is 100, the component version or config is fully updated.
386383
ComponentStatus PoolComponentStatus `json:"componentStatus"`
387384

388-
// calculated every 5m average
385+
// TODO: calculated every 1h/1d/1w average
389386
UtilizedTFlopsPercent string `json:"utilizedTFlopsPercent,omitempty"`
390387
UtilizedVRAMPercent string `json:"utilizedVRAMPercent,omitempty"`
391388

392-
// updated with interval
389+
// TODO: updated with interval
393390
AllocatedTFlopsPercent string `json:"allocatedTFlopsPercent,omitempty"`
394391
AllocatedVRAMPercent string `json:"allocatedVRAMPercent,omitempty"`
395392

396-
// aggregated with interval
393+
// TODO: aggregated with interval
397394
SavedCostsPerMonth string `json:"savedCostsPerMonth,omitempty"`
398395
PotentialSavingsPerMonth string `json:"potentialSavingsPerMonth,omitempty"`
399396

api/v1/schedulingconfigtemplate_types.go

+15-1
Original file line numberDiff line numberDiff line change
@@ -87,15 +87,19 @@ type GPUFilter struct {
8787

8888
type AutoScalingConfig struct {
8989
// layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly
90+
// VPA-like, aggregate metrics data <1m
9091
AutoSetLimits AutoSetLimits `json:"autoSetLimits,omitempty"`
9192

9293
// layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit
94+
// HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works)
9395
AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"`
9496

9597
// layer 3 adjusting, to match the actual usage in the long run
98+
// Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks
9699
AutoSetRequests AutoSetRequests `json:"autoSetRequests,omitempty"`
97100

98101
// additional layer to save VRAM, auto-freeze memory and cool down to RAM and Disk
102+
// Hypervisor will monitor and trigger freeze of inactive workers, Operator should mark them as scaled-to-zero and release the GPU pool resources, don't scale down CPU client part, so that they can continue to serve the traffic or scale down by other auto-scaling solutions like KEDA/KNative
99103
ScaleToZero ScaleToZero `json:"scaleToZero,omitempty"`
100104
}
101105

@@ -110,6 +114,11 @@ type AutoScalingConfig struct {
110114
// if AI prediction enabled, it helps to detect history pattern, and set more reasonable, explainable limit value
111115
// the final set limits should be max(finalPreferredLimits, last(predict_value * (1 + extraTFlopsBufferRatio)))
112116
type AutoSetLimits struct {
117+
Enable bool `json:"enable,omitempty"`
118+
119+
// target resource to scale limits, such as "tflops", "vram", or "all" by default
120+
TargetResource string `json:"targetResource,omitempty"`
121+
113122
EvaluationPeriod string `json:"evaluationPeriod,omitempty"`
114123

115124
ExtraTFlopsBufferRatio string `json:"extraTFlopsBufferRatio,omitempty"`
@@ -126,7 +135,7 @@ type AutoSetLimits struct {
126135

127136
// To handle burst traffic, scale up in short time (this feature requires GPU context migration & replication, not available yet)
128137
type AutoSetReplicas struct {
129-
Enable *bool `json:"enable,omitempty"`
138+
Enable bool `json:"enable,omitempty"`
130139
TargetTFlopsOfLimits string `json:"targetTFlopsOfLimits,omitempty"`
131140
EvaluationPeriod string `json:"evaluationPeriod,omitempty"`
132141
ScaleUpStep string `json:"scaleUpStep,omitempty"`
@@ -136,6 +145,11 @@ type AutoSetReplicas struct {
136145
}
137146

138147
type AutoSetRequests struct {
148+
Enable bool `json:"enable,omitempty"`
149+
150+
// target resource to scale requests, such as "tflops", "vram", or "all" by default
151+
TargetResource string `json:"targetResource,omitempty"`
152+
139153
PercentileForAutoRequests string `json:"percentileForAutoRequests,omitempty"`
140154

141155
// the request buffer ratio, for example actual usage is 1.0, 10% buffer will be 1.1 as final preferred requests

api/v1/tensorfusioncluster_types.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,15 @@ type TensorFusionClusterStatus struct {
7373
//
7474
RetryCount int64 `json:"retryCount"`
7575

76-
// calculated every 5m average
76+
// TODO: calculated every 1h/1d/1w average
7777
UtilizedTFlopsPercent string `json:"utilizedTFlopsPercent,omitempty"`
7878
UtilizedVRAMPercent string `json:"utilizedVRAMPercent,omitempty"`
7979

80-
// updated with interval
80+
// TODO: updated with interval
8181
AllocatedTFlopsPercent string `json:"allocatedTFlopsPercent,omitempty"`
8282
AllocatedVRAMPercent string `json:"allocatedVRAMPercent,omitempty"`
8383

84-
// aggregated with interval
84+
// TODO: aggregated with interval
8585
SavedCostsPerMonth string `json:"savedCostsPerMonth,omitempty"`
8686
PotentialSavingsPerMonth string `json:"potentialSavingsPerMonth,omitempty"`
8787

api/v1/workloadprofile_types.go

+19-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,25 @@ type WorkloadProfileSpec struct {
4242
// Qos defines the quality of service level for the client.
4343
Qos QoSLevel `json:"qos,omitempty"`
4444

45-
IsLocalGPU bool `json:"isLocalGPU"`
45+
// +optional
46+
// Schedule the workload to the same GPU server that runs vGPU worker for best performance, default to false
47+
IsLocalGPU bool `json:"isLocalGPU,omitempty"`
48+
49+
// +optional
50+
// TODO, not implemented
51+
// The number of GPUs to be used by the workload, default to 1
52+
GPUCount int `json:"gpuCount,omitempty"`
53+
54+
// +optional
55+
// TODO, not implemented
56+
// This mode is only available when `is-local-gpu` set to true, in this mode, TensorFusion will also inject vGPU worker into init container, so that to achieve best performance, trade-off is user might by-pass the vGPU worker and using physical GPU directly
57+
NoStandaloneWorkerMode bool `json:"noStandaloneWorkerMode,omitempty"`
58+
59+
// +optional
60+
// AutoScalingConfig configured here will override Pool's schedulingConfig
61+
// This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation,
62+
// user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true'
63+
AutoScalingConfig AutoScalingConfig `json:"autoScalingConfig,omitempty"`
4664
}
4765

4866
// WorkloadProfileStatus defines the observed state of WorkloadProfile.

api/v1/zz_generated.deepcopy.go

+2-11
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

-181
Original file line numberDiff line numberDiff line change
@@ -570,187 +570,6 @@ spec:
570570
type: object
571571
type: array
572572
type: object
573-
schedulingConfig:
574-
description: Place the workload to right nodes and scale smart.
575-
properties:
576-
autoScaling:
577-
description: scale the workload based on the usage and traffic
578-
properties:
579-
autoSetLimits:
580-
description: layer 1 vertical auto-scaling, turbo burst to
581-
existing GPU cards quickly
582-
properties:
583-
evaluationPeriod:
584-
type: string
585-
extraTFlopsBufferRatio:
586-
type: string
587-
ignoredDeltaRange:
588-
type: string
589-
maxRatioToRequests:
590-
description: the multiplier of requests, to avoid limit
591-
set too high, like 5.0
592-
type: string
593-
prediction:
594-
properties:
595-
enable:
596-
type: boolean
597-
historyDataPeriod:
598-
type: string
599-
model:
600-
type: string
601-
predictionPeriod:
602-
type: string
603-
type: object
604-
scaleUpStep:
605-
type: string
606-
type: object
607-
autoSetReplicas:
608-
description: layer 2 horizontal auto-scaling, scale up to
609-
more GPU cards if max limits threshold hit
610-
properties:
611-
enable:
612-
type: boolean
613-
evaluationPeriod:
614-
type: string
615-
scaleDownCoolDownTime:
616-
type: string
617-
scaleDownStep:
618-
type: string
619-
scaleUpCoolDownTime:
620-
type: string
621-
scaleUpStep:
622-
type: string
623-
targetTFlopsOfLimits:
624-
type: string
625-
type: object
626-
autoSetRequests:
627-
description: layer 3 adjusting, to match the actual usage
628-
in the long run
629-
properties:
630-
aggregationPeriod:
631-
type: string
632-
evaluationPeriod:
633-
type: string
634-
extraBufferRatio:
635-
description: the request buffer ratio, for example actual
636-
usage is 1.0, 10% buffer will be 1.1 as final preferred
637-
requests
638-
type: string
639-
percentileForAutoRequests:
640-
type: string
641-
prediction:
642-
properties:
643-
enable:
644-
type: boolean
645-
historyDataPeriod:
646-
type: string
647-
model:
648-
type: string
649-
predictionPeriod:
650-
type: string
651-
type: object
652-
type: object
653-
scaleToZero:
654-
description: additional layer to save VRAM, auto-freeze memory
655-
and cool down to RAM and Disk
656-
properties:
657-
autoFreeze:
658-
items:
659-
properties:
660-
enable:
661-
type: boolean
662-
freezeToDiskTTL:
663-
type: string
664-
freezeToMemTTL:
665-
type: string
666-
qos:
667-
enum:
668-
- low
669-
- medium
670-
- high
671-
- critical
672-
type: string
673-
type: object
674-
type: array
675-
intelligenceWarmup:
676-
properties:
677-
enable:
678-
type: boolean
679-
historyDataPeriod:
680-
type: string
681-
model:
682-
type: string
683-
predictionPeriod:
684-
type: string
685-
type: object
686-
type: object
687-
type: object
688-
hypervisor:
689-
description: single GPU device multi-process queuing and fair
690-
scheduling with QoS constraint
691-
properties:
692-
multiProcessQueuing:
693-
properties:
694-
enable:
695-
type: boolean
696-
interval:
697-
type: string
698-
queueLevelTimeSlices:
699-
items:
700-
type: string
701-
type: array
702-
type: object
703-
type: object
704-
placement:
705-
description: place the client or worker to best matched nodes
706-
properties:
707-
allowUsingLocalGPU:
708-
default: true
709-
type: boolean
710-
gpuFilters:
711-
items:
712-
description: "GPUFilter is to select eligible GPUs for scheduling.\n\nexample:\n```yaml\n-
713-
type: avoidTooMuchConnectionsOnSameGPU\nparams:\n\n\tconnectionNum:
714-
150\n\n- type: avoidDifferentZone\nparams:\n\n\t# by default,
715-
GPU worker will be scheduled into the same zone as CPU
716-
Client Pod to align AZ and improve performance\n\ttopologyKey:
717-
topology.kubernetes.io/zone\n\n```"
718-
properties:
719-
params:
720-
type: object
721-
x-kubernetes-preserve-unknown-fields: true
722-
type:
723-
type: string
724-
type: object
725-
type: array
726-
mode:
727-
default: CompactFirst
728-
enum:
729-
- CompactFirst
730-
- LowLoadFirst
731-
type: string
732-
required:
733-
- mode
734-
type: object
735-
reBalancer:
736-
description: |-
737-
avoid hot GPU devices and continuously balance the workload
738-
implemented by trigger a simulation scheduling and advise better GPU nodes for scheduler
739-
properties:
740-
internal:
741-
type: string
742-
reBalanceCoolDownTime:
743-
type: string
744-
threshold:
745-
properties:
746-
matchAny:
747-
type: object
748-
x-kubernetes-preserve-unknown-fields: true
749-
type: object
750-
type: object
751-
required:
752-
- placement
753-
type: object
754573
schedulingConfigTemplate:
755574
type: string
756575
type: object

0 commit comments

Comments (0)