Skip to content

Commit c25eca9

Browse files
authored
fix: add annotation for tf workload, refine CRD schema, add gpu allocation metrics (#181)
* fix: add annotation for tf workload, add autoscaling comments, update data structure
* fix: add new annotations for workload, change auto scale definitions
* feat: add GPU allocation metrics and update todo items
* fix: use k8s node name for gpu allocation metrics
* fix: use `_` for `used-by` label in tf workload custom resource
* chore: lint issue
1 parent b003db7 commit c25eca9

18 files changed

+458
-785
lines changed

api/v1/gpupool_types.go

+3-6
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,6 @@ type GPUPoolSpec struct {
4242
// +optional
4343
ComponentConfig *ComponentConfig `json:"componentConfig,omitempty"`
4444

45-
// +optional
46-
SchedulingConfig *SchedulingConfigTemplateSpec `json:"schedulingConfig,omitempty"`
47-
4845
// +optional
4946
SchedulingConfigTemplate *string `json:"schedulingConfigTemplate,omitempty"`
5047
}
@@ -385,15 +382,15 @@ type GPUPoolStatus struct {
385382
// when the progress is 100, the component version or config is fully updated.
386383
ComponentStatus PoolComponentStatus `json:"componentStatus"`
387384

388-
// calculated every 5m average
385+
// TODO: calculated every 1h/1d/1w average
389386
UtilizedTFlopsPercent string `json:"utilizedTFlopsPercent,omitempty"`
390387
UtilizedVRAMPercent string `json:"utilizedVRAMPercent,omitempty"`
391388

392-
// updated with interval
389+
// TODO: updated with interval
393390
AllocatedTFlopsPercent string `json:"allocatedTFlopsPercent,omitempty"`
394391
AllocatedVRAMPercent string `json:"allocatedVRAMPercent,omitempty"`
395392

396-
// aggregated with interval
393+
// TODO: aggregated with interval
397394
SavedCostsPerMonth string `json:"savedCostsPerMonth,omitempty"`
398395
PotentialSavingsPerMonth string `json:"potentialSavingsPerMonth,omitempty"`
399396

api/v1/schedulingconfigtemplate_types.go

+15-1
Original file line numberDiff line numberDiff line change
@@ -87,15 +87,19 @@ type GPUFilter struct {
8787

8888
type AutoScalingConfig struct {
8989
// layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly
90+
// VPA-like, aggregate metrics data <1m
9091
AutoSetLimits AutoSetLimits `json:"autoSetLimits,omitempty"`
9192

9293
// layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit
94+
// HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works)
9395
AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"`
9496

9597
// layer 3 adjusting, to match the actual usage in the long run
98+
// Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks
9699
AutoSetRequests AutoSetRequests `json:"autoSetRequests,omitempty"`
97100

98101
// additional layer to save VRAM, auto-freeze memory and cool down to RAM and Disk
102+
// Hypervisor will monitor and trigger freeze of inactive workers, Operator should mark them as scaled-to-zero and release the GPU pool resources, don't scale down CPU client part, so that they can continue to serve the traffic or scale down by other auto-scaling solutions like KEDA/KNative
99103
ScaleToZero ScaleToZero `json:"scaleToZero,omitempty"`
100104
}
101105

@@ -110,6 +114,11 @@ type AutoScalingConfig struct {
110114
// if AI prediction enabled, it helps to detect history pattern, and set more reasonable, explainable limit value
111115
// the final set limits should be max(finalPreferredLimits, last(predict_value * (1 + extraTFlopsBufferRatio)))
112116
type AutoSetLimits struct {
117+
Enable bool `json:"enable,omitempty"`
118+
119+
// target resource to scale limits, such as "tflops", "vram", or "all" by default
120+
TargetResource string `json:"targetResource,omitempty"`
121+
113122
EvaluationPeriod string `json:"evaluationPeriod,omitempty"`
114123

115124
ExtraTFlopsBufferRatio string `json:"extraTFlopsBufferRatio,omitempty"`
@@ -126,7 +135,7 @@ type AutoSetLimits struct {
126135

127136
// To handle burst traffic, scale up in short time (this feature requires GPU context migration & replication, not available yet)
128137
type AutoSetReplicas struct {
129-
Enable *bool `json:"enable,omitempty"`
138+
Enable bool `json:"enable,omitempty"`
130139
TargetTFlopsOfLimits string `json:"targetTFlopsOfLimits,omitempty"`
131140
EvaluationPeriod string `json:"evaluationPeriod,omitempty"`
132141
ScaleUpStep string `json:"scaleUpStep,omitempty"`
@@ -136,6 +145,11 @@ type AutoSetReplicas struct {
136145
}
137146

138147
type AutoSetRequests struct {
148+
Enable bool `json:"enable,omitempty"`
149+
150+
// target resource to scale requests, such as "tflops", "vram", or "all" by default
151+
TargetResource string `json:"targetResource,omitempty"`
152+
139153
PercentileForAutoRequests string `json:"percentileForAutoRequests,omitempty"`
140154

141155
// the request buffer ratio, for example actual usage is 1.0, 10% buffer will be 1.1 as final preferred requests

api/v1/tensorfusioncluster_types.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,15 @@ type TensorFusionClusterStatus struct {
7373
//
7474
RetryCount int64 `json:"retryCount"`
7575

76-
// calculated every 5m average
76+
// TODO: calculated every 1h/1d/1w average
7777
UtilizedTFlopsPercent string `json:"utilizedTFlopsPercent,omitempty"`
7878
UtilizedVRAMPercent string `json:"utilizedVRAMPercent,omitempty"`
7979

80-
// updated with interval
80+
// TODO: updated with interval
8181
AllocatedTFlopsPercent string `json:"allocatedTFlopsPercent,omitempty"`
8282
AllocatedVRAMPercent string `json:"allocatedVRAMPercent,omitempty"`
8383

84-
// aggregated with interval
84+
// TODO: aggregated with interval
8585
SavedCostsPerMonth string `json:"savedCostsPerMonth,omitempty"`
8686
PotentialSavingsPerMonth string `json:"potentialSavingsPerMonth,omitempty"`
8787

api/v1/workloadprofile_types.go

+19-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,25 @@ type WorkloadProfileSpec struct {
4242
// Qos defines the quality of service level for the client.
4343
Qos QoSLevel `json:"qos,omitempty"`
4444

45-
IsLocalGPU bool `json:"isLocalGPU"`
45+
// +optional
46+
// Schedule the workload to the same GPU server that runs vGPU worker for best performance, default to false
47+
IsLocalGPU bool `json:"isLocalGPU,omitempty"`
48+
49+
// +optional
50+
// TODO, not implemented
51+
// The number of GPUs to be used by the workload, default to 1
52+
GPUCount int `json:"gpuCount,omitempty"`
53+
54+
// +optional
55+
// TODO, not implemented
56+
// This mode is only available when `is-local-gpu` set to true, in this mode, TensorFusion will also inject vGPU worker into init container, so that to achieve best performance, trade-off is user might by-pass the vGPU worker and using physical GPU directly
57+
NoStandaloneWorkerMode bool `json:"noStandaloneWorkerMode,omitempty"`
58+
59+
// +optional
60+
// AutoScalingConfig configured here will override Pool's schedulingConfig
61+
// This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation,
62+
// user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true'
63+
AutoScalingConfig AutoScalingConfig `json:"autoScalingConfig,omitempty"`
4664
}
4765

4866
// WorkloadProfileStatus defines the observed state of WorkloadProfile.

api/v1/zz_generated.deepcopy.go

+2-11
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

-181
Original file line numberDiff line numberDiff line change
@@ -570,187 +570,6 @@ spec:
570570
type: object
571571
type: array
572572
type: object
573-
schedulingConfig:
574-
description: Place the workload to right nodes and scale smart.
575-
properties:
576-
autoScaling:
577-
description: scale the workload based on the usage and traffic
578-
properties:
579-
autoSetLimits:
580-
description: layer 1 vertical auto-scaling, turbo burst to
581-
existing GPU cards quickly
582-
properties:
583-
evaluationPeriod:
584-
type: string
585-
extraTFlopsBufferRatio:
586-
type: string
587-
ignoredDeltaRange:
588-
type: string
589-
maxRatioToRequests:
590-
description: the multiplier of requests, to avoid limit
591-
set too high, like 5.0
592-
type: string
593-
prediction:
594-
properties:
595-
enable:
596-
type: boolean
597-
historyDataPeriod:
598-
type: string
599-
model:
600-
type: string
601-
predictionPeriod:
602-
type: string
603-
type: object
604-
scaleUpStep:
605-
type: string
606-
type: object
607-
autoSetReplicas:
608-
description: layer 2 horizontal auto-scaling, scale up to
609-
more GPU cards if max limits threshold hit
610-
properties:
611-
enable:
612-
type: boolean
613-
evaluationPeriod:
614-
type: string
615-
scaleDownCoolDownTime:
616-
type: string
617-
scaleDownStep:
618-
type: string
619-
scaleUpCoolDownTime:
620-
type: string
621-
scaleUpStep:
622-
type: string
623-
targetTFlopsOfLimits:
624-
type: string
625-
type: object
626-
autoSetRequests:
627-
description: layer 3 adjusting, to match the actual usage
628-
in the long run
629-
properties:
630-
aggregationPeriod:
631-
type: string
632-
evaluationPeriod:
633-
type: string
634-
extraBufferRatio:
635-
description: the request buffer ratio, for example actual
636-
usage is 1.0, 10% buffer will be 1.1 as final preferred
637-
requests
638-
type: string
639-
percentileForAutoRequests:
640-
type: string
641-
prediction:
642-
properties:
643-
enable:
644-
type: boolean
645-
historyDataPeriod:
646-
type: string
647-
model:
648-
type: string
649-
predictionPeriod:
650-
type: string
651-
type: object
652-
type: object
653-
scaleToZero:
654-
description: additional layer to save VRAM, auto-freeze memory
655-
and cool down to RAM and Disk
656-
properties:
657-
autoFreeze:
658-
items:
659-
properties:
660-
enable:
661-
type: boolean
662-
freezeToDiskTTL:
663-
type: string
664-
freezeToMemTTL:
665-
type: string
666-
qos:
667-
enum:
668-
- low
669-
- medium
670-
- high
671-
- critical
672-
type: string
673-
type: object
674-
type: array
675-
intelligenceWarmup:
676-
properties:
677-
enable:
678-
type: boolean
679-
historyDataPeriod:
680-
type: string
681-
model:
682-
type: string
683-
predictionPeriod:
684-
type: string
685-
type: object
686-
type: object
687-
type: object
688-
hypervisor:
689-
description: single GPU device multi-process queuing and fair
690-
scheduling with QoS constraint
691-
properties:
692-
multiProcessQueuing:
693-
properties:
694-
enable:
695-
type: boolean
696-
interval:
697-
type: string
698-
queueLevelTimeSlices:
699-
items:
700-
type: string
701-
type: array
702-
type: object
703-
type: object
704-
placement:
705-
description: place the client or worker to best matched nodes
706-
properties:
707-
allowUsingLocalGPU:
708-
default: true
709-
type: boolean
710-
gpuFilters:
711-
items:
712-
description: "GPUFilter is to select eligible GPUs for scheduling.\n\nexample:\n```yaml\n-
713-
type: avoidTooMuchConnectionsOnSameGPU\nparams:\n\n\tconnectionNum:
714-
150\n\n- type: avoidDifferentZone\nparams:\n\n\t# by default,
715-
GPU worker will be scheduled into the same zone as CPU
716-
Client Pod to align AZ and improve performance\n\ttopologyKey:
717-
topology.kubernetes.io/zone\n\n```"
718-
properties:
719-
params:
720-
type: object
721-
x-kubernetes-preserve-unknown-fields: true
722-
type:
723-
type: string
724-
type: object
725-
type: array
726-
mode:
727-
default: CompactFirst
728-
enum:
729-
- CompactFirst
730-
- LowLoadFirst
731-
type: string
732-
required:
733-
- mode
734-
type: object
735-
reBalancer:
736-
description: |-
737-
avoid hot GPU devices and continuously balance the workload
738-
implemented by trigger a simulation scheduling and advise better GPU nodes for scheduler
739-
properties:
740-
internal:
741-
type: string
742-
reBalanceCoolDownTime:
743-
type: string
744-
threshold:
745-
properties:
746-
matchAny:
747-
type: object
748-
x-kubernetes-preserve-unknown-fields: true
749-
type: object
750-
type: object
751-
required:
752-
- placement
753-
type: object
754573
schedulingConfigTemplate:
755574
type: string
756575
type: object

0 commit comments

Comments (0)