NexusGPU · Code2Life · Feb 16, 2025 · Feb 14, 2025 · Feb 14, 2025 · Feb 14, 2025
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -14,6 +14,17 @@
             },
             "program": "${workspaceFolder}/cmd/operator/main.go",
         },
+        {
+            // Launches the node discovery binary against the local kubeconfig,
+            // with HOSTNAME overridden to "mocknode".
+            "name": "Debug Discovery",
+            "type": "go",
+            "request": "launch",
+            "mode": "auto",
+            "env": {
+                "HOSTNAME": "mocknode",
+                // Env values are passed to the process verbatim: "~" is not
+                // expanded, so resolve the home directory via a VS Code variable.
+                "KUBECONFIG": "${userHome}/.kube/config",
+            },
+            "program": "${workspaceFolder}/cmd/nodediscovery/main.go",
+        },
         {
             "name": "Debug Dev Env Operator",
             "type": "go",

diff --git a/api/v1/gpunode_funcs.go b/api/v1/gpunode_funcs.go
@@ -0,0 +1,27 @@
+package v1
+
+import (
+	"time"
+
+	"k8s.io/apimachinery/pkg/api/resource"
+)
+
+// InitializeStatus replaces node.Status with a freshly built GPUNodeStatus.
+//
+// The phase is set to TensorFusionGPUNodePhasePending, the totals are taken
+// from initTFlops, initVRAM and initGPUs, and ObservedGeneration is copied
+// from the node's current Generation. Every status field not listed here is
+// reset to its zero value because the whole struct is overwritten.
+func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, initGPUs int32) {
+	// The pointer-typed list fields get non-nil pointers to empty slices
+	// rather than being left nil.
+	emptyAllocations := []GPUNodeAllocationDetails{}
+	emptyModels := []string{}
+
+	fresh := GPUNodeStatus{
+		ObservedGeneration:  node.Generation,
+		Phase:               TensorFusionGPUNodePhasePending,
+		TotalGPUs:           initGPUs,
+		TotalTFlops:         initTFlops,
+		TotalVRAM:           initVRAM,
+		ManagedGPUDeviceIDs: []string{},
+		LoadedModels:        &emptyModels,
+		AllocationDetails:   &emptyAllocations,
+	}
+	node.Status = fresh
+}
+
+// SetAnnotationToTriggerNodeSync stamps the node with the
+// "tensor-fusion.ai/refresh-node-state" annotation set to the current time,
+// creating the annotation map first when it is nil. The object is only
+// modified in memory; persisting the change is left to the caller.
+func (node *GPUNode) SetAnnotationToTriggerNodeSync() {
+	if node.Annotations == nil {
+		node.Annotations = make(map[string]string)
+	}
+	// time.Time.String is intended for debugging and appends the monotonic
+	// clock reading ("m=+..."); use an explicit, stable timestamp format.
+	node.Annotations["tensor-fusion.ai/refresh-node-state"] = time.Now().UTC().Format(time.RFC3339Nano)
+}
diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go
@@ -57,6 +57,7 @@ type GPUNodeStatus struct {
 	// +kubebuilder:default=Pending
 	Phase TensorFusionGPUNodePhase `json:"phase"`
 
+	// +optional
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 
 	TotalTFlops resource.Quantity `json:"totalTFlops"`
@@ -68,20 +69,26 @@ type GPUNodeStatus struct {
 	AvailableTFlops resource.Quantity `json:"availableTFlops"`
 	AvailableVRAM   resource.Quantity `json:"availableVRAM"`
 
+	// +optional
 	HypervisorStatus NodeHypervisorStatus `json:"hypervisorStatus,omitempty"`
 
+	// +optional
 	NodeInfo GPUNodeInfo `json:"nodeInfo,omitempty"`
 
-	LoadedModels []string `json:"loadedModels"`
+	// +optional
+	LoadedModels *[]string `json:"loadedModels,omitempty"`
 
-	TotalGPUs           int32    `json:"totalGPUs"`
-	ManagedGPUs         int32    `json:"managedGPUs"`
+	TotalGPUs   int32 `json:"totalGPUs"`
+	ManagedGPUs int32 `json:"managedGPUs"`
+
+	// +optional
 	ManagedGPUDeviceIDs []string `json:"managedGPUDeviceIDs,omitempty"`
 
 	ObservedGeneration int64 `json:"observedGeneration,omitempty"`
 
 	// Allocation details is for node compaction, and calculate used apps
-	AllocationDetails []GPUNodeAllocationDetails `json:"allocationDetails"`
+	// +optional
+	AllocationDetails *[]GPUNodeAllocationDetails `json:"allocationDetails,omitempty"`
 }
 
 type GPUNodeAllocationDetails struct {

diff --git a/api/v1/tensorfusioncluster_types.go b/api/v1/tensorfusioncluster_types.go
@@ -126,7 +126,7 @@ const (
 	AuthTypeServiceAccountRole AuthTypeEnum = "serviceAccountRole"
 )
 
-// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;aliyun;nvidia;tencent;runpod;mock
+// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;alibaba;nvidia;tencent;runpod;mock
 type ComputingVendorName string
 
 const (
@@ -139,7 +139,7 @@ const (
 	ComputingVendorVultr      ComputingVendorName = "vultr"
 	ComputingVendorTogetherAI ComputingVendorName = "together-ai"
 	ComputingVendorLambdaLabs ComputingVendorName = "lambda-labs"
-	ComputingVendorAliyun     ComputingVendorName = "aliyun"
+	ComputingVendorAlibaba    ComputingVendorName = "alibaba"
 	ComputingVendorNvidia     ComputingVendorName = "nvidia"
 	ComputingVendorTencent    ComputingVendorName = "tencent"
 	ComputingVendorRunPod     ComputingVendorName = "runpod"

diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.1.4
+version: 1.2.1
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml
@@ -318,11 +318,9 @@ spec:
                 pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                 x-kubernetes-int-or-string: true
             required:
-            - allocationDetails
             - availableTFlops
             - availableVRAM
             - kubernetesNodeName
-            - loadedModels
             - managedGPUs
             - phase
             - totalGPUs

diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml
@@ -115,7 +115,7 @@ spec:
                     - openshift
                     - vultr
                     - together-ai
-                    - aliyun
+                    - alibaba
                     - nvidia
                     - tencent
                     - runpod

diff --git a/charts/tensor-fusion/templates/controller-deployment.yaml b/charts/tensor-fusion/templates/controller-deployment.yaml
@@ -54,6 +54,9 @@ spec:
             - name: cert
               readOnly: true
               mountPath: /tmp/k8s-webhook-server/serving-certs
+            - name: cloud-vendor-credentials
+              mountPath: /tmp/secret
+              readOnly: true
         {{- if .Values.agent.agentId }}
         - name: cluster-agent
           image: "{{ .Values.agent.image.repository }}:{{ .Values.agent.image.tag | default "latest" }}"
@@ -98,6 +101,10 @@ spec:
           configMap:
             name: {{ include "tensor-fusion.fullname" . }}-vector-config
             defaultMode: 420
+        - name: cloud-vendor-credentials
+          secret:
+            secretName: tf-cloud-vendor-credentials
+            defaultMode: 420
         - name: logs
           emptyDir: {}
       {{- with .Values.controller.affinity }}

diff --git a/charts/tensor-fusion/templates/vector.yaml → ...ensor-fusion/templates/vector-config.yaml b/charts/tensor-fusion/templates/vector.yaml → ...ensor-fusion/templates/vector-config.yaml
diff --git a/charts/tensor-fusion/templates/vendor-credentials.yaml b/charts/tensor-fusion/templates/vendor-credentials.yaml
@@ -0,0 +1,12 @@
+{{- /*
+Secret holding the cloud vendor access key ("ak") and secret key ("sk"),
+populated from .Values.cloudVendorCredentials.
+It is rendered only when no Secret named tf-cloud-vendor-credentials is found
+in the release namespace, and the "keep" resource policy asks Helm not to
+delete it, so an existing Secret is neither overwritten nor removed.
+*/ -}}
+{{- if not (lookup "v1" "Secret" .Release.Namespace "tf-cloud-vendor-credentials") }}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: tf-cloud-vendor-credentials
+  namespace: {{ .Release.Namespace }}
+  annotations:
+    helm.sh/resource-policy: keep
+data:
+  # b64enc is the Helm/Sprig base64 encoder; toString guards against
+  # non-string values (e.g. purely numeric keys) in values.yaml.
+  ak: "{{ .Values.cloudVendorCredentials.accessKey | toString | b64enc }}"
+  sk: "{{ .Values.cloudVendorCredentials.secretKey | toString | b64enc }}"
+{{- end }}
diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml
@@ -74,4 +74,9 @@ agent:
       memory: 64Mi
     limits:
       cpu: 1000m
-      memory: 512Mi
+      memory: 512Mi
+
+# Only needed when a pool runs in Provisioned mode and the cloud vendor does not support IRSA or a similar ServiceAccount-based, zero-credential authentication mechanism
+cloudVendorCredentials:
+  accessKey: "dummy"
+  secretKey: "dummy"
-Original file line number
+Diff line change
@@ Expand Up / @@ -115,7 +115,7 @@ spec: @@
                         - openshift
                         - vultr
                         - together-ai
-                        - aliyun
+                        - alibaba
                         - nvidia
                         - tencent
                         - runpod
@@ Expand Down @@