Skip to content

fix: pooling and cloud vendor provisioning mode bugs #42

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Feb 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@
},
"program": "${workspaceFolder}/cmd/operator/main.go",
},
{
    // Launches the node discovery binary under the Go debugger against a mocked node name.
    "name": "Debug Discovery",
    "type": "go",
    "request": "launch",
    "mode": "auto",
    "env": {
        "HOSTNAME": "mocknode",
        // Env values are passed to the program verbatim (no shell expansion), so a
        // leading "~" would stay a literal character; use ${userHome} instead.
        "KUBECONFIG": "${userHome}/.kube/config",
    },
    "program": "${workspaceFolder}/cmd/nodediscovery/main.go",
},
{
"name": "Debug Dev Env Operator",
"type": "go",
Expand Down
27 changes: 27 additions & 0 deletions api/v1/gpunode_funcs.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package v1

import (
"time"

"k8s.io/apimachinery/pkg/api/resource"
)

// InitializeStatus overwrites node.Status with a freshly built status value.
//
// The new status has its phase set to TensorFusionGPUNodePhasePending, its
// totals taken from initTFlops, initVRAM and initGPUs, and ObservedGeneration
// copied from node.Generation. AllocationDetails, LoadedModels and
// ManagedGPUDeviceIDs are set to empty, non-nil collections; every other
// status field is left at its zero value.
func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, initGPUs int32) {
	// The pointer-typed status fields need addressable empty slices.
	noAllocations := []GPUNodeAllocationDetails{}
	noModels := []string{}

	var fresh GPUNodeStatus
	fresh.Phase = TensorFusionGPUNodePhasePending
	fresh.TotalTFlops = initTFlops
	fresh.TotalVRAM = initVRAM
	fresh.TotalGPUs = initGPUs
	fresh.AllocationDetails = &noAllocations
	fresh.LoadedModels = &noModels
	fresh.ManagedGPUDeviceIDs = []string{}
	fresh.ObservedGeneration = node.Generation

	node.Status = fresh
}

// SetAnnotationToTriggerNodeSync sets the
// "tensor-fusion.ai/refresh-node-state" annotation on the node to the current
// time, creating the annotation map first when it is nil.
//
// The value is an RFC 3339 UTC timestamp with nanosecond precision rather than
// time.Time.String output: String is intended for debugging and appends the
// process-local monotonic clock reading (e.g. "m=+0.0042"), which is
// meaningless once stored on the object.
func (node *GPUNode) SetAnnotationToTriggerNodeSync() {
	// Writing to a nil map panics, so allocate it on first use.
	if node.Annotations == nil {
		node.Annotations = make(map[string]string)
	}
	node.Annotations["tensor-fusion.ai/refresh-node-state"] = time.Now().UTC().Format(time.RFC3339Nano)
}
15 changes: 11 additions & 4 deletions api/v1/gpunode_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ type GPUNodeStatus struct {
// +kubebuilder:default=Pending
Phase TensorFusionGPUNodePhase `json:"phase"`

// +optional
Conditions []metav1.Condition `json:"conditions,omitempty"`

TotalTFlops resource.Quantity `json:"totalTFlops"`
Expand All @@ -68,20 +69,26 @@ type GPUNodeStatus struct {
AvailableTFlops resource.Quantity `json:"availableTFlops"`
AvailableVRAM resource.Quantity `json:"availableVRAM"`

// +optional
HypervisorStatus NodeHypervisorStatus `json:"hypervisorStatus,omitempty"`

// +optional
NodeInfo GPUNodeInfo `json:"nodeInfo,omitempty"`

LoadedModels []string `json:"loadedModels"`
// +optional
LoadedModels *[]string `json:"loadedModels,omitempty"`

TotalGPUs int32 `json:"totalGPUs"`
ManagedGPUs int32 `json:"managedGPUs"`
TotalGPUs int32 `json:"totalGPUs"`
ManagedGPUs int32 `json:"managedGPUs"`

// +optional
ManagedGPUDeviceIDs []string `json:"managedGPUDeviceIDs,omitempty"`

ObservedGeneration int64 `json:"observedGeneration,omitempty"`

// AllocationDetails is used for node compaction and to calculate used apps.
AllocationDetails []GPUNodeAllocationDetails `json:"allocationDetails"`
// +optional
AllocationDetails *[]GPUNodeAllocationDetails `json:"allocationDetails,omitempty"`
}

type GPUNodeAllocationDetails struct {
Expand Down
4 changes: 2 additions & 2 deletions api/v1/tensorfusioncluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ const (
AuthTypeServiceAccountRole AuthTypeEnum = "serviceAccountRole"
)

// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;aliyun;nvidia;tencent;runpod;mock
// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;alibaba;nvidia;tencent;runpod;mock
type ComputingVendorName string

const (
Expand All @@ -139,7 +139,7 @@ const (
ComputingVendorVultr ComputingVendorName = "vultr"
ComputingVendorTogetherAI ComputingVendorName = "together-ai"
ComputingVendorLambdaLabs ComputingVendorName = "lambda-labs"
ComputingVendorAliyun ComputingVendorName = "aliyun"
ComputingVendorAlibaba ComputingVendorName = "alibaba"
ComputingVendorNvidia ComputingVendorName = "nvidia"
ComputingVendorTencent ComputingVendorName = "tencent"
ComputingVendorRunPod ComputingVendorName = "runpod"
Expand Down
18 changes: 13 additions & 5 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion charts/tensor-fusion/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.1.4
version: 1.2.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
2 changes: 0 additions & 2 deletions charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -318,11 +318,9 @@ spec:
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
required:
- allocationDetails
- availableTFlops
- availableVRAM
- kubernetesNodeName
- loadedModels
- managedGPUs
- phase
- totalGPUs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ spec:
- openshift
- vultr
- together-ai
- aliyun
- alibaba
- nvidia
- tencent
- runpod
Expand Down
7 changes: 7 additions & 0 deletions charts/tensor-fusion/templates/controller-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ spec:
- name: cert
readOnly: true
mountPath: /tmp/k8s-webhook-server/serving-certs
- name: cloud-vendor-credentials
mountPath: /tmp/secret
readOnly: true
{{- if .Values.agent.agentId }}
- name: cluster-agent
image: "{{ .Values.agent.image.repository }}:{{ .Values.agent.image.tag | default "latest" }}"
Expand Down Expand Up @@ -98,6 +101,10 @@ spec:
configMap:
name: {{ include "tensor-fusion.fullname" . }}-vector-config
defaultMode: 420
- name: cloud-vendor-credentials
secret:
secretName: tf-cloud-vendor-credentials
defaultMode: 420
- name: logs
emptyDir: {}
{{- with .Values.controller.affinity }}
Expand Down
12 changes: 12 additions & 0 deletions charts/tensor-fusion/templates/vendor-credentials.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{{- if not (lookup "v1" "Secret" .Release.Namespace "tf-cloud-vendor-credentials") }}
# Cloud vendor credentials Secret. It is rendered only when the lookup above
# finds no Secret named tf-cloud-vendor-credentials in the release namespace.
apiVersion: v1
kind: Secret
metadata:
name: tf-cloud-vendor-credentials
namespace: {{ .Release.Namespace }}
annotations:
# NOTE(review): "keep" is Helm's resource policy that asks Helm not to delete
# this resource; presumably intended so credentials survive upgrades — confirm.
helm.sh/resource-policy: keep
data:
# Both values are read from .Values.cloudVendorCredentials and base64-encoded here.
ak: "{{ .Values.cloudVendorCredentials.accessKey | base64encode }}"
sk: "{{ .Values.cloudVendorCredentials.secretKey | base64encode }}"
{{- end }}
7 changes: 6 additions & 1 deletion charts/tensor-fusion/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,9 @@ agent:
memory: 64Mi
limits:
cpu: 1000m
memory: 512Mi
memory: 512Mi

# Only needed if your pool runs in Provisioned mode and the cloud vendor does not support IRSA or a similar ServiceAccount-based, zero-credential authentication approach
cloudVendorCredentials:
accessKey: "dummy"
secretKey: "dummy"
Loading
Loading