@@ -37,51 +37,110 @@ import (
37
37
38
38
// Paths (relative to the repository root) of the kustomization files used by
// the GPU plugin e2e specs, plus the names of the test workload objects.
const (
	kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
	monitoringYaml    = "deployments/gpu_plugin/overlays/monitoring_shared-dev_nfd/kustomization.yaml"
	// NOTE: fixed a doubled path separator ("fractional_resources//kustomization.yaml").
	rmEnabledYaml = "deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml"
	nfdRulesYaml  = "deployments/nfd/overlays/node-feature-rules/kustomization.yaml"

	containerName = "testcontainer"

	tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml"
	tfPodName           = "training-pod"
)
44
47
45
48
func init () {
46
- ginkgo .Describe ("GPU plugin [Device:gpu]" , describe )
49
+ // This needs to be Ordered because only one GPU plugin can function on the node at once.
50
+ ginkgo .Describe ("GPU plugin [Device:gpu]" , describe , ginkgo .Ordered )
51
+ }
52
+
53
+ func createPluginAndVerifyExistence (f * framework.Framework , ctx context.Context , kustomizationPath , baseResource string ) {
54
+ ginkgo .By ("deploying GPU plugin" )
55
+ e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "apply" , "-k" , filepath .Dir (kustomizationPath ))
56
+
57
+ ginkgo .By ("waiting for GPU plugin's availability" )
58
+ podList , err := e2epod .WaitForPodsWithLabelRunningReady (ctx , f .ClientSet , f .Namespace .Name ,
59
+ labels.Set {"app" : "intel-gpu-plugin" }.AsSelector (), 1 /* one replica */ , 100 * time .Second )
60
+ if err != nil {
61
+ e2edebug .DumpAllNamespaceInfo (ctx , f .ClientSet , f .Namespace .Name )
62
+ e2ekubectl .LogFailedContainers (ctx , f .ClientSet , f .Namespace .Name , framework .Logf )
63
+ framework .Failf ("unable to wait for all pods to be running and ready: %v" , err )
64
+ }
65
+
66
+ ginkgo .By ("checking GPU plugin's securityContext" )
67
+ if err = utils .TestPodsFileSystemInfo (podList .Items ); err != nil {
68
+ framework .Failf ("container filesystem info checks failed: %v" , err )
69
+ }
70
+
71
+ ginkgo .By ("checking if the resource is allocatable" )
72
+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , v1 .ResourceName (baseResource ), 30 * time .Second , utils .WaitOpGreater ); err != nil {
73
+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
74
+ }
47
75
}
48
76
49
77
func describe () {
50
78
f := framework .NewDefaultFramework ("gpuplugin" )
51
79
f .NamespacePodSecurityEnforceLevel = admissionapi .LevelPrivileged
52
80
53
- kustomizationPath , errFailedToLocateRepoFile := utils .LocateRepoFile (kustomizationYaml )
81
+ vanillaPath , errFailedToLocateRepoFile := utils .LocateRepoFile (kustomizationYaml )
54
82
if errFailedToLocateRepoFile != nil {
55
83
framework .Failf ("unable to locate %q: %v" , kustomizationYaml , errFailedToLocateRepoFile )
56
84
}
57
85
58
- ginkgo .BeforeEach (func (ctx context.Context ) {
59
- ginkgo .By ("deploying GPU plugin" )
60
- e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "apply" , "-k" , filepath .Dir (kustomizationPath ))
61
-
62
- ginkgo .By ("waiting for GPU plugin's availability" )
63
- podList , err := e2epod .WaitForPodsWithLabelRunningReady (ctx , f .ClientSet , f .Namespace .Name ,
64
- labels.Set {"app" : "intel-gpu-plugin" }.AsSelector (), 1 /* one replica */ , 100 * time .Second )
65
- if err != nil {
66
- e2edebug .DumpAllNamespaceInfo (ctx , f .ClientSet , f .Namespace .Name )
67
- e2ekubectl .LogFailedContainers (ctx , f .ClientSet , f .Namespace .Name , framework .Logf )
68
- framework .Failf ("unable to wait for all pods to be running and ready: %v" , err )
69
- }
70
-
71
- ginkgo .By ("checking GPU plugin's securityContext" )
72
- if err = utils .TestPodsFileSystemInfo (podList .Items ); err != nil {
73
- framework .Failf ("container filesystem info checks failed: %v" , err )
74
- }
75
- })
86
+ monitoringPath , errFailedToLocateRepoFile := utils .LocateRepoFile (monitoringYaml )
87
+ if errFailedToLocateRepoFile != nil {
88
+ framework .Failf ("unable to locate %q: %v" , monitoringYaml , errFailedToLocateRepoFile )
89
+ }
76
90
77
- ginkgo .Context ("When GPU resources are available [Resource:i915]" , func () {
78
- ginkgo .BeforeEach (func (ctx context.Context ) {
79
- ginkgo .By ("checking if the resource is allocatable" )
80
- if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/i915" , 30 * time .Second ); err != nil {
81
- framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
91
+ nfdRulesPath , errFailedToLocateRepoFile := utils .LocateRepoFile (nfdRulesYaml )
92
+ if errFailedToLocateRepoFile != nil {
93
+ framework .Failf ("unable to locate %q: %v" , nfdRulesYaml , errFailedToLocateRepoFile )
94
+ }
95
+
96
+ resourceManagerPath , errFailedToLocateRepoFile := utils .LocateRepoFile (rmEnabledYaml )
97
+ if errFailedToLocateRepoFile != nil {
98
+ framework .Failf ("unable to locate %q: %v" , rmEnabledYaml , errFailedToLocateRepoFile )
99
+ }
100
+
101
+ ginkgo .Context ("When GPU plugin is deployed [Resource:i915]" , func () {
102
+ ginkgo .AfterEach (func (ctx context.Context ) {
103
+ framework .Logf ("Removing gpu-plugin manually" )
104
+
105
+ e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "delete" , "-k" , filepath .Dir (vanillaPath ))
106
+
107
+ // Wait for resources to go to zero
108
+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/i915" , 30 * time .Second , utils .WaitOpZero ); err != nil {
109
+ framework .Failf ("unable to wait for nodes to have no resources: %v" , err )
82
110
}
83
111
})
112
+
84
113
ginkgo .It ("checks availability of GPU resources [App:busybox]" , func (ctx context.Context ) {
114
+ createPluginAndVerifyExistence (f , ctx , vanillaPath , "gpu.intel.com/i915" )
115
+
116
+ podListFunc := framework .ListObjects (f .ClientSet .CoreV1 ().Pods (f .Namespace .Name ).List , metav1.ListOptions {})
117
+
118
+ pods , err := podListFunc (ctx )
119
+ if err != nil {
120
+ framework .Failf ("Couldn't list pods: %+v" , err )
121
+ }
122
+
123
+ if len (pods .Items ) != 1 {
124
+ framework .Failf ("Invalid amount of Pods listed %d" , len (pods .Items ))
125
+ }
126
+
127
+ pluginPod := pods .Items [0 ]
128
+
129
+ ginkgo .By ("checking if CDI path is included in volumes" )
130
+ found := false
131
+ for _ , v := range pluginPod .Spec .Volumes {
132
+ if v .HostPath != nil && v .HostPath .Path == "/var/run/cdi" {
133
+ framework .Logf ("CDI volume found" )
134
+ found = true
135
+
136
+ break
137
+ }
138
+ }
139
+
140
+ if ! found {
141
+ framework .Fail ("Couldn't find CDI volume in GPU plugin deployment" )
142
+ }
143
+
85
144
ginkgo .By ("submitting a pod requesting GPU resources" )
86
145
podSpec := & v1.Pod {
87
146
ObjectMeta : metav1.ObjectMeta {Name : "gpuplugin-tester" },
@@ -122,7 +181,41 @@ func describe() {
122
181
framework .Logf ("found card and renderD from the log" )
123
182
})
124
183
184
+ ginkgo .Context ("When [Deployment:monitoring] deployment is applied [Resource:i915]" , func () {
185
+ ginkgo .It ("check if monitoring resource is available" , func (ctx context.Context ) {
186
+ createPluginAndVerifyExistence (f , ctx , monitoringPath , "gpu.intel.com/i915" )
187
+
188
+ ginkgo .By ("checking if the monitoring resource is allocatable" )
189
+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/i915_monitoring" , 30 * time .Second , utils .WaitOpGreater ); err != nil {
190
+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
191
+ }
192
+ })
193
+ })
194
+
195
+ ginkgo .Context ("When [Deployment:resourceManager] deployment is applied [Resource:i915]" , func () {
196
+ ginkgo .It ("check if i915 resources is available" , func (ctx context.Context ) {
197
+ e2ekubectl .RunKubectlOrDie (f .Namespace .Name , "apply" , "-k" , filepath .Dir (nfdRulesPath ))
198
+
199
+ createPluginAndVerifyExistence (f , ctx , resourceManagerPath , "gpu.intel.com/i915" )
200
+
201
+ // To speed up extended resource detection, let's restart NFD worker
202
+ e2ekubectl .RunKubectlOrDie ("node-feature-discovery" , "rollout" , "restart" , "daemonset" , "nfd-worker" )
203
+
204
+ ginkgo .By ("checking if the millicores resource is allocatable" )
205
+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/millicores" , 30 * time .Second , utils .WaitOpGreater ); err != nil {
206
+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
207
+ }
208
+
209
+ ginkgo .By ("checking if the tiles resource is allocatable" )
210
+ if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/tiles" , 30 * time .Second , utils .WaitOpGreater ); err != nil {
211
+ framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
212
+ }
213
+ })
214
+ })
215
+
125
216
ginkgo .It ("run a small workload on the GPU [App:tensorflow]" , func (ctx context.Context ) {
217
+ createPluginAndVerifyExistence (f , ctx , vanillaPath , "gpu.intel.com/i915" )
218
+
126
219
kustomYaml , err := utils .LocateRepoFile (tfKustomizationYaml )
127
220
if err != nil {
128
221
framework .Failf ("unable to locate %q: %v" , tfKustomizationYaml , err )
@@ -146,13 +239,9 @@ func describe() {
146
239
})
147
240
148
241
ginkgo .Context ("When GPU resources are available [Resource:xe]" , func () {
149
- ginkgo .BeforeEach (func (ctx context.Context ) {
150
- ginkgo .By ("checking if the resource is allocatable" )
151
- if err := utils .WaitForNodesWithResource (ctx , f .ClientSet , "gpu.intel.com/xe" , 30 * time .Second ); err != nil {
152
- framework .Failf ("unable to wait for nodes to have positive allocatable resource: %v" , err )
153
- }
154
- })
155
242
ginkgo .It ("checks availability of GPU resources [App:busybox]" , func (ctx context.Context ) {
243
+ createPluginAndVerifyExistence (f , ctx , vanillaPath , "gpu.intel.com/xe" )
244
+
156
245
ginkgo .By ("submitting a pod requesting GPU resources" )
157
246
podSpec := & v1.Pod {
158
247
ObjectMeta : metav1.ObjectMeta {Name : "gpuplugin-tester" },
0 commit comments