Merge pull request #1374 from tkatila/e2e-gpu-tf

hj-johannes-lee · web-flow · commit 6861ef5106da · 2023-08-22T17:20:10.000+03:00
e2e: gpu: add a basic tensorflow test
diff --git a/deployments/gpu_tensorflow_test/deployment.yaml b/deployments/gpu_tensorflow_test/deployment.yaml
@@ -0,0 +1,26 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: training-pod  # referenced as tfPodName by the e2e test in test/e2e/gpu/gpu.go
+spec:
+  restartPolicy: Never  # one-shot run; the e2e test waits for the pod to succeed
+  containers:
+  - name: testcontainer
+    image: intel/intel-extension-for-tensorflow:latest  # tag is replaced by newTag in kustomization.yaml
+    imagePullPolicy: IfNotPresent
+    securityContext:
+      allowPrivilegeEscalation: false
+    command: ["/bin/sh", "-c"]
+    args: ["python /code/training.py"]  # script comes from the training-code ConfigMap mounted at /code
+    resources:
+      limits:
+        gpu.intel.com/i915: 1
+      requests:
+        gpu.intel.com/i915: 1
+    volumeMounts:
+    - mountPath: /code
+      name: code
+  volumes:
+  - configMap:
+      name: training-code  # generated by the configMapGenerator in kustomization.yaml
+    name: code
diff --git a/deployments/gpu_tensorflow_test/kustomization.yaml b/deployments/gpu_tensorflow_test/kustomization.yaml
@@ -0,0 +1,11 @@
+configMapGenerator:
+- name: training-code  # ConfigMap that deployment.yaml mounts at /code
+  files:
+  - training.py
+
+resources:
+  - deployment.yaml
+
+images:
+  - name: intel/intel-extension-for-tensorflow
+    newTag: 1.2.0-gpu  # pins the image tag; deployment.yaml itself says :latest
diff --git a/deployments/gpu_tensorflow_test/training.py b/deployments/gpu_tensorflow_test/training.py
@@ -0,0 +1,61 @@
+# Copyright 2018 The TensorFlow Authors.
+# Copyright 2023 Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# original code from:
+# https://github.com/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l02c01_celsius_to_fahrenheit.ipynb
+# this is slightly modified to run explicitly with XPU devices
+
+import tensorflow as tf
+import intel_extension_for_tensorflow as itex
+import numpy as np
+
+print("BACKENDS: ", str(itex.get_backend()))
+
+devs = tf.config.list_physical_devices('XPU')
+
+print(devs)
+
+if not devs:
+  raise Exception("No devices found")
+
+with tf.device("/xpu:0"):
+  celsius_q    = np.array([-40, -10,  0,  8, 15, 22,  38],  dtype=float)
+  fahrenheit_a = np.array([-40,  14, 32, 46, 59, 72, 100],  dtype=float)
+
+  model = tf.keras.Sequential([
+    tf.keras.layers.Dense(units=1, input_shape=[1])
+  ])
+
+  model.compile(loss='mean_squared_error',
+                optimizer=tf.keras.optimizers.Adam(0.1))
+
+  history = model.fit(celsius_q, fahrenheit_a, epochs=500, verbose=False)
+
+  print("model trained")
+
+  test = [100.0]
+  p = model.predict(test)
+
+  if len(p) != 1:
+    raise Exception("invalid result obj")
+
+  prediction = p[0]
+
+  if prediction >= 211 and prediction <= 213:
+    print("inference ok: %f" % prediction)
+  else:
+    raise Exception("bad prediction %f" % prediction)
+
+  print("SUCCESS")
diff --git a/test/e2e/gpu/gpu.go b/test/e2e/gpu/gpu.go
@@ -22,6 +22,7 @@ import (
 
 	"github.com/intel/intel-device-plugins-for-kubernetes/test/e2e/utils"
 	"github.com/onsi/ginkgo/v2"
+	"github.com/onsi/gomega"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -35,8 +36,10 @@ import (
 )
 
 const (
-	kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml"
-	containerName     = "testcontainer"
+	kustomizationYaml   = "deployments/gpu_plugin/kustomization.yaml"          // kustomization of the GPU plugin deployment
+	containerName       = "testcontainer"                                      // container name passed to utils.GetPodLogs; matches gpu_tensorflow_test/deployment.yaml
+	tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml" // kustomization of the TensorFlow test workload
+	tfPodName           = "training-pod"                                       // metadata.name of the pod in gpu_tensorflow_test/deployment.yaml
 )
 
 func init() {
@@ -118,5 +121,23 @@ func describe() {
 
 			framework.Logf("found card and renderD from the log")
 		})
+
+		ginkgo.It("run a small workload on the GPU", func(ctx context.Context) {
+			kustomYaml, err := utils.LocateRepoFile(tfKustomizationYaml)
+			if err != nil {
+				framework.Failf("unable to locate %q: %v", tfKustomizationYaml, err)
+			}
+
+			ginkgo.By("submitting demo deployment")
+			e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomYaml))
+
+			ginkgo.By("waiting the pod to finish")
+
+			// Attach the pod logs through a closure: they are fetched only on failure and not parsed as a format string.
+			err = e2epod.WaitForPodSuccessInNamespaceTimeout(ctx, f.ClientSet, tfPodName, f.Namespace.Name, 300*time.Second)
+			gomega.Expect(err).NotTo(gomega.HaveOccurred(), func() string { return utils.GetPodLogs(ctx, f, tfPodName, containerName) })
+
+			framework.Logf("tensorflow execution succeeded!")
+		})
 	})
 }