Skip to content

Commit dc733cb

Browse files
committed
xelink support for gpu_fakedev
1 parent f5caba8 commit dc733cb

File tree

4 files changed

+91
-74
lines changed

4 files changed

+91
-74
lines changed

cmd/gpu_fakedev/README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,8 @@ file, but each new device variant adding feature(s) that have specific
2626
support in device plugin, could have their own fake device config.
2727

2828
Xelink support: There are two ways to generate Xelink info.
29-
1. Use xelink-topology FULL
30-
2. Use direct xelink connection line
31-
example: "xelink-connections": "0.0-0.1_0.0-1.0_0.0-1.1_0.0-2.0_0.0-2.1_0.0-3.0_0.0-3.1_0.0-4.0_0.0-4.1_0.0-5.0_0.0-5.1_0.0-6.0_0.0-6.1_0.0-7.0_0.0-7.1_0.1-1"
29+
1. Use xelink-topology FULL: See example: `config/8x2-PVC-xelink.json`
30+
2. Use direct xelink connection line: See example: `config/2x4-PVC-xelink.json`
3231

3332
## Potential improvements
3433

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"Info": "2x 4 tile 4 GiB PVC [Ponte Vecchio] GPUs",
3+
"DevCount": 2,
4+
"TilesPerDev": 4,
5+
"DevsPerNode": 1,
6+
"DevMemSize": 4294967296,
7+
"Capabilities": {
8+
"platform": "fake_PVC",
9+
"connections": "0.1-0.0_0.2-0.0_0.3-0.0_1.0-0.0_1.1-0.0_1.2-0.0_1.3-0.0_0.2-0.1_0.3-0.1_1.0-0.1_1.1-0.1_1.2-0.1_1.3-0.1_0.3-0.2_1.0-0.2_1.1-0.2_1.2-0.2_1.3-0.2_1.0-0.3_1.1-0.3_1.2-0.3_1.3-0.3_1.1-1.0_1.2-1.0_1.3-1.0_1.2-1.1_1.3-1.1_1.3-1.2",
10+
"connection-topology": "RAW"
11+
}
12+
}
13+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"Info": "8x 4 GiB PVC [Ponte Vecchio] GPUs",
3+
"DevCount": 8,
4+
"TilesPerDev": 2,
5+
"DevsPerNode": 2,
6+
"DevMemSize": 4294967296,
7+
"Capabilities": {
8+
"platform": "fake_PVC",
9+
"connections": "",
10+
"connection-topology": "FULL"
11+
}
12+
}

cmd/gpu_fakedev/gpu_fakedev.go

Lines changed: 64 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2021-2022 Intel Corporation. All Rights Reserved.
1+
// Copyright 2021-2023 Intel Corporation. All Rights Reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -39,13 +39,16 @@ import (
3939
"errors"
4040
"flag"
4141
"fmt"
42-
"golang.org/x/sys/unix"
4342
"io/fs"
4443
"log"
4544
"math"
4645
"os"
46+
"strings"
47+
48+
"golang.org/x/sys/unix"
4749
)
4850

51+
// https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
4952
const (
5053
dirMode = 0775
5154
fileMode = 0644
@@ -56,11 +59,11 @@ const (
5659
devfsPath = "dev"
5760
mib = 1024.0 * 1024.0
5861
// null device major, minor on linux.
59-
devNullMajor = 1
60-
devNullMinor = 3
61-
devNullType = unix.S_IFCHR
62-
maxNfdLileLen = 63
63-
fullyConnected = "FULL"
62+
devNullMajor = 1
63+
devNullMinor = 3
64+
devNullType = unix.S_IFCHR
65+
maxK8sLabelSize = 63
66+
fullyConnected = "FULL"
6467
)
6568

6669
var verbose bool
@@ -79,6 +82,11 @@ type genOptions struct {
7982
devs int
8083
}
8184

85+
// min returns the smaller of val1 and val2.
//
// The comparison is done by math.Min on float64 conversions of the arguments,
// so it is exact only for values that float64 can represent exactly.
//
// TODO replace this with template version of Min(), once Golang standard library gets one
func min(val1 int, val2 int) int {
	smaller := math.Min(float64(val1), float64(val2))

	return int(smaller)
}
89+
8290
func addSysfsDriTree(root string, opts *genOptions, i int) error {
8391
card := cardBase + i
8492
base := fmt.Sprintf("%s/class/drm/card%d", root, card)
@@ -240,7 +248,7 @@ func generateDriFiles(opts genOptions) {
240248
log.Printf("Generating fake DRI device(s) sysfs, debugfs and devfs content under '%s' & '%s'",
241249
sysfsPath, devfsPath)
242250

243-
makeXelinkSideCar(opts.Capabilities["xelink-topology"], opts.DevCount, opts.TilesPerDev, opts.Capabilities["xelink-topology"])
251+
makeXelinkSideCar(opts.Capabilities["connection-topology"], opts.DevCount, opts.TilesPerDev, opts.Capabilities["connections"])
244252

245253
opts.dirs, opts.files = 0, 0
246254
for i := 0; i < opts.DevCount; i++ {
@@ -306,76 +314,57 @@ func getOptions(name string) genOptions {
306314
return opts
307315
}
308316

309-
func makeXelinkSideCar(topology string, gpus int, tiles int, connections string) {
317+
func makeXelinkSideCar(topology string, gpus, tiles int, connections string) {
310318
if topology != fullyConnected {
311-
log.Printf("XELINK: generate xelink sidecar label file, using (GPUs: %d, Tiles: %d, Connections: %s)", gpus, tiles, connections)
319+
log.Printf("XELINK: generate xelink sidecar label file, using (GPUs: %d, Tiles: %d)", gpus, tiles)
312320
} else {
313321
log.Printf("XELINK: generate xelink sidecar label file, using (GPUs: %d, Tiles: %d, Topology: %s)", gpus, tiles, topology)
314322
}
315323

316-
smap := ""
317-
318324
if topology == fullyConnected {
319-
// create 2d slice with zeros
320-
var cmap = make([][]int, 0)
321-
322-
var total = gpus * tiles
325+
saveSideCarFile(buildConnectionList(gpus, tiles))
326+
} else {
327+
saveSideCarFile(connections)
328+
}
329+
}
323330

324-
for y := 0; y < total; y++ {
325-
var xmap = make([]int, 0)
331+
// buildConnectionList returns the Xelink connection list for a fully connected
// set of gpus devices with tiles tiles each.
//
// Every tile is named "<gpu>.<tile>" and each unordered tile pair is listed
// exactly once as "<later>-<earlier>", with the links joined by "_", e.g.
// "0.1-0.0_1.0-0.0_...". The result has no trailing newline, so it can be
// used as a label value as-is. An empty string is returned when either count
// is not positive.
func buildConnectionList(gpus, tiles int) string {
	if gpus <= 0 || tiles <= 0 {
		return ""
	}

	nodes := make([]string, 0, gpus*tiles)

	for gpu := 0; gpu < gpus; gpu++ {
		for tile := 0; tile < tiles; tile++ {
			nodes = append(nodes, fmt.Sprintf("%d.%d", gpu, tile))
		}
	}

	links := make([]string, 0, len(nodes)*(len(nodes)-1)/2)

	for i, from := range nodes {
		// Pairing each node only with the ones after it skips self links and
		// reverse duplicates. TODO ignore in-gpu xelinks
		for _, to := range nodes[i+1:] {
			links = append(links, to+"-"+from)
		}
	}

	return strings.Join(links, "_")
}
380369

381370
func saveSideCarFile(connections string) {
@@ -385,32 +374,36 @@ func saveSideCarFile(connections string) {
385374
}
386375
defer f.Close()
387376

388-
if len(connections) > maxNfdLileLen {
389-
line := fmt.Sprintf("xpumanager.intel.com/xe-links=%s", connections[:maxNfdLileLen])
377+
if len(connections) <= maxK8sLabelSize {
378+
line := fmt.Sprintf("xpumanager.intel.com/xe-links=%s", connections)
390379
fmt.Println(line)
391380

392381
if _, err := f.WriteString(line + "\n"); err != nil {
393382
panic(err)
394383
}
395384

396-
index := 2
385+
return
386+
}
387+
388+
// Write first line without Z prefix
389+
line := fmt.Sprintf("xpumanager.intel.com/xe-links=%s", connections[:maxK8sLabelSize])
390+
fmt.Println(line)
397391

398-
for i := maxNfdLileLen; i < len(connections); i += (maxNfdLileLen - 1) {
399-
line := fmt.Sprintf("xpumanager.intel.com/xe-links%d=Z%s", index, connections[i:int(math.Min(float64(len(connections)), float64(i+maxNfdLileLen-1)))])
400-
fmt.Println(line)
392+
if _, err := f.WriteString(line + "\n"); err != nil {
393+
panic(err)
394+
}
401395

402-
if _, err := f.WriteString(line + "\n"); err != nil {
403-
panic(err)
404-
}
405-
index++
406-
}
407-
} else {
408-
line := fmt.Sprintf("xpumanager.intel.com/xe-links=%s", connections[:int(math.Min(float64(len(connections)), float64(maxNfdLileLen)))])
396+
index := 2
397+
398+
// Write next lines with Z prefix
399+
for i := maxK8sLabelSize; i < len(connections); i += (maxK8sLabelSize - 1) {
400+
line := fmt.Sprintf("xpumanager.intel.com/xe-links%d=Z%s", index, connections[i:min(len(connections), i+maxK8sLabelSize-1)])
409401
fmt.Println(line)
410402

411403
if _, err := f.WriteString(line + "\n"); err != nil {
412404
panic(err)
413405
}
406+
index++
414407
}
415408
}
416409

0 commit comments

Comments
 (0)