Skip to content

Commit

Permalink
Refactor label output
Browse files Browse the repository at this point in the history
Signed-off-by: Evan Lezar <elezar@nvidia.com>
  • Loading branch information
elezar committed Mar 1, 2024
1 parent ca84c1b commit cfcdcce
Show file tree
Hide file tree
Showing 5 changed files with 243 additions and 192 deletions.
21 changes: 11 additions & 10 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,15 @@ func start(c *cli.Context, cfg *Config) error {
}
klog.Info("Start running")
d := &gfd{
manager: manager,
vgpu: vgpul,
config: config,
clientsets: clientSets,
nodeconfig: cfg.nodeConfig,
manager: manager,
vgpu: vgpul,
config: config,

labelOutputer: lm.NewOutputer(
config,
cfg.nodeConfig,
clientSets,
),
}
restart, err := d.run(sigs)
if err != nil {
Expand All @@ -186,8 +190,7 @@ type gfd struct {
vgpu vgpu.Interface
config *spec.Config

clientsets flags.ClientSets
nodeconfig flags.NodeConfig
labelOutputer lm.Outputer
}

func (d *gfd) run(sigs chan os.Signal) (bool, error) {
Expand Down Expand Up @@ -229,9 +232,7 @@ rerun:
}

klog.Info("Creating Labels")
useNodeFeatureAPI := d.config.Flags.UseNodeFeatureAPI != nil && *d.config.Flags.UseNodeFeatureAPI
err = labels.Output(*d.config.Flags.GFD.OutputFile, useNodeFeatureAPI, d.nodeconfig, d.clientsets)
if err != nil {
if err := d.labelOutputer.Output(labels); err != nil {
return false, err
}

Expand Down
30 changes: 18 additions & 12 deletions cmd/gpu-feature-discovery/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
"github.com/stretchr/testify/require"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/flags"
"github.com/NVIDIA/k8s-device-plugin/internal/lm"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
rt "github.com/NVIDIA/k8s-device-plugin/internal/resource/testing"
"github.com/NVIDIA/k8s-device-plugin/internal/vgpu"
Expand Down Expand Up @@ -113,9 +115,10 @@ func TestRunOneshot(t *testing.T) {
defer removeMachineFile(t)

d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
Expand Down Expand Up @@ -164,9 +167,10 @@ func TestRunWithNoTimestamp(t *testing.T) {
defer removeMachineFile(t)

d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
Expand Down Expand Up @@ -227,9 +231,10 @@ func TestRunSleep(t *testing.T) {
var runError error
go func() {
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
runRestart, runError = d.run(sigs)
}()
Expand Down Expand Up @@ -386,9 +391,10 @@ func TestFailOnNVMLInitError(t *testing.T) {
nvmlMock := rt.NewManagerMockWithDevices(rt.NewFullGPU()).WithErrorOnInit(tc.errorOnInit)

d := gfd{
manager: resource.WithConfig(nvmlMock, conf),
vgpu: vgpuMock,
config: conf,
manager: resource.WithConfig(nvmlMock, conf),
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
if tc.expectError {
Expand Down
37 changes: 22 additions & 15 deletions cmd/gpu-feature-discovery/mig_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"github.com/stretchr/testify/require"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/flags"
"github.com/NVIDIA/k8s-device-plugin/internal/lm"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
rt "github.com/NVIDIA/k8s-device-plugin/internal/resource/testing"
)
Expand Down Expand Up @@ -46,9 +48,10 @@ func TestMigStrategyNone(t *testing.T) {
defer removeMachineFile(t)

d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
Expand Down Expand Up @@ -103,9 +106,10 @@ func TestMigStrategySingleForNoMigDevices(t *testing.T) {
defer removeMachineFile(t)

d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
Expand Down Expand Up @@ -167,9 +171,10 @@ func TestMigStrategySingleForMigDeviceMigDisabled(t *testing.T) {
defer removeMachineFile(t)

d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
Expand Down Expand Up @@ -231,9 +236,10 @@ func TestMigStrategySingle(t *testing.T) {
defer removeMachineFile(t)

d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
Expand Down Expand Up @@ -296,9 +302,10 @@ func TestMigStrategyMixed(t *testing.T) {
defer removeMachineFile(t)

d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
Expand Down
155 changes: 0 additions & 155 deletions internal/lm/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,165 +16,10 @@

package lm

import (
"bytes"
"context"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"

"github.com/NVIDIA/k8s-device-plugin/internal/flags"

nfdv1alpha1 "sigs.k8s.io/node-feature-discovery/pkg/apis/nfd/v1alpha1"
nfdclientset "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned"

apiequality "k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const nodeFeatureVendorPrefix = "nvidia-features-for"

// Labels defines a type for labels
type Labels map[string]string

// Labels also implements the Labeler interface
func (labels Labels) Labels() (Labels, error) {
return labels, nil
}

// Output creates labels according to the specified output format.
func (labels Labels) Output(path string, nodeFeatureAPI bool, nodeConifg flags.NodeConfig, clientSets flags.ClientSets) error {
if nodeFeatureAPI {
log.Print("Writing labels to NodeFeature CR")
return labels.UpdateNodeFeatureObject(nodeConifg, clientSets.NFD)
}

return labels.UpdateFile(path)
}

// UpdateFile writes labels to the specified path. The file is written atomocally
func (labels Labels) UpdateFile(path string) error {
log.Printf("Writing labels to output file %s", path)

if path == "" {
_, err := labels.WriteTo(os.Stdout)
return err
}

output := new(bytes.Buffer)
if _, err := labels.WriteTo(output); err != nil {
return fmt.Errorf("error writing labels to buffer: %v", err)
}
err := writeFileAtomically(path, output.Bytes(), 0644)
if err != nil {
return fmt.Errorf("error atomically writing file '%s': %v", path, err)
}
return nil
}

// WriteTo writes labels to the specified writer
func (labels Labels) WriteTo(output io.Writer) (int64, error) {
var total int64
for k, v := range labels {
n, err := fmt.Fprintf(output, "%s=%s\n", k, v)
total += int64(n)
if err != nil {
return total, err
}
}

return total, nil
}

func writeFileAtomically(path string, contents []byte, perm os.FileMode) error {
absPath, err := filepath.Abs(path)
if err != nil {
return fmt.Errorf("failed to retrieve absolute path of output file: %v", err)
}

absDir := filepath.Dir(absPath)
tmpDir := filepath.Join(absDir, "gfd-tmp")

err = os.MkdirAll(tmpDir, os.ModePerm)
if err != nil && !os.IsExist(err) {
return fmt.Errorf("failed to create temporary directory: %v", err)
}
defer func() {
if err != nil {
os.RemoveAll(tmpDir)
}
}()

tmpFile, err := os.CreateTemp(tmpDir, "gfd-")
if err != nil {
return fmt.Errorf("fail to create temporary output file: %v", err)
}
defer func() {
if err != nil {
tmpFile.Close()
os.Remove(tmpFile.Name())
}
}()

err = os.WriteFile(tmpFile.Name(), contents, perm)
if err != nil {
return fmt.Errorf("error writing temporary file '%v': %v", tmpFile.Name(), err)
}

err = os.Rename(tmpFile.Name(), path)
if err != nil {
return fmt.Errorf("error moving temporary file to '%v': %v", path, err)
}

err = os.Chmod(path, perm)
if err != nil {
return fmt.Errorf("error setting permissions on '%v': %v", path, err)
}

return nil
}

// UpdateNodeFeatureObject creates/updates the node-specific NodeFeature custom resource.
func (labels Labels) UpdateNodeFeatureObject(nodeConfig flags.NodeConfig, cli nfdclientset.Interface) error {
nodename := nodeConfig.Name
namespace := nodeConfig.Namespace
nodeFeatureName := strings.Join([]string{nodeFeatureVendorPrefix, nodename}, "-")

if nfr, err := cli.NfdV1alpha1().NodeFeatures(namespace).Get(context.TODO(), nodeFeatureName, metav1.GetOptions{}); errors.IsNotFound(err) {
log.Printf("creating NodeFeature object %s", nodeFeatureName)
nfr = &nfdv1alpha1.NodeFeature{
TypeMeta: metav1.TypeMeta{},
ObjectMeta: metav1.ObjectMeta{Name: nodeFeatureName, Labels: map[string]string{nfdv1alpha1.NodeFeatureObjNodeNameLabel: nodename}},
Spec: nfdv1alpha1.NodeFeatureSpec{Features: *nfdv1alpha1.NewFeatures(), Labels: labels},
}

nfrCreated, err := cli.NfdV1alpha1().NodeFeatures(namespace).Create(context.TODO(), nfr, metav1.CreateOptions{})
if err != nil {
return fmt.Errorf("failed to create NodeFeature object %q: %w", nfr.Name, err)
}

log.Printf("NodeFeature object created: %v", nfrCreated)
} else if err != nil {
return fmt.Errorf("failed to get NodeFeature object: %w", err)
} else {
nfrUpdated := nfr.DeepCopy()
nfrUpdated.Labels = map[string]string{nfdv1alpha1.NodeFeatureObjNodeNameLabel: nodename}
nfrUpdated.Spec = nfdv1alpha1.NodeFeatureSpec{Features: *nfdv1alpha1.NewFeatures(), Labels: labels}

if !apiequality.Semantic.DeepEqual(nfr, nfrUpdated) {
log.Printf("updating NodeFeature object %s", nodeFeatureName)
nfrUpdated, err = cli.NfdV1alpha1().NodeFeatures(namespace).Update(context.TODO(), nfrUpdated, metav1.UpdateOptions{})
if err != nil {
return fmt.Errorf("failed to update NodeFeature object %q: %w", nfr.Name, err)
}
log.Printf("NodeFeature object updated: %v", nfrUpdated)
} else {
log.Printf("no changes in NodeFeature object, not updating")
}
}
return nil
}
Loading

0 comments on commit cfcdcce

Please sign in to comment.