Skip to content

Commit

Permalink
add gce scale scenario
Browse files Browse the repository at this point in the history
  • Loading branch information
upodroid committed Jan 8, 2024
1 parent eb0a1c3 commit eda14ff
Show file tree
Hide file tree
Showing 14 changed files with 133 additions and 108 deletions.
11 changes: 9 additions & 2 deletions cmd/kops/toolbox_dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,14 @@ func RunToolboxDump(ctx context.Context, f commandutils.Factory, out io.Writer,
return fmt.Errorf("adding key to SSH agent: %w", err)
}

dumper := dump.NewLogDumper(cluster.ObjectMeta.Name, sshConfig, keyRing, options.Dir)
// look for a bastion instance and use it if exists
bastionAddress := ""
for _, instance := range d.Instances {
if strings.Contains(instance.Name, "bastion") {
bastionAddress = instance.PublicAddresses[0]
}
}
dumper := dump.NewLogDumper(bastionAddress, sshConfig, keyRing, options.Dir)

var additionalIPs []string
var additionalPrivateIPs []string
Expand All @@ -224,7 +231,7 @@ func RunToolboxDump(ctx context.Context, f commandutils.Factory, out io.Writer,
}
}

if err := dumper.DumpAllNodes(ctx, nodes, additionalIPs, additionalPrivateIPs); err != nil {
if err := dumper.DumpAllNodes(ctx, nodes, options.MaxNodes, additionalIPs, additionalPrivateIPs); err != nil {
return fmt.Errorf("error dumping nodes: %v", err)
}

Expand Down
6 changes: 3 additions & 3 deletions cmd/nodeup/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@ import (
)

const (
retryInterval = 30 * time.Second
procSelfExe = "/proc/self/exe"
procSelfExe = "/proc/self/exe"
)

func main() {
klog.InitFlags(nil)

var flagConf, flagCacheDir, gitVersion string
var flagRetries int
var retryInterval time.Duration
var dryrun, installSystemdUnit bool
target := "direct"

Expand All @@ -48,10 +48,10 @@ func main() {
flag.StringVar(&flagConf, "conf", "node.yaml", "configuration location")
flag.StringVar(&flagCacheDir, "cache", "/var/cache/nodeup", "the location for the local asset cache")
flag.IntVar(&flagRetries, "retries", -1, "maximum number of retries on failure: -1 means retry forever")
flag.DurationVar(&retryInterval, "retry-interval", 30*time.Second, "number of seconds to wait between tries")
flag.BoolVar(&dryrun, "dryrun", false, "Don't create cloud resources; just show what would be done")
flag.StringVar(&target, "target", target, "Target - direct, dryrun")
flag.BoolVar(&installSystemdUnit, "install-systemd-unit", installSystemdUnit, "If true, will install a systemd unit instead of running directly")

if dryrun {
target = "dryrun"
}
Expand Down
27 changes: 13 additions & 14 deletions pkg/dump/dumper.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,6 @@ import (
"k8s.io/klog/v2"
)

const (
// MaxNodesToDump is the maximum number of nodes to dump
MaxNodesToDump = 500
)

// logDumper gets all the nodes from a kubernetes cluster and dumps a well-known set of logs
type logDumper struct {
sshClientFactory sshClientFactory
Expand All @@ -51,12 +46,15 @@ type logDumper struct {
}

// NewLogDumper is the constructor for a logDumper
func NewLogDumper(clusterName string, sshConfig *ssh.ClientConfig, keyRing agent.Agent, artifactsDir string) *logDumper {
func NewLogDumper(bastionAddress string, sshConfig *ssh.ClientConfig, keyRing agent.Agent, artifactsDir string) *logDumper {
sshClientFactory := &sshClientFactoryImplementation{
bastion: "bastion." + clusterName,
keyRing: keyRing,
sshConfig: sshConfig,
}
if bastionAddress != "" {
log.Printf("detected a bastion instance, with the address: %s", bastionAddress)
sshClientFactory.bastion = bastionAddress
}

d := &logDumper{
sshClientFactory: sshClientFactory,
Expand Down Expand Up @@ -106,9 +104,10 @@ func NewLogDumper(clusterName string, sshConfig *ssh.ClientConfig, keyRing agent
// if the IPs are not found from kubectl get nodes, then these will be dumped also.
// This allows for dumping log on nodes even if they don't register as a kubernetes
// node, or if a node fails to register, or if the whole cluster fails to start.
func (d *logDumper) DumpAllNodes(ctx context.Context, nodes corev1.NodeList, additionalIPs, additionalPrivateIPs []string) error {
func (d *logDumper) DumpAllNodes(ctx context.Context, nodes corev1.NodeList, maxNodesToDump int, additionalIPs, additionalPrivateIPs []string) error {
var special, regular, dumped []*corev1.Node

log.Printf("starting to dump %d nodes fetched through the Kubernetes APIs", len(nodes.Items))
for i := range nodes.Items {
node := &nodes.Items[i]

Expand Down Expand Up @@ -139,8 +138,8 @@ func (d *logDumper) DumpAllNodes(ctx context.Context, nodes corev1.NodeList, add
}

for i := range regular {
if len(dumped) >= MaxNodesToDump {
log.Printf("stopping dumping nodes: %d nodes dumped", MaxNodesToDump)
if len(dumped) >= maxNodesToDump {
log.Printf("stopping dumping nodes: %d nodes dumped", maxNodesToDump)
return nil
}
node := regular[i]
Expand All @@ -154,8 +153,8 @@ func (d *logDumper) DumpAllNodes(ctx context.Context, nodes corev1.NodeList, add

notDumped := findInstancesNotDumped(additionalIPs, dumped)
for _, ip := range notDumped {
if len(dumped) >= MaxNodesToDump {
log.Printf("stopping dumping nodes: %d nodes dumped", MaxNodesToDump)
if len(dumped) >= maxNodesToDump {
log.Printf("stopping dumping nodes: %d nodes dumped", maxNodesToDump)
return nil
}
err := d.dumpNotRegistered(ctx, ip, false)
Expand All @@ -166,8 +165,8 @@ func (d *logDumper) DumpAllNodes(ctx context.Context, nodes corev1.NodeList, add

notDumped = findInstancesNotDumped(additionalPrivateIPs, dumped)
for _, ip := range notDumped {
if len(dumped) >= MaxNodesToDump {
log.Printf("stopping dumping nodes: %d nodes dumped", MaxNodesToDump)
if len(dumped) >= maxNodesToDump {
log.Printf("stopping dumping nodes: %d nodes dumped", maxNodesToDump)
return nil
}
err := d.dumpNotRegistered(ctx, ip, true)
Expand Down
2 changes: 2 additions & 0 deletions pkg/model/gcemodel/firewall.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ func (b *FirewallModelBuilder) Build(c *fi.CloudupModelBuilderContext) error {
// https://cloud.google.com/load-balancing/docs/health-checks
"35.191.0.0/16",
"130.211.0.0/22",
"209.85.204.0/22",
"209.85.152.0/22",
},
TargetTags: []string{b.GCETagForRole(kops.InstanceGroupRoleControlPlane)},
Allowed: []string{"tcp"},
Expand Down
6 changes: 6 additions & 0 deletions pkg/resources/gce/dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ func DumpManagedInstance(op *resources.DumpOperation, r *resources.Resource) err
klog.Warningf("instance %q not found", instance.Instance)
} else {
for _, ni := range instanceDetails.NetworkInterfaces {
if ni.NetworkIP != "" {
i.PrivateAddresses = append(i.PrivateAddresses, ni.NetworkIP)
}
if ni.Ipv6Address != "" {
i.PrivateAddresses = append(i.PrivateAddresses, ni.Ipv6Address)
}
for _, ac := range ni.AccessConfigs {
if ac.NatIP != "" {
i.PublicAddresses = append(i.PublicAddresses, ac.NatIP)
Expand Down
2 changes: 2 additions & 0 deletions tests/e2e/kubetest2-kops/deployer/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ func (d *deployer) initialize() error {
d.SSHPublicKeyPath = publicKey
}
d.createBucket = true
} else if d.SSHPrivateKeyPath == "" && os.Getenv("KUBE_SSH_KEY_PATH") != "" {
d.SSHPrivateKeyPath = os.Getenv("KUBE_SSH_KEY_PATH")
}
}

Expand Down
11 changes: 11 additions & 0 deletions tests/e2e/kubetest2-kops/deployer/dumplogs.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,17 @@ func (d *deployer) DumpClusterLogs() error {
"--private-key", d.SSHPrivateKeyPath,
"--ssh-user", d.SSHUser,
}
version, err := kops.GetVersion(d.KopsBinaryPath)
if err != nil {
return err
}
if version > "1.29" {
// we don't want to dump from more than 10 nodes
// kops has logic to dump the control plane instances first
// remove this code once kops 1.28 is EoL
args = append(args, "--max-nodes", "10")
}

klog.Info(strings.Join(args, " "))
cmd := exec.Command(args[0], args[1:]...)
cmd.SetEnv(append(d.env(), "KOPS_TOOLBOX_DUMP_K8S_RESOURCES=1")...)
Expand Down
1 change: 0 additions & 1 deletion tests/e2e/kubetest2-kops/deployer/up.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,6 @@ func (d *deployer) createCluster(zones []string, adminAccess string, yes bool) e
args = appendIfUnset(args, "--project", d.GCPProject)
}
// set some sane default e2e testing behaviour on gce
args = appendIfUnset(args, "--gce-service-account", "default")
args = appendIfUnset(args, "--networking", "kubenet")
args = appendIfUnset(args, "--node-volume-size", "100")

Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/scenarios/build/run-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ kubetest2 kops -v=6 \
--up --down --build --build-kubernetes=true --target-build-arch=linux/amd64 \
--cloud-provider=gce --admin-access=0.0.0.0/0 \
--kops-version-marker=https://storage.googleapis.com/kops-ci/bin/latest-ci.txt \
--create-args "--networking=kubenet --set=spec.nodeProblemDetector.enabled=true" \
--create-args "--gce-service-account=default --networking=kubenet --set=spec.nodeProblemDetector.enabled=true" \
--test=kops \
-- \
--ginkgo-args="--debug" \
Expand Down
Loading

0 comments on commit eda14ff

Please sign in to comment.