Kafka read error with SASL auth: TOPIC_AUTHORIZATION_FAILED: Not authorized to access topics: [Topic authorization failed.] + GROUP_AUTHORIZATION_FAILED: Not authorized to access group: Group authorization failed #205
Is it possible to grab some logs before the auth failure? Specifically around the broker dialing and connecting. The code looks fine, so something else may be going on. Connections should be reauthenticating, and they should reauthenticate quickly at the right moment. Perhaps something odd is happening on the AWS side that this client "should" account for even more. |
So I have a small "bug" in how I am running the code; let me know if you think this matters (I didn't think it should). Here are the logs, and I will add a bit more code to my comment above for context. To begin with, everything is just getting logged on host A, and host B has no logs for hours.
Initially, host A continues working just fine:
Host A continues reading until 21:45:13, when it hits the
As mentioned before, host B is printing the same logs in a loop
And this pattern continues until 21:46:12, when it just stops
But host A continues to print the fetch errors:
and continues to do so until 21:49:46
|
Could you add debug logging and then look for messages that say "sasl"? It's not clear what's up with the logs above. AWS_MSK_IAM should indicate the credential session expiry when the connection is opened. The client does not close connections when the response indicates an auth failure -- perhaps it should. That'd be a bit to wire in. The auth failure shouldn't really be happening, so I'm wondering if Amazon is unexpectedly expiring the session early for some reason. |
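For reference, turning on debug logging in franz-go looks roughly like this (a minimal sketch; the broker address is a placeholder):

```go
package main

import (
	"os"

	"github.com/twmb/franz-go/pkg/kgo"
)

func main() {
	cl, err := kgo.NewClient(
		kgo.SeedBrokers("localhost:9092"), // placeholder; use your MSK bootstrap brokers
		// Debug-level logging prints the sasl session lifetime and each
		// reauthentication attempt, which is what to look for here.
		kgo.WithLogger(kgo.BasicLogger(os.Stderr, kgo.LogLevelDebug, nil)),
	)
	if err != nil {
		panic(err)
	}
	defer cl.Close()
}
```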
Redeployed with the new log level. I will post them when they occur. It seemed like they happened every 12 hours like clockwork. This is what the general read loop looks like for the host that isn't getting the new data:
And this is what the non-reading host's boot looks like:
And the general read loop for the host that is actively consuming from the one partition in the consumer group:
While the boot sequence for the host that consumes from the consumer group looks like this:
And every once in a while they both do something like this:
As you can see, the host that does read "synced" first, and actually has a partition in the map in the "entering OnPartitionsAssigned" log. |
Okay, so here are the logs of the error with debug mode enabled within franz-go. One thing that is different, though, is that it never fell into the infinite error loop, just the revoking of old partitions and assigning of new ones, so there were far fewer error logs and it resolved after 20 seconds: As expected, the errors happened when it was time to re-authenticate the sasl connection.
And here are the logs of the container that is actively reading:
Maybe every time there needs to be a sasl re-authentication, a rebalance happens? Any ideas on how to avoid this bug? |
So far I continue to not get TOPIC_AUTHORIZATION_FAILED as an error log, but I can now see it as a debug log.
I wanted to reduce clutter since there were already a lot of logs to grok. Did you want to make sure each broker was re-authenticating? The logs are always like
I didn't post the logs before since there are thousands of small read loops.
I think you are onto something though.
In staging it is much more random (
Both environments get the GROUP_AUTHORIZATION_FAILED btw. Let me post more logs from the staging environment since that is simpler to follow, as it has 3 brokers as opposed to 6.
Over the course of 2 days the authentication sequence happens 395 times for brokers 1 and 3, but only 39 times for broker 2. Still enough to re-authenticate every 12 hours, though. I will post logs of what this looks like in a following comment, along with more explicit logs leading up to broker 2's authentication sequence.
I think I mentioned in another issue that I was debugging all the popular Go kafka clients together, having them write heartbeat messages and then read those messages back. They all use the same consumer group. There are no errors reading from a topic in kafka-go, but there are errors when writing to a topic in kafka-go (only when I am using sasl AWS_MSK_IAM): segmentio/kafka-go#992 Do you have a Discord? I don't mind sending a large attachment of logs in a DM over there |
A couple hours of logs around sasl re-authentication... marking the broker:
And more explicit logs of broker 2 sequence when it fails:
As you can see, it is kind of rudderless until |
Coming back to this -- IMO this looks like an AWS problem. It sounds like the clock on the affected broker may be a bit screwy or something and credentials are timing out sooner than they should. I'm not sure what can be done within the client to work around this. The client already works around some other problems of AWS (re-authenticating within 1s if AWS gives a 1s expiry...). Per KIP-368, if a client continues to use a connection after SASL has expired, the broker should close the connection ("and for brokers to close connections that continue to use expired sessions"). The client doesn't close connections for failed auth because usually this implies ACL problems, and closing and reopening the connection doesn't improve anything (and a user can add ACLs on the fly and the client will begin authenticating successfully). Is it possible to get some support from Amazon on this issue? |
Yeah, let me reach out to them. edit: Still an ongoing discussion. They have said: "These errors generally occur because of insufficient permissions/actions given in authorization policy attached to IAM role which is used to authorize while interacting with MSK cluster." But I pointed out that I have tested with very broad permissions and still see the errors.
Waiting to hear back on that point. |
fwiw, I think that others have used AWS_MSK_IAM without running into this issue? |
Hey there, for some reason I'm not seeing your reply on the issue (only in my email), so I'll reply through email as well:
Are you saying that you did not change franz-go at all, did not bump the version, but something in the cluster itself changed such that now you are seeing the same issue in this thread?
On Wed, Oct 19, 2022 at 7:17 AM _ksco wrote:
No, I also encountered this problem, but no new issues were released. In fact, when I used this library in May of this year, I did not observe this error under normal production and consumption. But then the project architecture changed and left kafka idle, without any production or consumption of messages, and now I see [29] Topic Authorization Failed: the client is not authorized to access the requested topic and [3] Unknown Topic Or Partition: the request is for a topic or partition that does not exist on this broker.
|
Still working with AWS. They haven't been much help. Why do you think I am only seeing this on my kafka readers but not my kafka writers (which still use sasl msk_iam auth)? Also, it is weird that the error itself is an authorization error... but then it just gets resolved without me changing the IAM settings. How could this happen? Why is a topic not authorized for access one minute but authorized the next? I see the errors are defined by you: Line 106 in eb2e62d
Is the error just a best guess, or are you getting the error code map from the aws docs? I looked over the aws auth part of the code and I don't see anything that could be going wrong. Maybe there is an issue in determining how long we should wait before re-auth? Lines 785 to 912 in eb2e62d
Would it make sense, when getting an authorization error, to just try a sasl re-auth to be safe? The fact that the errors only ever appear for a few minutes, never something like 30 minutes, makes it seem like a clock issue, such as sasl expiring and then not re-authenticating, rather than an actual auth problem. |
Are the writers constantly writing? Or, are the writes infrequent? Group consumers by default heartbeat every 3s, and fetches are long polling, so both of these will hit authorization errors relatively immediately. If your writes are constant, this is even more of a mystery because the auth is the same and there should be absolutely no difference (and imo points even more to an AWS problem).
From the logs above, it looks like the errors happen relatively close to when a token is supposed to expire. As you noted, the errors occur until the sasl expiry, at which point the client chooses to re-authenticate. This loads fresh credentials, re-authenticates, and then things start working again.
These are not defined by me, these are Kafka protocol errors: https://kafka.apache.org/protocol#protocol_error_codes
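For example, a minimal sketch of how those protocol error codes map to Go errors via franz-go's kerr package:

```go
package main

import (
	"errors"
	"fmt"

	"github.com/twmb/franz-go/pkg/kerr"
)

func main() {
	// kerr maps Kafka protocol error codes to Go errors; franz-go does not
	// invent the codes itself. In the Kafka protocol, 29 is
	// TOPIC_AUTHORIZATION_FAILED and 30 is GROUP_AUTHORIZATION_FAILED.
	err := kerr.ErrorForCode(29)
	fmt.Println(errors.Is(err, kerr.TopicAuthorizationFailed)) // true
	fmt.Println(errors.Is(err, kerr.GroupAuthorizationFailed)) // false
}
```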
The client is already pessimistic in calculating time. Before #136 was addressed, if the broker said the sasl lifetime was 5hr, the client would reauthenticate at 4h59m59s. After #136, we now require 1.1x the sasl latency lower bound, or 2.5s, whichever is larger. In your logs above, the sasl flow is nearly instantaneous, so I expect that reauthentication is always ~11h59m57.5s after the initial sasl flow (which we can also see above).
This is an option but it's ugly internally and would require a lot of wiring, and it's not really a good solution for the normal case of a person actually truly not having perms: closing and reopening connections only to continue rejecting requests is harder on brokers than just rejecting requests (and could lead to socket exhaustion)
Yes, the client could be even more pessimistic. Six hours is way pessimistic, I'd probably err on 15 to 20 minutes earlier, max. FWIW, the Java client itself is also very pessimistic; it reauths after a random percentage of the lifetime ms between 85% and 95%: https://github.com/apache/kafka/blob/trunk/clients/src/main/java/org/apache/kafka/common/security/authenticator/SaslClientAuthenticator.java#L687-L692. This is probably why Java clients do not run into the problem. I'll add a more pessimistic reauth calculation (edited out pseudocode, see real code) |
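As a rough illustration only (the actual pseudocode was edited out above, and the released fix may differ), a Java-style reauthentication deadline at a random point between 85% and 95% of the granted session lifetime could be computed like this:

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// reauthAt picks a Java-client-style reauthentication time: a uniformly
// random point between 85% and 95% of the broker-granted session lifetime.
func reauthAt(start time.Time, lifetime time.Duration) time.Time {
	frac := 0.85 + rand.Float64()*0.10 // uniform in [0.85, 0.95)
	return start.Add(time.Duration(float64(lifetime) * frac))
}

func main() {
	// Example: the ~12h session lifetime seen in the logs above.
	fmt.Println("reauthenticate at:", reauthAt(time.Now(), 12*time.Hour))
}
```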
The writers write every 5 seconds. There are 2 writers since I have 2 replicas running. So there is a write on average every 2.5 seconds.
Okay, yeah, the fact that msk gives me an unauthorized error seems to indicate the brokers prematurely expire the session.
I don't think I have seen it error for more than 10 minutes. Generally there are errors for 3-5 minutes before the sasl re-auth happens.
wow okay
Yeah, I think it makes sense to match what the java client is doing. |
@ekeric13 are you able to try the updates branch to see if that fixes your issue? |
@twmb yeah I deployed it the other day (soon after I responded to you) and I haven't seen any errors yet! Was waiting a bit longer to call it "done" though since in the past it would randomly go a while without errors as well. But so far so good. |
Awesome, good to hear. I'll wait another day or two and assume that no response (or good response) means it's good, and a reply that it's bad indicates things are still bugged |
Yeah, I think it is fair to say the change is good |
This is released in v1.9.1 |
I have a service that continuously writes to (and then reads from) the same topic every 5 seconds.
I have two different clients using franz-go, one with TLS auth and one with SASL via msk_iam auth.
My writer seems to be fine, but my reader errors every few hours. The errors are always the same.
I will get 200k of these errors over the course of 4 minutes and then everything resumes fine again.
My reader code was based heavily on this example:
https://github.com/twmb/franz-go/blob/master/examples/goroutine_per_partition_consuming/autocommit_marks/main.go
This is a snippet of the code:
And this is how the client is instantiated:
partitions assigned callback:
partitions lost callback:
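Since the snippets referenced above did not survive the copy, here is a minimal, simplified sketch of this consuming setup with AWS_MSK_IAM sasl. The broker addresses, topic, and group are placeholders, and the per-partition goroutines of the linked example are collapsed into a single poll loop, so this is not the original code:

```go
package main

import (
	"context"
	"crypto/tls"
	"log"

	awscfg "github.com/aws/aws-sdk-go-v2/config"
	"github.com/twmb/franz-go/pkg/kgo"
	"github.com/twmb/franz-go/pkg/sasl/aws"
)

func main() {
	ctx := context.Background()

	// Placeholders: the real values come from the MSK cluster.
	seeds := []string{"b-1.example.kafka.us-east-1.amazonaws.com:9098"}
	topic, group := "example-topic", "example-group"

	cfg, err := awscfg.LoadDefaultConfig(ctx)
	if err != nil {
		log.Fatal(err)
	}

	cl, err := kgo.NewClient(
		kgo.SeedBrokers(seeds...),
		kgo.ConsumeTopics(topic),
		kgo.ConsumerGroup(group),
		// Autocommit only records that have been explicitly marked.
		kgo.AutoCommitMarks(),
		kgo.OnPartitionsAssigned(func(_ context.Context, _ *kgo.Client, assigned map[string][]int32) {
			log.Printf("assigned: %v", assigned)
		}),
		kgo.OnPartitionsLost(func(_ context.Context, _ *kgo.Client, lost map[string][]int32) {
			log.Printf("lost: %v", lost)
		}),
		// MSK IAM needs TLS plus the AWS_MSK_IAM sasl mechanism. The
		// credential callback runs on every (re)authentication, so rotated
		// session tokens are picked up.
		kgo.DialTLSConfig(new(tls.Config)),
		kgo.SASL(aws.ManagedStreamingIAM(func(ctx context.Context) (aws.Auth, error) {
			creds, err := cfg.Credentials.Retrieve(ctx)
			if err != nil {
				return aws.Auth{}, err
			}
			return aws.Auth{
				AccessKey:    creds.AccessKeyID,
				SecretKey:    creds.SecretAccessKey,
				SessionToken: creds.SessionToken,
			}, nil
		})),
	)
	if err != nil {
		log.Fatal(err)
	}
	defer cl.Close()

	for {
		fetches := cl.PollFetches(ctx)
		if fetches.IsClientClosed() {
			return
		}
		fetches.EachError(func(t string, p int32, err error) {
			// This is where TOPIC_AUTHORIZATION_FAILED surfaces as a fetch error.
			log.Printf("fetch error on %s/%d: %v", t, p, err)
		})
		fetches.EachPartition(func(p kgo.FetchTopicPartition) {
			for _, rec := range p.Records {
				_ = rec // process the record here
			}
			// Mark processed records so the marked autocommit can commit them.
			cl.MarkCommitRecords(p.Records...)
		})
	}
}
```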