Skip to content

Commit

Permalink
transport: fix logical race in flow control
Browse files Browse the repository at this point in the history
Remove the add and cancel methods of quotaPool. Their use is not required, and
leads to logical races when used concurrently from multiple goroutines. Rename
the reset method to add.

The typical way that a goroutine claims quota is to call the add method and
then to select on the channel returned by the acquire method. If two
goroutines are both trying to claim quota from a single quotaPool, the second
call to the add method can happen before the first attempt to read from the
channel. When that happens the second goroutine to attempt the channel read
will end up waiting for a very long time, in spite of its efforts to prepare
the channel for reading.

The quotaPool will always behave correctly when any positive quota is on the
channel rather than stored in the struct field. In the opposite case, when
positive quota is only in the struct field and not on the channel, users of
the quotaPool can fail to access available quota. Err on the side of storing
any positive quota in the channel.

This includes a reproducer for #632, which fails on many runs with this
package at v1.0.4. The frequency of the test failures depends on how stressed
the server is, since it's now effectively checking for weird interleavings of
goroutines. It passes reliably with these changes to the transport package.

The behavior described in #734 (an RPC with a streaming response hangs
unexpectedly) matches what I've seen in my programs, and what I see in the
test case added here. If it's a logical flow control bug, this change may well
fix it.

Updates #632
Updates #734
  • Loading branch information
rhysh committed Nov 30, 2016
1 parent cc3363f commit e1dc5ab
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 38 deletions.
115 changes: 115 additions & 0 deletions test/end2end_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3188,6 +3188,120 @@ func TestServerCredsDispatch(t *testing.T) {
}
}

func TestFlowControlIssue632(t *testing.T) {
// Test for a regression of https://github.com/grpc/grpc-go/issues/632,
// and other flow control bugs.

defer leakCheck(t)()

const (
itemCount = 100
itemSize = 1 << 10
recvCount = 2
maxFailures = 3

requestCount = 10000
totalTimeout = 30 * time.Second
requestTimeout = time.Second
)

lis, err := net.Listen("tcp", ":0")
if err != nil {
t.Fatalf("Failed to listen: %v", err)
}
defer lis.Close()

s := grpc.NewServer()
testpb.RegisterTestServiceServer(s, &issue632server{
itemCount: itemCount,
itemSize: itemSize,
})
defer s.Stop()

go s.Serve(lis)

ctx := context.Background()
ctx, _ = context.WithTimeout(ctx, totalTimeout)
defer func() {
if ctx.Err() == context.DeadlineExceeded {
t.Fatalf("test timed out")
}
}()

cc, err := grpc.Dial(lis.Addr().String(), grpc.WithInsecure())
if err != nil {
t.Fatalf("grpc.Dial(%q) = %v", lis.Addr().String(), err)
}
defer cc.Close()
cl := testpb.NewTestServiceClient(cc)

failures := 0
for i := 0; i < requestCount; i++ {
ctx, cancel := context.WithTimeout(ctx, requestTimeout)
output, err := cl.StreamingOutputCall(ctx, &testpb.StreamingOutputCallRequest{})
if err != nil {
t.Fatalf("StreamingOutputCall; err = %q", err)
}

j := 0
loop:
for ; j < recvCount; j++ {
_, err := output.Recv()
if err != nil {
if err == io.EOF {
break loop
}
switch grpc.Code(err) {
case codes.DeadlineExceeded:
break loop
default:
t.Fatalf("Recv; err = %q", err)
}
}
}
cancel()
<-ctx.Done()

if j < recvCount {
t.Errorf("got %d responses to request %d", j, i)
failures++
if failures >= maxFailures {
// Continue past the first failure to see if the connection is
// entirely broken, or if only a single RPC was affected
break
}
}
}
}

type issue632server struct {
testpb.TestServiceServer

itemSize int
itemCount int
}

func (s *issue632server) StreamingOutputCall(req *testpb.StreamingOutputCallRequest, srv testpb.TestService_StreamingOutputCallServer) error {
for i := 0; i < s.itemCount; i++ {
err := srv.Send(&testpb.StreamingOutputCallResponse{
Payload: &testpb.Payload{
// Sending a large stream of data which the client reject
// helps to trigger some types of flow control bugs.
//
// Reallocating memory here is inefficient, but the stress it
// puts on the GC leads to more frequent flow control
// failures. The GC likely causes more variety in the
// goroutine scheduling orders.
Body: bytes.Repeat([]byte("a"), s.itemSize),
},
})
if err != nil {
return err
}
}
return nil
}

// interestingGoroutines returns all goroutines we care about for the purpose
// of leak checking. It excludes testing or runtime ones.
func interestingGoroutines() (gs []string) {
Expand All @@ -3208,6 +3322,7 @@ func interestingGoroutines() (gs []string) {
strings.Contains(stack, "testing.tRunner(") ||
strings.Contains(stack, "runtime.goexit") ||
strings.Contains(stack, "created by runtime.gc") ||
strings.Contains(stack, "created by runtime/trace.Start") ||
strings.Contains(stack, "created by google3/base/go/log.init") ||
strings.Contains(stack, "interestingGoroutines") ||
strings.Contains(stack, "runtime.MHeap_Scavenger") ||
Expand Down
30 changes: 2 additions & 28 deletions transport/control.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,35 +111,9 @@ func newQuotaPool(q int) *quotaPool {
return qb
}

// add adds n to the available quota and tries to send it on acquire.
func (qb *quotaPool) add(n int) {
qb.mu.Lock()
defer qb.mu.Unlock()
qb.quota += n
if qb.quota <= 0 {
return
}
select {
case qb.c <- qb.quota:
qb.quota = 0
default:
}
}

// cancel cancels the pending quota sent on acquire, if any.
func (qb *quotaPool) cancel() {
qb.mu.Lock()
defer qb.mu.Unlock()
select {
case n := <-qb.c:
qb.quota += n
default:
}
}

// reset cancels the pending quota sent on acquired, incremented by v and sends
// add cancels the pending quota sent on acquired, incremented by v and sends
// it back on acquire.
func (qb *quotaPool) reset(v int) {
func (qb *quotaPool) add(v int) {
qb.mu.Lock()
defer qb.mu.Unlock()
select {
Expand Down
9 changes: 3 additions & 6 deletions transport/http2_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ func (t *http2Client) NewStream(ctx context.Context, callHdr *CallHdr) (_ *Strea
}
t.mu.Unlock()
if reset {
t.streamsQuota.reset(-1)
t.streamsQuota.add(-1)
}

// HPACK encodes various headers. Note that once WriteField(...) is
Expand Down Expand Up @@ -614,9 +614,6 @@ func (t *http2Client) Write(s *Stream, data []byte, opts *Options) error {
// Wait until the transport has some quota to send the data.
tq, err := wait(s.ctx, s.done, s.goAway, t.shutdownChan, t.sendQuotaPool.acquire())
if err != nil {
if _, ok := err.(StreamError); ok || err == io.EOF {
t.sendQuotaPool.cancel()
}
return err
}
if sq < size {
Expand Down Expand Up @@ -1035,13 +1032,13 @@ func (t *http2Client) applySettings(ss []http2.Setting) {
t.maxStreams = int(s.Val)
t.mu.Unlock()
if reset {
t.streamsQuota.reset(int(s.Val) - ms)
t.streamsQuota.add(int(s.Val) - ms)
}
case http2.SettingInitialWindowSize:
t.mu.Lock()
for _, stream := range t.activeStreams {
// Adjust the sending quota for each stream.
stream.sendQuotaPool.reset(int(s.Val - t.streamSendQuota))
stream.sendQuotaPool.add(int(s.Val - t.streamSendQuota))
}
t.streamSendQuota = s.Val
t.mu.Unlock()
Expand Down
5 changes: 1 addition & 4 deletions transport/http2_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -636,9 +636,6 @@ func (t *http2Server) Write(s *Stream, data []byte, opts *Options) error {
// Wait until the transport has some quota to send the data.
tq, err := wait(s.ctx, nil, nil, t.shutdownChan, t.sendQuotaPool.acquire())
if err != nil {
if _, ok := err.(StreamError); ok {
t.sendQuotaPool.cancel()
}
return err
}
if sq < size {
Expand Down Expand Up @@ -706,7 +703,7 @@ func (t *http2Server) applySettings(ss []http2.Setting) {
t.mu.Lock()
defer t.mu.Unlock()
for _, stream := range t.activeStreams {
stream.sendQuotaPool.reset(int(s.Val - t.streamSendQuota))
stream.sendQuotaPool.add(int(s.Val - t.streamSendQuota))
}
t.streamSendQuota = s.Val
}
Expand Down

0 comments on commit e1dc5ab

Please sign in to comment.