From 83dd28ebb9e6378da94f3bcb28eee3cdf40611be Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Mon, 4 Nov 2019 20:02:44 +0000 Subject: [PATCH] feat(swarm): improve dial context 1. Always return the caller's context error if relevant. 2. Don't return "context canceled" when we're just shutting down. 3. Don't claim that the context deadline has been exceeded when the dial timeout is canceled. --- dial_sync.go | 14 ++++++++++++++ swarm.go | 3 +++ swarm_dial.go | 17 ++++++++++++++--- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/dial_sync.go b/dial_sync.go index f746b9a..4c1230f 100644 --- a/dial_sync.go +++ b/dial_sync.go @@ -2,11 +2,15 @@ package swarm import ( "context" + "errors" "sync" "github.com/libp2p/go-libp2p-core/peer" ) +// TODO: change this text when we fix the bug +var errDialCanceled = errors.New("dial was aborted internally, likely due to https://git.io/Je2wW") + // DialFunc is the type of function expected by DialSync. type DialFunc func(context.Context, peer.ID) (*Conn, error) @@ -78,6 +82,16 @@ func (ad *activeDial) decref() { func (ad *activeDial) start(ctx context.Context) { ad.conn, ad.err = ad.ds.dialFunc(ctx, ad.id) + + // This isn't the user's context so we should fix the error. + switch ad.err { + case context.Canceled: + // The dial was canceled with `CancelDial`. + ad.err = errDialCanceled + case context.DeadlineExceeded: + // We hit an internal timeout, not a context timeout. + ad.err = ErrDialTimeout + } close(ad.waitch) ad.cancel() } diff --git a/swarm.go b/swarm.go index 5366bdc..f7c5771 100644 --- a/swarm.go +++ b/swarm.go @@ -40,6 +40,9 @@ var ErrSwarmClosed = errors.New("swarm closed") // transport is misbehaving. var ErrAddrFiltered = errors.New("address filtered") +// ErrDialTimeout is returned when one a dial times out due to the global timeout +var ErrDialTimeout = errors.New("dial timed out") + // Swarm is a connection muxer, allowing connections to other peers to // be opened and closed, while still using the same Chan for all // communication. The Chan sends/receives Messages, which note the diff --git a/swarm_dial.go b/swarm_dial.go index ffb7ab5..475a7e2 100644 --- a/swarm_dial.go +++ b/swarm_dial.go @@ -221,12 +221,23 @@ func (s *Swarm) dialPeer(ctx context.Context, p peer.ID) (*Conn, error) { defer cancel() conn, err = s.dsync.DialLock(ctx, p) - if err != nil { - return nil, err + if err == nil { + return conn, nil } log.Debugf("network for %s finished dialing %s", s.local, p) - return conn, err + + if ctx.Err() != nil { + // Context error trumps any dial errors as it was likely the ultimate cause. + return nil, ctx.Err() + } + + if s.ctx.Err() != nil { + // Ok, so the swarm is shutting down. + return nil, ErrSwarmClosed + } + + return nil, err } // doDial is an ugly shim method to retain all the logging and backoff logic -- GitLab