diff --git a/dial_error.go b/dial_error.go new file mode 100644 index 0000000000000000000000000000000000000000..6d5bb149294e733a802f2847abef80403ed10c7f --- /dev/null +++ b/dial_error.go @@ -0,0 +1,69 @@ +package swarm + +import ( + "fmt" + "strings" + + peer "github.com/libp2p/go-libp2p-peer" + ma "github.com/multiformats/go-multiaddr" +) + +// maxDialDialErrors is the maximum number of dial errors we record +const maxDialDialErrors = 16 + +// DialError is the error type returned when dialing. +type DialError struct { + Peer peer.ID + DialErrors []TransportError + Cause error + Skipped int +} + +func (e *DialError) recordErr(addr ma.Multiaddr, err error) { + if len(e.DialErrors) >= maxDialDialErrors { + e.Skipped++ + return + } + e.DialErrors = append(e.DialErrors, TransportError{ + Address: addr, + Cause: err, + }) +} + +func (e *DialError) Error() string { + var builder strings.Builder + fmt.Fprintf(&builder, "failed to dial %s:", e.Peer) + if e.Cause != nil { + fmt.Fprintf(&builder, " %s", e.Cause) + } + for _, te := range e.DialErrors { + fmt.Fprintf(&builder, "\n * [%s] %s", te.Address, te.Cause) + } + if e.Skipped > 0 { + fmt.Fprintf(&builder, "\n ... skipping %d errors ...", e.Skipped) + } + return builder.String() +} + +// Unwrap implements https://godoc.org/golang.org/x/xerrors#Wrapper. +func (e *DialError) Unwrap() error { + // If we have a context error, that's the "ultimate" error. + if e.Cause != nil { + return e.Cause + } + return nil +} + +var _ error = (*DialError)(nil) + +// TransportError is the error returned when dialing a specific address. +type TransportError struct { + Address ma.Multiaddr + Cause error +} + +func (e *TransportError) Error() string { + return fmt.Sprintf("failed to dial %s: %s", e.Address, e.Cause) +} + +var _ error = (*TransportError)(nil) diff --git a/dial_test.go b/dial_test.go index adb3438ac46d58eb2a4957085f21b35817c86658..99585d2ce7a8cdad9aeef312ced536fac4c47619 100644 --- a/dial_test.go +++ b/dial_test.go @@ -2,9 +2,7 @@ package swarm_test import ( "context" - "fmt" "net" - "regexp" "sync" "testing" "time" @@ -491,8 +489,8 @@ func TestDialPeerFailed(t *testing.T) { defer closeSwarms(swarms) testedSwarm, targetSwarm := swarms[0], swarms[1] - exceptedErrorsCount := 5 - for i := 0; i < exceptedErrorsCount; i++ { + expectedErrorsCount := 5 + for i := 0; i < expectedErrorsCount; i++ { _, silentPeerAddress, silentPeerListener := newSilentPeer(t) go acceptAndHang(silentPeerListener) defer silentPeerListener.Close() @@ -508,23 +506,17 @@ func TestDialPeerFailed(t *testing.T) { t.Fatal(err) } - // dial_test.go:508: correctly get a combined error: dial attempt failed: 10 errors occurred: - // * --> (/ip4/127.0.0.1/tcp/46485) dial attempt failed: failed to negotiate security protocol: context deadline exceeded - // * --> (/ip4/127.0.0.1/tcp/34881) dial attempt failed: failed to negotiate security protocol: context deadline exceeded + // dial_test.go:508: correctly get a combined error: failed to dial PEER: all dials failed + // * [/ip4/127.0.0.1/tcp/46485] failed to negotiate security protocol: context deadline exceeded + // * [/ip4/127.0.0.1/tcp/34881] failed to negotiate security protocol: context deadline exceeded // ... - errorCountRegexpString := fmt.Sprintf("%d errors occurred", exceptedErrorsCount) - errorCountRegexp := regexp.MustCompile(errorCountRegexpString) - if !errorCountRegexp.MatchString(err.Error()) { - t.Fatalf("can't find total err count: `%s' in `%s'", errorCountRegexpString, err.Error()) + dialErr, ok := err.(*DialError) + if !ok { + t.Fatalf("expected *DialError, got %T", err) } - connectErrorsRegexpString := `\* --> \(.+?\) dial attempt failed:.+` - connectErrorsRegexp := regexp.MustCompile(connectErrorsRegexpString) - connectErrors := connectErrorsRegexp.FindAll([]byte(err.Error()), -1) - if len(connectErrors) != exceptedErrorsCount { - t.Fatalf("connectErrors must contain %d errros; "+ - "but `%s' was found in `%s' %d times", - exceptedErrorsCount, connectErrorsRegexpString, err.Error(), len(connectErrors)) + if len(dialErr.DialErrors) != expectedErrorsCount { + t.Errorf("expected %d errors, got %d", expectedErrorsCount, len(dialErr.DialErrors)) } } diff --git a/go.mod b/go.mod index 15b27b0b9e37ab5e5318b7bae7c10c4b4fe069fb..413ba0e33ca6bb72b5f27547e19bbad6ed1a8492 100644 --- a/go.mod +++ b/go.mod @@ -1,7 +1,6 @@ module github.com/libp2p/go-libp2p-swarm require ( - github.com/hashicorp/go-multierror v1.0.0 github.com/ipfs/go-log v0.0.1 github.com/jbenet/goprocess v0.0.0-20160826012719-b497e2f366b8 github.com/libp2p/go-addr-util v0.0.1 diff --git a/go.sum b/go.sum index 2fae010ff22384c983e7c23ffc2481c922b88c65..2499b75b6a4289980f2fcf63b5145be51ac7fb91 100644 --- a/go.sum +++ b/go.sum @@ -31,10 +31,6 @@ github.com/gxed/hashland/keccakpg v0.0.1 h1:wrk3uMNaMxbXiHibbPO4S0ymqJMm41WiudyF github.com/gxed/hashland/keccakpg v0.0.1/go.mod h1:kRzw3HkwxFU1mpmPP8v1WyQzwdGfmKFJ6tItnhQ67kU= github.com/gxed/hashland/murmur3 v0.0.1 h1:SheiaIt0sda5K+8FLz952/1iWS9zrnKsEJaOJu4ZbSc= github.com/gxed/hashland/murmur3 v0.0.1/go.mod h1:KjXop02n4/ckmZSnY2+HKcLud/tcmvhST0bie/0lS48= -github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= -github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= -github.com/hashicorp/go-multierror v1.0.0 h1:iVjPR7a6H0tWELX5NxNe7bYopibicUzc7uPribsnS6o= -github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= diff --git a/swarm_dial.go b/swarm_dial.go index d4b315525f87727a41d96bac62723e6e506c8a08..3feeffe3fe11ed61e0067557f069c5224e9a5c9b 100644 --- a/swarm_dial.go +++ b/swarm_dial.go @@ -7,8 +7,6 @@ import ( "sync" "time" - "github.com/hashicorp/go-multierror" - logging "github.com/ipfs/go-log" addrutil "github.com/libp2p/go-addr-util" lgbl "github.com/libp2p/go-libp2p-loggables" @@ -34,15 +32,23 @@ var ( // been dialed too frequently ErrDialBackoff = errors.New("dial backoff") - // ErrDialFailed is returned when connecting to a peer has ultimately failed - ErrDialFailed = errors.New("dial attempt failed") - // ErrDialToSelf is returned if we attempt to dial our own peer ErrDialToSelf = errors.New("dial to self attempted") // ErrNoTransport is returned when we don't know a transport for the // given multiaddr. ErrNoTransport = errors.New("no transport for protocol") + + // ErrAllDialsFailed is returned when connecting to a peer has ultimately failed + ErrAllDialsFailed = errors.New("all dials failed") + + // ErrNoAddresses is returned when we fail to find any addresses for a + // peer we're trying to dial. + ErrNoAddresses = errors.New("no addresses") + + // ErrNoAddresses is returned when we find addresses for a peer but + // can't use any of them. + ErrNoGoodAddresses = errors.New("no good addresses") ) // DialAttempts governs how many times a goroutine will try to dial a given peer. @@ -256,7 +262,7 @@ func (s *Swarm) doDial(ctx context.Context, p peer.ID) (*Conn, error) { } // ok, we failed. - return nil, fmt.Errorf("dial attempt failed: %s", err) + return nil, err } return conn, nil } @@ -293,11 +299,11 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) { */ peerAddrs := s.peers.Addrs(p) if len(peerAddrs) == 0 { - return nil, errors.New("no addresses") + return nil, &DialError{Peer: p, Cause: ErrNoAddresses} } goodAddrs := s.filterKnownUndialables(peerAddrs) if len(goodAddrs) == 0 { - return nil, errors.New("no good addresses") + return nil, &DialError{Peer: p, Cause: ErrNoGoodAddresses} } goodAddrsChan := make(chan ma.Multiaddr, len(goodAddrs)) for _, a := range goodAddrs { @@ -307,10 +313,18 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) { ///////// // try to get a connection to any addr - connC, err := s.dialAddrs(ctx, p, goodAddrsChan) - if err != nil { - logdial["error"] = err.Error() - return nil, err + connC, dialErr := s.dialAddrs(ctx, p, goodAddrsChan) + if dialErr != nil { + logdial["error"] = dialErr.Cause.Error() + if dialErr.Cause == context.Canceled { + // always prefer the "context canceled" error. + // we rely on behing able to check `err == context.Canceled` + // + // Removing this will BREAK backoff (causing us to + // backoff when canceling dials). + return nil, context.Canceled + } + return nil, dialErr } logdial["conn"] = logging.Metadata{ "localAddr": connC.LocalMultiaddr(), @@ -320,7 +334,7 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) { if err != nil { logdial["error"] = err.Error() connC.Close() // close the connection. didn't work out :( - return nil, err + return nil, &DialError{Peer: p, Cause: err} } logdial["dial"] = "success" @@ -352,7 +366,7 @@ func (s *Swarm) filterKnownUndialables(addrs []ma.Multiaddr) []ma.Multiaddr { ) } -func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.Multiaddr) (transport.Conn, error) { +func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.Multiaddr) (transport.Conn, *DialError) { log.Debugf("%s swarm dialing %s", s.local, p) ctx, cancel := context.WithCancel(ctx) @@ -360,26 +374,23 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma. // use a single response type instead of errs and conns, reduces complexity *a ton* respch := make(chan dialResult) - var dialErrors *multierror.Error + err := new(DialError) defer s.limiter.clearAllPeerDials(p) var active int +dialLoop: for remoteAddrs != nil || active > 0 { // Check for context cancellations and/or responses first. select { case <-ctx.Done(): - if dialError := dialErrors.ErrorOrNil(); dialError != nil { - return nil, dialError - } - - return nil, ctx.Err() + break dialLoop case resp := <-respch: active-- if resp.Err != nil { // Errors are normal, lots of dials will fail log.Infof("got error on dial: %s", resp.Err) - dialErrors = multierror.Append(dialErrors, resp.Err) + err.recordErr(resp.Addr, resp.Err) } else if resp.Conn != nil { return resp.Conn, nil } @@ -400,28 +411,27 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma. s.limitedDial(ctx, p, addr, respch) active++ case <-ctx.Done(): - if dialError := dialErrors.ErrorOrNil(); dialError != nil { - return nil, dialError - } - - return nil, ctx.Err() + break dialLoop case resp := <-respch: active-- if resp.Err != nil { // Errors are normal, lots of dials will fail log.Infof("got error on dial: %s", resp.Err) - dialErrors = multierror.Append(dialErrors, resp.Err) + err.recordErr(resp.Addr, resp.Err) } else if resp.Conn != nil { return resp.Conn, nil } } } - if dialError := dialErrors.ErrorOrNil(); dialError != nil { - return nil, dialError + if ctxErr := ctx.Err(); ctxErr != nil { + err.Cause = ctxErr + } else if len(err.DialErrors) == 0 { + err.Cause = inet.ErrNoRemoteAddrs + } else { + err.Cause = ErrAllDialsFailed } - - return nil, inet.ErrNoRemoteAddrs + return nil, err } // limitedDial will start a dial to the given peer when @@ -450,7 +460,7 @@ func (s *Swarm) dialAddr(ctx context.Context, p peer.ID, addr ma.Multiaddr) (tra connC, err := tpt.Dial(ctx, addr, p) if err != nil { - return nil, fmt.Errorf("%s --> %s (%s) dial attempt failed: %s", s.local, p, addr, err) + return nil, err } // Trust the transport? Yeah... right.