Unverified Commit 0fc01368 authored by Steven Allen's avatar Steven Allen Committed by GitHub

Merge pull request #121 from libp2p/feat/improve-errors

dial: return a nice custom dial error
parents 94a49f12 755d0634
package swarm
import (
"fmt"
"strings"
peer "github.com/libp2p/go-libp2p-peer"
ma "github.com/multiformats/go-multiaddr"
)
// maxDialDialErrors is the maximum number of dial errors we record
const maxDialDialErrors = 16
// DialError is the error type returned when dialing.
type DialError struct {
Peer peer.ID
DialErrors []TransportError
Cause error
Skipped int
}
func (e *DialError) recordErr(addr ma.Multiaddr, err error) {
if len(e.DialErrors) >= maxDialDialErrors {
e.Skipped++
return
}
e.DialErrors = append(e.DialErrors, TransportError{
Address: addr,
Cause: err,
})
}
func (e *DialError) Error() string {
var builder strings.Builder
fmt.Fprintf(&builder, "failed to dial %s:", e.Peer)
if e.Cause != nil {
fmt.Fprintf(&builder, " %s", e.Cause)
}
for _, te := range e.DialErrors {
fmt.Fprintf(&builder, "\n * [%s] %s", te.Address, te.Cause)
}
if e.Skipped > 0 {
fmt.Fprintf(&builder, "\n ... skipping %d errors ...", e.Skipped)
}
return builder.String()
}
// Unwrap implements https://godoc.org/golang.org/x/xerrors#Wrapper.
func (e *DialError) Unwrap() error {
// If we have a context error, that's the "ultimate" error.
if e.Cause != nil {
return e.Cause
}
return nil
}
var _ error = (*DialError)(nil)
// TransportError is the error returned when dialing a specific address.
type TransportError struct {
Address ma.Multiaddr
Cause error
}
func (e *TransportError) Error() string {
return fmt.Sprintf("failed to dial %s: %s", e.Address, e.Cause)
}
var _ error = (*TransportError)(nil)
...@@ -2,9 +2,7 @@ package swarm_test ...@@ -2,9 +2,7 @@ package swarm_test
import ( import (
"context" "context"
"fmt"
"net" "net"
"regexp"
"sync" "sync"
"testing" "testing"
"time" "time"
...@@ -491,8 +489,8 @@ func TestDialPeerFailed(t *testing.T) { ...@@ -491,8 +489,8 @@ func TestDialPeerFailed(t *testing.T) {
defer closeSwarms(swarms) defer closeSwarms(swarms)
testedSwarm, targetSwarm := swarms[0], swarms[1] testedSwarm, targetSwarm := swarms[0], swarms[1]
exceptedErrorsCount := 5 expectedErrorsCount := 5
for i := 0; i < exceptedErrorsCount; i++ { for i := 0; i < expectedErrorsCount; i++ {
_, silentPeerAddress, silentPeerListener := newSilentPeer(t) _, silentPeerAddress, silentPeerListener := newSilentPeer(t)
go acceptAndHang(silentPeerListener) go acceptAndHang(silentPeerListener)
defer silentPeerListener.Close() defer silentPeerListener.Close()
...@@ -508,23 +506,17 @@ func TestDialPeerFailed(t *testing.T) { ...@@ -508,23 +506,17 @@ func TestDialPeerFailed(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
// dial_test.go:508: correctly get a combined error: dial attempt failed: 10 errors occurred: // dial_test.go:508: correctly get a combined error: failed to dial PEER: all dials failed
// * <peer.ID Qm*Wpwtvc> --> <peer.ID Qm*cc2FQR> (/ip4/127.0.0.1/tcp/46485) dial attempt failed: failed to negotiate security protocol: context deadline exceeded // * [/ip4/127.0.0.1/tcp/46485] failed to negotiate security protocol: context deadline exceeded
// * <peer.ID Qm*Wpwtvc> --> <peer.ID Qm*cc2FQR> (/ip4/127.0.0.1/tcp/34881) dial attempt failed: failed to negotiate security protocol: context deadline exceeded // * [/ip4/127.0.0.1/tcp/34881] failed to negotiate security protocol: context deadline exceeded
// ... // ...
errorCountRegexpString := fmt.Sprintf("%d errors occurred", exceptedErrorsCount) dialErr, ok := err.(*DialError)
errorCountRegexp := regexp.MustCompile(errorCountRegexpString) if !ok {
if !errorCountRegexp.MatchString(err.Error()) { t.Fatalf("expected *DialError, got %T", err)
t.Fatalf("can't find total err count: `%s' in `%s'", errorCountRegexpString, err.Error())
} }
connectErrorsRegexpString := `\* <peer\.ID .+?> --> <peer\.ID .+?> \(.+?\) dial attempt failed:.+` if len(dialErr.DialErrors) != expectedErrorsCount {
connectErrorsRegexp := regexp.MustCompile(connectErrorsRegexpString) t.Errorf("expected %d errors, got %d", expectedErrorsCount, len(dialErr.DialErrors))
connectErrors := connectErrorsRegexp.FindAll([]byte(err.Error()), -1)
if len(connectErrors) != exceptedErrorsCount {
t.Fatalf("connectErrors must contain %d errros; "+
"but `%s' was found in `%s' %d times",
exceptedErrorsCount, connectErrorsRegexpString, err.Error(), len(connectErrors))
} }
} }
...@@ -31,10 +31,6 @@ github.com/gxed/hashland/keccakpg v0.0.1 h1:wrk3uMNaMxbXiHibbPO4S0ymqJMm41WiudyF ...@@ -31,10 +31,6 @@ github.com/gxed/hashland/keccakpg v0.0.1 h1:wrk3uMNaMxbXiHibbPO4S0ymqJMm41WiudyF
github.com/gxed/hashland/keccakpg v0.0.1/go.mod h1:kRzw3HkwxFU1mpmPP8v1WyQzwdGfmKFJ6tItnhQ67kU= github.com/gxed/hashland/keccakpg v0.0.1/go.mod h1:kRzw3HkwxFU1mpmPP8v1WyQzwdGfmKFJ6tItnhQ67kU=
github.com/gxed/hashland/murmur3 v0.0.1 h1:SheiaIt0sda5K+8FLz952/1iWS9zrnKsEJaOJu4ZbSc= github.com/gxed/hashland/murmur3 v0.0.1 h1:SheiaIt0sda5K+8FLz952/1iWS9zrnKsEJaOJu4ZbSc=
github.com/gxed/hashland/murmur3 v0.0.1/go.mod h1:KjXop02n4/ckmZSnY2+HKcLud/tcmvhST0bie/0lS48= github.com/gxed/hashland/murmur3 v0.0.1/go.mod h1:KjXop02n4/ckmZSnY2+HKcLud/tcmvhST0bie/0lS48=
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.0.0 h1:iVjPR7a6H0tWELX5NxNe7bYopibicUzc7uPribsnS6o=
github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk=
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
......
...@@ -7,8 +7,6 @@ import ( ...@@ -7,8 +7,6 @@ import (
"sync" "sync"
"time" "time"
"github.com/hashicorp/go-multierror"
logging "github.com/ipfs/go-log" logging "github.com/ipfs/go-log"
addrutil "github.com/libp2p/go-addr-util" addrutil "github.com/libp2p/go-addr-util"
lgbl "github.com/libp2p/go-libp2p-loggables" lgbl "github.com/libp2p/go-libp2p-loggables"
...@@ -34,15 +32,23 @@ var ( ...@@ -34,15 +32,23 @@ var (
// been dialed too frequently // been dialed too frequently
ErrDialBackoff = errors.New("dial backoff") ErrDialBackoff = errors.New("dial backoff")
// ErrDialFailed is returned when connecting to a peer has ultimately failed
ErrDialFailed = errors.New("dial attempt failed")
// ErrDialToSelf is returned if we attempt to dial our own peer // ErrDialToSelf is returned if we attempt to dial our own peer
ErrDialToSelf = errors.New("dial to self attempted") ErrDialToSelf = errors.New("dial to self attempted")
// ErrNoTransport is returned when we don't know a transport for the // ErrNoTransport is returned when we don't know a transport for the
// given multiaddr. // given multiaddr.
ErrNoTransport = errors.New("no transport for protocol") ErrNoTransport = errors.New("no transport for protocol")
// ErrAllDialsFailed is returned when connecting to a peer has ultimately failed
ErrAllDialsFailed = errors.New("all dials failed")
// ErrNoAddresses is returned when we fail to find any addresses for a
// peer we're trying to dial.
ErrNoAddresses = errors.New("no addresses")
// ErrNoAddresses is returned when we find addresses for a peer but
// can't use any of them.
ErrNoGoodAddresses = errors.New("no good addresses")
) )
// DialAttempts governs how many times a goroutine will try to dial a given peer. // DialAttempts governs how many times a goroutine will try to dial a given peer.
...@@ -256,7 +262,7 @@ func (s *Swarm) doDial(ctx context.Context, p peer.ID) (*Conn, error) { ...@@ -256,7 +262,7 @@ func (s *Swarm) doDial(ctx context.Context, p peer.ID) (*Conn, error) {
} }
// ok, we failed. // ok, we failed.
return nil, fmt.Errorf("dial attempt failed: %s", err) return nil, err
} }
return conn, nil return conn, nil
} }
...@@ -293,11 +299,11 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) { ...@@ -293,11 +299,11 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
*/ */
peerAddrs := s.peers.Addrs(p) peerAddrs := s.peers.Addrs(p)
if len(peerAddrs) == 0 { if len(peerAddrs) == 0 {
return nil, errors.New("no addresses") return nil, &DialError{Peer: p, Cause: ErrNoAddresses}
} }
goodAddrs := s.filterKnownUndialables(peerAddrs) goodAddrs := s.filterKnownUndialables(peerAddrs)
if len(goodAddrs) == 0 { if len(goodAddrs) == 0 {
return nil, errors.New("no good addresses") return nil, &DialError{Peer: p, Cause: ErrNoGoodAddresses}
} }
goodAddrsChan := make(chan ma.Multiaddr, len(goodAddrs)) goodAddrsChan := make(chan ma.Multiaddr, len(goodAddrs))
for _, a := range goodAddrs { for _, a := range goodAddrs {
...@@ -307,10 +313,18 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) { ...@@ -307,10 +313,18 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
///////// /////////
// try to get a connection to any addr // try to get a connection to any addr
connC, err := s.dialAddrs(ctx, p, goodAddrsChan) connC, dialErr := s.dialAddrs(ctx, p, goodAddrsChan)
if err != nil { if dialErr != nil {
logdial["error"] = err.Error() logdial["error"] = dialErr.Cause.Error()
return nil, err if dialErr.Cause == context.Canceled {
// always prefer the "context canceled" error.
// we rely on behing able to check `err == context.Canceled`
//
// Removing this will BREAK backoff (causing us to
// backoff when canceling dials).
return nil, context.Canceled
}
return nil, dialErr
} }
logdial["conn"] = logging.Metadata{ logdial["conn"] = logging.Metadata{
"localAddr": connC.LocalMultiaddr(), "localAddr": connC.LocalMultiaddr(),
...@@ -320,7 +334,7 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) { ...@@ -320,7 +334,7 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
if err != nil { if err != nil {
logdial["error"] = err.Error() logdial["error"] = err.Error()
connC.Close() // close the connection. didn't work out :( connC.Close() // close the connection. didn't work out :(
return nil, err return nil, &DialError{Peer: p, Cause: err}
} }
logdial["dial"] = "success" logdial["dial"] = "success"
...@@ -352,7 +366,7 @@ func (s *Swarm) filterKnownUndialables(addrs []ma.Multiaddr) []ma.Multiaddr { ...@@ -352,7 +366,7 @@ func (s *Swarm) filterKnownUndialables(addrs []ma.Multiaddr) []ma.Multiaddr {
) )
} }
func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.Multiaddr) (transport.Conn, error) { func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.Multiaddr) (transport.Conn, *DialError) {
log.Debugf("%s swarm dialing %s", s.local, p) log.Debugf("%s swarm dialing %s", s.local, p)
ctx, cancel := context.WithCancel(ctx) ctx, cancel := context.WithCancel(ctx)
...@@ -360,26 +374,23 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma. ...@@ -360,26 +374,23 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.
// use a single response type instead of errs and conns, reduces complexity *a ton* // use a single response type instead of errs and conns, reduces complexity *a ton*
respch := make(chan dialResult) respch := make(chan dialResult)
var dialErrors *multierror.Error err := new(DialError)
defer s.limiter.clearAllPeerDials(p) defer s.limiter.clearAllPeerDials(p)
var active int var active int
dialLoop:
for remoteAddrs != nil || active > 0 { for remoteAddrs != nil || active > 0 {
// Check for context cancellations and/or responses first. // Check for context cancellations and/or responses first.
select { select {
case <-ctx.Done(): case <-ctx.Done():
if dialError := dialErrors.ErrorOrNil(); dialError != nil { break dialLoop
return nil, dialError
}
return nil, ctx.Err()
case resp := <-respch: case resp := <-respch:
active-- active--
if resp.Err != nil { if resp.Err != nil {
// Errors are normal, lots of dials will fail // Errors are normal, lots of dials will fail
log.Infof("got error on dial: %s", resp.Err) log.Infof("got error on dial: %s", resp.Err)
dialErrors = multierror.Append(dialErrors, resp.Err) err.recordErr(resp.Addr, resp.Err)
} else if resp.Conn != nil { } else if resp.Conn != nil {
return resp.Conn, nil return resp.Conn, nil
} }
...@@ -400,28 +411,27 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma. ...@@ -400,28 +411,27 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.
s.limitedDial(ctx, p, addr, respch) s.limitedDial(ctx, p, addr, respch)
active++ active++
case <-ctx.Done(): case <-ctx.Done():
if dialError := dialErrors.ErrorOrNil(); dialError != nil { break dialLoop
return nil, dialError
}
return nil, ctx.Err()
case resp := <-respch: case resp := <-respch:
active-- active--
if resp.Err != nil { if resp.Err != nil {
// Errors are normal, lots of dials will fail // Errors are normal, lots of dials will fail
log.Infof("got error on dial: %s", resp.Err) log.Infof("got error on dial: %s", resp.Err)
dialErrors = multierror.Append(dialErrors, resp.Err) err.recordErr(resp.Addr, resp.Err)
} else if resp.Conn != nil { } else if resp.Conn != nil {
return resp.Conn, nil return resp.Conn, nil
} }
} }
} }
if dialError := dialErrors.ErrorOrNil(); dialError != nil { if ctxErr := ctx.Err(); ctxErr != nil {
return nil, dialError err.Cause = ctxErr
} else if len(err.DialErrors) == 0 {
err.Cause = inet.ErrNoRemoteAddrs
} else {
err.Cause = ErrAllDialsFailed
} }
return nil, err
return nil, inet.ErrNoRemoteAddrs
} }
// limitedDial will start a dial to the given peer when // limitedDial will start a dial to the given peer when
...@@ -450,7 +460,7 @@ func (s *Swarm) dialAddr(ctx context.Context, p peer.ID, addr ma.Multiaddr) (tra ...@@ -450,7 +460,7 @@ func (s *Swarm) dialAddr(ctx context.Context, p peer.ID, addr ma.Multiaddr) (tra
connC, err := tpt.Dial(ctx, addr, p) connC, err := tpt.Dial(ctx, addr, p)
if err != nil { if err != nil {
return nil, fmt.Errorf("%s --> %s (%s) dial attempt failed: %s", s.local, p, addr, err) return nil, err
} }
// Trust the transport? Yeah... right. // Trust the transport? Yeah... right.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment