Commit 755d0634 authored by Steven Allen's avatar Steven Allen

dial: return a nice custom dial error

* Limits the number of dial errors we keep
* Exposes the dial errors
* Avoids string formatting unless we actually need to.

Helps address https://github.com/libp2p/go-libp2p-swarm/issues/119
parent 94a49f12
package swarm
import (
"fmt"
"strings"
peer "github.com/libp2p/go-libp2p-peer"
ma "github.com/multiformats/go-multiaddr"
)
// maxDialDialErrors is the maximum number of dial errors we record
const maxDialDialErrors = 16
// DialError is the error type returned when dialing.
type DialError struct {
Peer peer.ID
DialErrors []TransportError
Cause error
Skipped int
}
func (e *DialError) recordErr(addr ma.Multiaddr, err error) {
if len(e.DialErrors) >= maxDialDialErrors {
e.Skipped++
return
}
e.DialErrors = append(e.DialErrors, TransportError{
Address: addr,
Cause: err,
})
}
func (e *DialError) Error() string {
var builder strings.Builder
fmt.Fprintf(&builder, "failed to dial %s:", e.Peer)
if e.Cause != nil {
fmt.Fprintf(&builder, " %s", e.Cause)
}
for _, te := range e.DialErrors {
fmt.Fprintf(&builder, "\n * [%s] %s", te.Address, te.Cause)
}
if e.Skipped > 0 {
fmt.Fprintf(&builder, "\n ... skipping %d errors ...", e.Skipped)
}
return builder.String()
}
// Unwrap implements https://godoc.org/golang.org/x/xerrors#Wrapper.
func (e *DialError) Unwrap() error {
// If we have a context error, that's the "ultimate" error.
if e.Cause != nil {
return e.Cause
}
return nil
}
var _ error = (*DialError)(nil)
// TransportError is the error returned when dialing a specific address.
type TransportError struct {
Address ma.Multiaddr
Cause error
}
func (e *TransportError) Error() string {
return fmt.Sprintf("failed to dial %s: %s", e.Address, e.Cause)
}
var _ error = (*TransportError)(nil)
......@@ -2,9 +2,7 @@ package swarm_test
import (
"context"
"fmt"
"net"
"regexp"
"sync"
"testing"
"time"
......@@ -491,8 +489,8 @@ func TestDialPeerFailed(t *testing.T) {
defer closeSwarms(swarms)
testedSwarm, targetSwarm := swarms[0], swarms[1]
exceptedErrorsCount := 5
for i := 0; i < exceptedErrorsCount; i++ {
expectedErrorsCount := 5
for i := 0; i < expectedErrorsCount; i++ {
_, silentPeerAddress, silentPeerListener := newSilentPeer(t)
go acceptAndHang(silentPeerListener)
defer silentPeerListener.Close()
......@@ -508,23 +506,17 @@ func TestDialPeerFailed(t *testing.T) {
t.Fatal(err)
}
// dial_test.go:508: correctly get a combined error: dial attempt failed: 10 errors occurred:
// * <peer.ID Qm*Wpwtvc> --> <peer.ID Qm*cc2FQR> (/ip4/127.0.0.1/tcp/46485) dial attempt failed: failed to negotiate security protocol: context deadline exceeded
// * <peer.ID Qm*Wpwtvc> --> <peer.ID Qm*cc2FQR> (/ip4/127.0.0.1/tcp/34881) dial attempt failed: failed to negotiate security protocol: context deadline exceeded
// dial_test.go:508: correctly get a combined error: failed to dial PEER: all dials failed
// * [/ip4/127.0.0.1/tcp/46485] failed to negotiate security protocol: context deadline exceeded
// * [/ip4/127.0.0.1/tcp/34881] failed to negotiate security protocol: context deadline exceeded
// ...
errorCountRegexpString := fmt.Sprintf("%d errors occurred", exceptedErrorsCount)
errorCountRegexp := regexp.MustCompile(errorCountRegexpString)
if !errorCountRegexp.MatchString(err.Error()) {
t.Fatalf("can't find total err count: `%s' in `%s'", errorCountRegexpString, err.Error())
dialErr, ok := err.(*DialError)
if !ok {
t.Fatalf("expected *DialError, got %T", err)
}
connectErrorsRegexpString := `\* <peer\.ID .+?> --> <peer\.ID .+?> \(.+?\) dial attempt failed:.+`
connectErrorsRegexp := regexp.MustCompile(connectErrorsRegexpString)
connectErrors := connectErrorsRegexp.FindAll([]byte(err.Error()), -1)
if len(connectErrors) != exceptedErrorsCount {
t.Fatalf("connectErrors must contain %d errros; "+
"but `%s' was found in `%s' %d times",
exceptedErrorsCount, connectErrorsRegexpString, err.Error(), len(connectErrors))
if len(dialErr.DialErrors) != expectedErrorsCount {
t.Errorf("expected %d errors, got %d", expectedErrorsCount, len(dialErr.DialErrors))
}
}
......@@ -7,8 +7,6 @@ import (
"sync"
"time"
"github.com/hashicorp/go-multierror"
logging "github.com/ipfs/go-log"
addrutil "github.com/libp2p/go-addr-util"
lgbl "github.com/libp2p/go-libp2p-loggables"
......@@ -34,15 +32,23 @@ var (
// been dialed too frequently
ErrDialBackoff = errors.New("dial backoff")
// ErrDialFailed is returned when connecting to a peer has ultimately failed
ErrDialFailed = errors.New("dial attempt failed")
// ErrDialToSelf is returned if we attempt to dial our own peer
ErrDialToSelf = errors.New("dial to self attempted")
// ErrNoTransport is returned when we don't know a transport for the
// given multiaddr.
ErrNoTransport = errors.New("no transport for protocol")
// ErrAllDialsFailed is returned when connecting to a peer has ultimately failed
ErrAllDialsFailed = errors.New("all dials failed")
// ErrNoAddresses is returned when we fail to find any addresses for a
// peer we're trying to dial.
ErrNoAddresses = errors.New("no addresses")
// ErrNoAddresses is returned when we find addresses for a peer but
// can't use any of them.
ErrNoGoodAddresses = errors.New("no good addresses")
)
// DialAttempts governs how many times a goroutine will try to dial a given peer.
......@@ -256,7 +262,7 @@ func (s *Swarm) doDial(ctx context.Context, p peer.ID) (*Conn, error) {
}
// ok, we failed.
return nil, fmt.Errorf("dial attempt failed: %s", err)
return nil, err
}
return conn, nil
}
......@@ -293,11 +299,11 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
*/
peerAddrs := s.peers.Addrs(p)
if len(peerAddrs) == 0 {
return nil, errors.New("no addresses")
return nil, &DialError{Peer: p, Cause: ErrNoAddresses}
}
goodAddrs := s.filterKnownUndialables(peerAddrs)
if len(goodAddrs) == 0 {
return nil, errors.New("no good addresses")
return nil, &DialError{Peer: p, Cause: ErrNoGoodAddresses}
}
goodAddrsChan := make(chan ma.Multiaddr, len(goodAddrs))
for _, a := range goodAddrs {
......@@ -307,10 +313,18 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
/////////
// try to get a connection to any addr
connC, err := s.dialAddrs(ctx, p, goodAddrsChan)
if err != nil {
logdial["error"] = err.Error()
return nil, err
connC, dialErr := s.dialAddrs(ctx, p, goodAddrsChan)
if dialErr != nil {
logdial["error"] = dialErr.Cause.Error()
if dialErr.Cause == context.Canceled {
// always prefer the "context canceled" error.
// we rely on behing able to check `err == context.Canceled`
//
// Removing this will BREAK backoff (causing us to
// backoff when canceling dials).
return nil, context.Canceled
}
return nil, dialErr
}
logdial["conn"] = logging.Metadata{
"localAddr": connC.LocalMultiaddr(),
......@@ -320,7 +334,7 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
if err != nil {
logdial["error"] = err.Error()
connC.Close() // close the connection. didn't work out :(
return nil, err
return nil, &DialError{Peer: p, Cause: err}
}
logdial["dial"] = "success"
......@@ -352,7 +366,7 @@ func (s *Swarm) filterKnownUndialables(addrs []ma.Multiaddr) []ma.Multiaddr {
)
}
func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.Multiaddr) (transport.Conn, error) {
func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.Multiaddr) (transport.Conn, *DialError) {
log.Debugf("%s swarm dialing %s", s.local, p)
ctx, cancel := context.WithCancel(ctx)
......@@ -360,26 +374,23 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.
// use a single response type instead of errs and conns, reduces complexity *a ton*
respch := make(chan dialResult)
var dialErrors *multierror.Error
err := new(DialError)
defer s.limiter.clearAllPeerDials(p)
var active int
dialLoop:
for remoteAddrs != nil || active > 0 {
// Check for context cancellations and/or responses first.
select {
case <-ctx.Done():
if dialError := dialErrors.ErrorOrNil(); dialError != nil {
return nil, dialError
}
return nil, ctx.Err()
break dialLoop
case resp := <-respch:
active--
if resp.Err != nil {
// Errors are normal, lots of dials will fail
log.Infof("got error on dial: %s", resp.Err)
dialErrors = multierror.Append(dialErrors, resp.Err)
err.recordErr(resp.Addr, resp.Err)
} else if resp.Conn != nil {
return resp.Conn, nil
}
......@@ -400,28 +411,27 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.
s.limitedDial(ctx, p, addr, respch)
active++
case <-ctx.Done():
if dialError := dialErrors.ErrorOrNil(); dialError != nil {
return nil, dialError
}
return nil, ctx.Err()
break dialLoop
case resp := <-respch:
active--
if resp.Err != nil {
// Errors are normal, lots of dials will fail
log.Infof("got error on dial: %s", resp.Err)
dialErrors = multierror.Append(dialErrors, resp.Err)
err.recordErr(resp.Addr, resp.Err)
} else if resp.Conn != nil {
return resp.Conn, nil
}
}
}
if dialError := dialErrors.ErrorOrNil(); dialError != nil {
return nil, dialError
if ctxErr := ctx.Err(); ctxErr != nil {
err.Cause = ctxErr
} else if len(err.DialErrors) == 0 {
err.Cause = inet.ErrNoRemoteAddrs
} else {
err.Cause = ErrAllDialsFailed
}
return nil, inet.ErrNoRemoteAddrs
return nil, err
}
// limitedDial will start a dial to the given peer when
......@@ -450,7 +460,7 @@ func (s *Swarm) dialAddr(ctx context.Context, p peer.ID, addr ma.Multiaddr) (tra
connC, err := tpt.Dial(ctx, addr, p)
if err != nil {
return nil, fmt.Errorf("%s --> %s (%s) dial attempt failed: %s", s.local, p, addr, err)
return nil, err
}
// Trust the transport? Yeah... right.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment