Unverified Commit 4440884d authored by Steven Allen's avatar Steven Allen Committed by GitHub

Merge pull request #145 from libp2p/feat/improve-dial-error

improve dial errors
parents 5efa2fa2 83dd28eb
......@@ -2,6 +2,7 @@ package swarm
import (
"fmt"
"os"
"strings"
"github.com/libp2p/go-libp2p-core/peer"
......@@ -20,6 +21,10 @@ type DialError struct {
Skipped int
}
func (e *DialError) Timeout() bool {
return os.IsTimeout(e.Cause)
}
func (e *DialError) recordErr(addr ma.Multiaddr, err error) {
if len(e.DialErrors) >= maxDialDialErrors {
e.Skipped++
......@@ -48,11 +53,7 @@ func (e *DialError) Error() string {
// Unwrap implements https://godoc.org/golang.org/x/xerrors#Wrapper.
func (e *DialError) Unwrap() error {
// If we have a context error, that's the "ultimate" error.
if e.Cause != nil {
return e.Cause
}
return nil
return e.Cause
}
var _ error = (*DialError)(nil)
......
......@@ -2,11 +2,15 @@ package swarm
import (
"context"
"errors"
"sync"
"github.com/libp2p/go-libp2p-core/peer"
)
// TODO: change this text when we fix the bug
var errDialCanceled = errors.New("dial was aborted internally, likely due to https://git.io/Je2wW")
// DialFunc is the type of function expected by DialSync.
type DialFunc func(context.Context, peer.ID) (*Conn, error)
......@@ -78,6 +82,16 @@ func (ad *activeDial) decref() {
func (ad *activeDial) start(ctx context.Context) {
ad.conn, ad.err = ad.ds.dialFunc(ctx, ad.id)
// This isn't the user's context so we should fix the error.
switch ad.err {
case context.Canceled:
// The dial was canceled with `CancelDial`.
ad.err = errDialCanceled
case context.DeadlineExceeded:
// We hit an internal timeout, not a context timeout.
ad.err = ErrDialTimeout
}
close(ad.waitch)
ad.cancel()
}
......
......@@ -40,6 +40,9 @@ var ErrSwarmClosed = errors.New("swarm closed")
// transport is misbehaving.
var ErrAddrFiltered = errors.New("address filtered")
// ErrDialTimeout is returned when one a dial times out due to the global timeout
var ErrDialTimeout = errors.New("dial timed out")
// Swarm is a connection muxer, allowing connections to other peers to
// be opened and closed, while still using the same Chan for all
// communication. The Chan sends/receives Messages, which note the
......
......@@ -221,12 +221,23 @@ func (s *Swarm) dialPeer(ctx context.Context, p peer.ID) (*Conn, error) {
defer cancel()
conn, err = s.dsync.DialLock(ctx, p)
if err != nil {
return nil, err
if err == nil {
return conn, nil
}
log.Debugf("network for %s finished dialing %s", s.local, p)
return conn, err
if ctx.Err() != nil {
// Context error trumps any dial errors as it was likely the ultimate cause.
return nil, ctx.Err()
}
if s.ctx.Err() != nil {
// Ok, so the swarm is shutting down.
return nil, ErrSwarmClosed
}
return nil, err
}
// doDial is an ugly shim method to retain all the logging and backoff logic
......@@ -317,13 +328,14 @@ func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
connC, dialErr := s.dialAddrs(ctx, p, goodAddrsChan)
if dialErr != nil {
logdial["error"] = dialErr.Cause.Error()
if dialErr.Cause == context.Canceled {
// always prefer the "context canceled" error.
// we rely on behing able to check `err == context.Canceled`
switch dialErr.Cause {
case context.Canceled, context.DeadlineExceeded:
// Always prefer the context errors as we rely on being
// able to check them.
//
// Removing this will BREAK backoff (causing us to
// backoff when canceling dials).
return nil, context.Canceled
return nil, dialErr.Cause
}
return nil, dialErr
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment