Merge pull request #47 from ipld/path-clarifications

Path clarifications

Merge pull request #47 from ipld/path-clarifications
Path clarifications
4612357c · Eric Myhre · GitHub · 530ccd68 · 8093be74 · 4612357c
Unverified Commit 4612357c authored Feb 27, 2020 by Eric Myhre Committed by GitHub Feb 27, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 151 additions and 34 deletions

path.go path.go +98 -14

pathSegment.go pathSegment.go +36 -13

path_test.go path_test.go +17 -7

No files found.
--- a/path.go
+++ b/path.go
@@ -4,37 +4,109 @@ import (
 	"strings"
 )

-// Path is used in describing progress in a traversal;
-// and can also be used as an instruction for a specific traverse.
+// Path describes a series of steps across a tree or DAG of Node,
+// where each segment in the path is a map key or list index
+// (literally, Path is a slice of PathSegment values).
+// Path is used in describing progress in a traversal; and
+// can also be used as an instruction for traversing from one Node to another.
+// Path values will also often be encountered as part of error messages.
 //
+// (Note that Paths are useful as an instruction for traversing from
+// *one* Node to *one* other Node; to do a walk from one Node and visit
+// *several* Nodes based on some sort of pattern, look to IPLD Selectors,
+// and the 'traversal/selector' package in this project.)
+//
+// Path values are always relative.
+// Observe how 'traversal.Focus' requires both a Node and a Path argument --
+// where to start, and where to go, respectively.
+// Similarly, error values which include a Path will be speaking in reference
+// to the "starting Node" in whatever context they arose from.
+//
+// The canonical form of a Path is as a list of PathSegment.
+// Each PathSegment is a string; by convention, the string should be
+// in UTF-8 encoding and use NFC normalization, but all operations
+// will regard the string as its constituent eight-bit bytes.
+//
+// There are no illegal or magical characters in IPLD Paths
+// (in particular, do not mistake them for UNIX system paths).
 // IPLD Paths can only go down: that is, each segment must traverse one node.
 // There is no ".." which means "go up";
-// and there is no "." which means "stay here";
-// and it is not valid to have an empty path segment.
+// and there is no "." which means "stay here".
+// IPLD Paths have no magic behavior around characters such as "~".
+// IPLD Paths do not have a concept of "globs" nor behave specially
+// for a path segment string of "*" (but you may wish to see 'Selectors'
+// for globbing-like features that traverse over IPLD data).
+//
+// An empty string is a valid PathSegment.
+// (This leads to some unfortunate complications when wishing to represent
+// paths in a simple string format; however, consider that maps do exist
+// in serialized data in the wild where an empty string is used as the key:
+// it is important we be able to correctly describe and address this!)
 //
-// (Note: path strings as interpreted by UnixFS may certainly have concepts
-// of ".." and "."!  But UnixFS is built upon IPLD; IPLD has no idea of this.)
+// A string containing "/" (or even being simply "/"!) is a valid PathSegment.
+// (As with empty strings, this is unfortunate (in particular, because it
+// very much doesn't match up well with expectations popularized by UNIX-like
+// filesystems); but, as with empty strings, maps which contain such a key
+// certainly exist, and it is important that we be able to regard them!)
 //
-// Paths are representable as strings.  When represented as a string, each
-// segment is separated by a "/" character.
-// (It follows that path segments may not themselves contain a "/" character.)
-// (Note: escaping may be specified and supported in the future; currently, it is not.)
+// A string starting, ending, or otherwise containing the NUL (\x00) byte
+// is also a valid PathSegment.  This follows from the rule of "a string is
+// regarded as its constituent eight-bit bytes": an all-zero byte is not exceptional.
+// In golang, this doesn't pose particular difficulty, but note this would be
+// of marked concern for languages which have "C-style nul-terminated strings".
 //
+// For an IPLD Path to be represented as a string, an encoding system
+// including escaping is necessary.  At present, there is not a single
+// canonical specification for such an escaping; we expect to decide one
+// in the future, but this is not yet settled and done.
+// (This implementation has a 'String' method, but it contains caveats
+// and may be ambiguous for some content.  This may be fixed in the future.)
 type Path struct {
 	segments []PathSegment
 }

-// ParsePath converts a string to an IPLD Path, parsing the string into a segmented Path.
+// NewPath returns a Path composed of the given segments.
 //
-// Each segment of the path string should be separated by a "/" character.
+// This constructor function does a defensive copy,
+// in case your segments slice should mutate in the future.
+// (Use NewPathNocopy if this is a performance concern,
+// and you're sure you know what you're doing.)
+func NewPath(segments []PathSegment) Path {
+	p := Path{make([]PathSegment, len(segments))}
+	copy(p.segments, segments)
+	return p
+}
+
+// NewPathNocopy is identical to NewPath but trusts that
+// the segments slice you provide will not be mutated.
+func NewPathNocopy(segments []PathSegment) Path {
+	return Path{segments}
+}
+
+// ParsePath converts a string to an IPLD Path, doing a basic parsing of the
+// string using "/" as a delimiter to produce a segmented Path.
+// This is a handy, but not a general-purpose nor spec-compliant (!),
+// way to create a Path: it cannot represent all valid paths.
 //
 // Multiple subsequent "/" characters will be silently collapsed.
 // E.g., `"foo///bar"` will be treated equivalently to `"foo/bar"`.
 // Prefixed and suffixed extraneous "/" characters are also discarded.
+// This makes this constructor incapable of handling some possible Path values
+// (specifically: paths with empty segments cannot be created with this constructor).
+//
+// There is no escaping mechanism used by this function.
+// This makes this constructor incapable of handling some possible Path values
+// (specifically, a path segment containing "/" cannot be created, because it
+// will always be interpreted as a segment separator).
 //
-// No "cleaning" of the path occurs.  See the documentation of the Path struct;
+// No other "cleaning" of the path occurs.  See the documentation of the Path struct;
 // in particular, note that ".." does not mean "go up", nor does "." mean "stay here" --
-// correspondingly, there isn't anything to "clean".
+// correspondingly, there isn't anything to "clean" in the same sense as
+// 'filepath.Clean' from the standard library filesystem path packages would.
+//
+// If the provided string contains unprintable characters, or non-UTF-8
+// or non-NFC-canonicalized bytes, no remark will be made about this,
+// and those bytes will remain part of the PathSegments in the resulting Path.
 func ParsePath(pth string) Path {
 	// FUTURE: we should probably have some escaping mechanism which makes
 	//  it possible to encode a slash in a segment.  Specification needed.
@@ -49,6 +121,18 @@ func ParsePath(pth string) Path {

 // String representation of a Path is simply the join of each segment with '/'.
 // It does not include a leading nor trailing slash.
+//
+// This is a handy, but not a general-purpose nor spec-compliant (!),
+// way to reduce a Path to a string.
+// There is no escaping mechanism used by this function,
+// and as a result, not all possible valid Path values (such as those with
+// empty segments or with segments containing "/") can be encoded unambiguously.
+// For Path values containing these problematic segments, ParsePath applied
+// to the string returned from this function may return a nonequal Path value.
+//
+// No escaping for unprintable characters is provided.
+// No guarantee that the resulting string is UTF-8 nor NFC canonicalized
+// is provided unless all the constituent PathSegment had those properties.
 func (p Path) String() string {
 	l := len(p.segments)
 	if l == 0 {

--- a/pathSegment.go
+++ b/pathSegment.go
@@ -6,6 +6,12 @@ import (

 // PathSegment can describe either a key in a map, or an index in a list.
 //
+// Create a PathSegment via either ParsePathSegment, PathSegmentOfString,
+// or PathSegmentOfInt; or, via one of the constructors of Path,
+// which will implicitly create PathSegment internally.
+// Using PathSegment's natural zero value directly is discouraged
+// (it will act like ParsePathSegment("0"), which is likely not what you'd expect).
+//
 // Path segments are "stringly typed" -- they may be interpreted as either strings or ints depending on context.
 // A path segment of "123" will be used as a string when traversing a node of map kind;
 // and it will be converted to an integer when traversing a node of list kind.
@@ -15,10 +21,21 @@ import (
 // Internally, PathSegment will store either a string or an integer,
 // depending on how it was constructed,
 // and will automatically convert to the other on request.
-// (This means if two pieces of code communicate using PathSegment, one producing ints and the other expecting ints, they will work together efficiently.)
+// (This means if two pieces of code communicate using PathSegment,
+// one producing ints and the other expecting ints,
+// then they will work together efficiently.)
 // PathSegment in a Path produced by ParsePath generally have all strings internally,
-// because there is distinction possible when parsing a Path string
-// (and attempting to pre-parse all strings into ints "in case" would waste time in almost all cases).
+// because there is no distinction possible when parsing a Path string
+// (and attempting to pre-parse all strings into ints "just in case" would waste time in almost all cases).
+//
+// Be cautious of attempting to use PathSegment as a map key!
+// Due to the implementation detail of internal storage, it's possible for
+// PathSegment values which are "equal" per the definition of PathSegment.Equals
+// to still be unequal in the eyes of golang's native maps.
+// You should probably use the string values of the PathSegment as map keys.
+// (This has the additional bonus of hitting a special fastpath that the golang
+// built-in maps have specifically for plain string keys.)
+//
 type PathSegment struct {
 	/*
 		A quick implementation note about the Go compiler and "union" semantics:
@@ -43,9 +60,10 @@ type PathSegment struct {
 		we're using the first tactic.

 		(We also currently get away with having no extra discriminator bit
-		because empty string is not considered a valid segment,
+		because we use a signed int for indexes, and negative values aren't valid there,
 		and thus we can use it as a sentinel value.
-		This may change if the IPLD Path spec comes to other conclusions about this.)
+		(Fun note: Empty strings were originally used for this sentinel,
+		but it turns out empty strings are valid PathSegment themselves, so!))
 	*/

 	s string
@@ -57,16 +75,16 @@ type PathSegment struct {
 // (Note: there is currently no escaping specified for PathSegments,
 // so this is currently functionally equivalent to PathSegmentOfString.)
 func ParsePathSegment(s string) PathSegment {
-	return PathSegment{s: s}
+	return PathSegment{s: s, i: -1}
 }

-// PathSegmentOfString boxes a string into a PathSegement.
+// PathSegmentOfString boxes a string into a PathSegment.
 // It does not attempt to parse any escaping; use ParsePathSegment for that.
 func PathSegmentOfString(s string) PathSegment {
-	return PathSegment{s: s}
+	return PathSegment{s: s, i: -1}
 }

-// PathSegmentOfString boxes an int into a PathSegement.
+// PathSegmentOfInt boxes an int into a PathSegment.
 func PathSegmentOfInt(i int) PathSegment {
 	return PathSegment{i: i}
 }
@@ -75,7 +93,7 @@ func PathSegmentOfInt(i int) PathSegment {
 // but this is considered an implementation detail that's non-semantic.
 // If it returns false, it implicitly means "containsInt", as these are the only options.
 func (ps PathSegment) containsString() bool {
-	return ps.s != ""
+	return ps.i < 0
 }

 // String returns the PathSegment as a string.
@@ -102,9 +120,14 @@ func (ps PathSegment) Index() (int, error) {
 }

 // Equals checks if two PathSegment values are equal.
-// This is equivalent to checking if their strings are equal --
-// if one of the PathSegment values is backed by an int and the other is a string,
-// they may still be "equal".
+//
+// Because PathSegment is "stringly typed", this comparison does not
+// regard if one of the segments is stored as a string and one is stored as an int;
+// if string values of two segments are equal, they are "equal" overall.
+// In other words, `PathSegmentOfInt(2).Equals(PathSegmentOfString("2")) == true`!
+// (You should still typically prefer this method over converting two segments
+// to string and comparing those, because even though that may be functionally
+// correct, this method will be faster if they're both ints internally.)
 func (x PathSegment) Equals(o PathSegment) bool {
 	if !x.containsString() && !o.containsString() {
 		return x.i == o.i

--- a/path_test.go
+++ b/path_test.go
@@ -8,18 +8,28 @@ import (

 func TestParsePath(t *testing.T) {
 	t.Run("parsing one segment", func(t *testing.T) {
-		Wish(t, ParsePath("0").segments, ShouldEqual, []PathSegment{{s: "0"}})
+		Wish(t, ParsePath("0").segments, ShouldEqual, []PathSegment{{s: "0", i: -1}})
 	})
 	t.Run("parsing three segments", func(t *testing.T) {
-		Wish(t, ParsePath("0/foo/2").segments, ShouldEqual, []PathSegment{{s: "0"}, {s: "foo"}, {s: "2"}})
-	})
-	t.Run("eliding empty segments", func(t *testing.T) {
-		Wish(t, ParsePath("0//2").segments, ShouldEqual, []PathSegment{{s: "0"}, {s: "2"}})
+		Wish(t, ParsePath("0/foo/2").segments, ShouldEqual, []PathSegment{{s: "0", i: -1}, {s: "foo", i: -1}, {s: "2", i: -1}})
 	})
 	t.Run("eliding leading slashes", func(t *testing.T) {
-		Wish(t, ParsePath("/0/2").segments, ShouldEqual, []PathSegment{{s: "0"}, {s: "2"}})
+		Wish(t, ParsePath("/0/2").segments, ShouldEqual, []PathSegment{{s: "0", i: -1}, {s: "2", i: -1}})
 	})
 	t.Run("eliding trailing", func(t *testing.T) {
-		Wish(t, ParsePath("0/2/").segments, ShouldEqual, []PathSegment{{s: "0"}, {s: "2"}})
+		Wish(t, ParsePath("0/2/").segments, ShouldEqual, []PathSegment{{s: "0", i: -1}, {s: "2", i: -1}})
+	})
+	t.Run("eliding empty segments", func(t *testing.T) { // NOTE: a spec for string encoding might cause this to change in the future!
+		Wish(t, ParsePath("0//2").segments, ShouldEqual, []PathSegment{{s: "0", i: -1}, {s: "2", i: -1}})
+	})
+	t.Run("escaping segments", func(t *testing.T) { // NOTE: a spec for string encoding might cause this to change in the future!
+		Wish(t, ParsePath(`0/\//2`).segments, ShouldEqual, []PathSegment{{s: "0", i: -1}, {s: `\`, i: -1}, {s: "2", i: -1}})
 	})
 }
+
+func TestPathSegmentZeroValue(t *testing.T) {
+	Wish(t, PathSegment{}.String(), ShouldEqual, "0")
+	i, err := PathSegment{}.Index()
+	Wish(t, err, ShouldEqual, nil)
+	Wish(t, i, ShouldEqual, 0)
+}