Unverified Commit 624fae00 authored by Eric Myhre, committed by GitHub

Merge pull request #101 from ipld/codectools-tokenizers

Fresh take on codec APIs, and some tokenization utilities.
parents 35ad3e37 1110155d
package codec
import (
"io"
"github.com/ipld/go-ipld-prime"
)
// Encoder is the essential definition of a function that takes IPLD Data Model data in memory and serializes it.
// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Decoder).
//
// Encoder functions can be composed into an ipld.LinkSystem to provide
// a "one stop shop" API for handling content addressable storage.
// Encoder functions can also be used directly if you want to handle serial data streams.
//
// Most codec packages will have a ReusableEncoder type
// (which contains any working memory needed by the encoder implementation,
// as well as any configuration options),
// and that type will have an Encode function matching this interface.
//
// By convention, codec packages that have a multicodec contract will also have
// a package-scope exported function called Encode which also matches this interface,
// and is the equivalent of creating a zero-value ReusableEncoder (aka, default config)
// and using its Encode method.
// This package-scope function will typically also internally use a sync.Pool
// to keep some ReusableEncoder values on hand to avoid unnecessary allocations.
//
// Note that a ReusableEncoder type that supports configuration options
// does not functionally expose those options when invoked by the multicodec system --
// multicodec indicators do not provide room for extended configuration info.
// Codecs that expose configuration options do so for the benefit of library users;
// it does not mean those non-default configurations will necessarily be available
// in all scenarios that use codecs indirectly.
// There is also no standard interface for such configurations: by nature,
// if they exist at all, they vary per codec.
type Encoder func(data ipld.Node, output io.Writer) error
// Decoder is the essential definition of a function that consumes serial data and unfurls it into IPLD Data Model-compatible in-memory representations.
// IPLD Codecs are written by implementing this function interface (as well as (typically) a matched Encoder).
//
// Decoder is the dual of Encoder.
// Most of the documentation for the Encoder function interface
// also applies wholesale to the Decoder interface.
type Decoder func(into ipld.NodeAssembler, input io.Reader) error
type ErrBudgetExhausted struct{}
func (e ErrBudgetExhausted) Error() string {
return "decoder resource budget exhausted (message too long or too complex)"
}
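// The following is an illustrative sketch, not part of the original change:
// a deliberately tiny codec showing the shape of the Encoder and Decoder
// contracts. It handles only string values, and has no multicodec contract.
// encodeStringsOnly serializes a node by writing its string value verbatim.
func encodeStringsOnly(data ipld.Node, output io.Writer) error {
s, err := data.AsString()
if err != nil {
return err // this toy codec rejects any node that isn't a string.
}
_, err = io.WriteString(output, s)
return err
}
// decodeStringsOnly treats the entire input stream as one string value.
// (A real codec would also respect a resource budget while reading.)
func decodeStringsOnly(into ipld.NodeAssembler, input io.Reader) error {
var buf []byte
chunk := make([]byte, 256)
for {
n, err := input.Read(chunk)
buf = append(buf, chunk[:n]...)
if err == io.EOF {
break
}
if err != nil {
return err
}
}
return into.AssignString(string(buf))
}
// Compile-time checks that the sketches match the function interfaces:
var _ Encoder = encodeStringsOnly
var _ Decoder = decodeStringsOnly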
package codectools
import (
"fmt"
"github.com/ipld/go-ipld-prime"
)
type Token struct {
Kind TokenKind
Length int // Present for MapOpen or ListOpen. May be -1 for "unknown" (e.g. a json tokenizer will yield this).
Bool bool // Value. Union: only has meaning if Kind is TokenKind_Bool.
Int int64 // Value. Union: only has meaning if Kind is TokenKind_Int.
Float float64 // Value. Union: only has meaning if Kind is TokenKind_Float.
Str string // Value. Union: only has meaning if Kind is TokenKind_String. ('Str' rather than 'String' to avoid collision with method.)
Bytes []byte // Value. Union: only has meaning if Kind is TokenKind_Bytes.
Link ipld.Link // Value. Union: only has meaning if Kind is TokenKind_Link.
Node ipld.Node // Direct pointer to the original data, if this token is used to communicate data during a walk of existing in-memory data. Absent when token is being used during deserialization.
// The following fields all track position and progress:
// (These may be useful to copy into any error messages if errors arise.)
// (Implementations may assume token reuse and treat these as state keeping;
// you may experience position accounting accuracy problems if *not* reusing tokens or if zeroing these fields.)
pth []ipld.PathSegment // Set by token producers (whether marshallers or deserializers) to track logical position.
offset int64 // Set by deserializers (for both textual or binary formats alike) to track progress.
lineOffset int64 // Set by deserializers that work with textual data. May be ignored by binary deserializers.
columnOffset int64 // Set by deserializers that work with textual data. May be ignored by binary deserializers.
}
func (tk Token) String() string {
switch tk.Kind {
case TokenKind_MapOpen:
return fmt.Sprintf("<%c:%d>", tk.Kind, tk.Length)
case TokenKind_MapClose:
return fmt.Sprintf("<%c>", tk.Kind)
case TokenKind_ListOpen:
return fmt.Sprintf("<%c:%d>", tk.Kind, tk.Length)
case TokenKind_ListClose:
return fmt.Sprintf("<%c>", tk.Kind)
case TokenKind_Null:
return fmt.Sprintf("<%c>", tk.Kind)
case TokenKind_Bool:
return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Bool)
case TokenKind_Int:
return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Int)
case TokenKind_Float:
return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Float)
case TokenKind_String:
return fmt.Sprintf("<%c:%q>", tk.Kind, tk.Str)
case TokenKind_Bytes:
return fmt.Sprintf("<%c:%x>", tk.Kind, tk.Bytes)
case TokenKind_Link:
return fmt.Sprintf("<%c:%v>", tk.Kind, tk.Link)
default:
return "<INVALID>"
}
}
type TokenKind uint8
const (
TokenKind_MapOpen TokenKind = '{'
TokenKind_MapClose TokenKind = '}'
TokenKind_ListOpen TokenKind = '['
TokenKind_ListClose TokenKind = ']'
TokenKind_Null TokenKind = '0'
TokenKind_Bool TokenKind = 'b'
TokenKind_Int TokenKind = 'i'
TokenKind_Float TokenKind = 'f'
TokenKind_String TokenKind = 's'
TokenKind_Bytes TokenKind = 'x'
TokenKind_Link TokenKind = '/'
)
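// Because each TokenKind value is a printable character, Token.String renders
// compactly; for example (illustrative), the map {"a": 1} tokenizes as the
// sequence:
//
//	<{:1> <s:"a"> <i:1> <}>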
type ErrMalformedTokenSequence struct {
Detail string
}
func (e ErrMalformedTokenSequence) Error() string {
return "malformed token sequence: " + e.Detail
}
package codectools
import (
"fmt"
"io"
"github.com/ipld/go-ipld-prime"
"github.com/ipld/go-ipld-prime/codec"
)
// TokenAssemble takes an ipld.NodeAssembler and a TokenReader,
// and repeatedly pumps the TokenReader for tokens and feeds their data into the ipld.NodeAssembler
// until it finishes a complete value.
//
// To compare and contrast with other token-oriented tools:
// TokenAssemble does the same direction of information transfer as the TokenAssembler gadget does,
// but TokenAssemble moves completely through a value in one step,
// whereas the TokenAssembler accepts tokens pumped into it one step at a time.
//
// TokenAssemble does not enforce the Data Model rule that map keys must be strings;
// it will even happily handle recursive structures as map keys,
// meaning it can be used when handling schema values such as maps with complex keys.
func TokenAssemble(na ipld.NodeAssembler, tr TokenReader, budget int) error {
tk, err := tr(&budget)
if err != nil {
return err
}
return tokenAssemble(na, tk, tr, &budget)
}
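// Example (an illustrative sketch, not part of the original change): assembling
// the map {"a": 1} from a hand-written token sequence. The slice-backed
// TokenReader below is hypothetical scaffolding; it can ignore the budget
// because its data is already in memory.
//
//	toks := []Token{
//		{Kind: TokenKind_MapOpen, Length: 1},
//		{Kind: TokenKind_String, Str: "a"},
//		{Kind: TokenKind_Int, Int: 1},
//		{Kind: TokenKind_MapClose},
//	}
//	i := 0
//	nb := basicnode.Prototype.Any.NewBuilder()
//	err := TokenAssemble(nb, func(budget *int) (*Token, error) {
//		if i >= len(toks) {
//			return nil, io.EOF
//		}
//		i++
//		return &toks[i-1], nil
//	}, 1<<10)
//	// If err is nil, nb.Build() now yields the map {"a": 1}.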
func tokenAssemble(na ipld.NodeAssembler, tk *Token, tr TokenReader, budget *int) error {
if *budget < 0 {
return codec.ErrBudgetExhausted{}
}
switch tk.Kind {
case TokenKind_MapOpen:
if tk.Length > 0 && *budget < tk.Length*2 { // Pre-check budget: at least two decrements estimated for each entry.
return codec.ErrBudgetExhausted{}
}
ma, err := na.BeginMap(tk.Length)
if err != nil {
return err
}
for {
// Peek one token. We need to see if the map is about to end or not.
tk, err = tr(budget)
if err != nil {
return err
}
// If the map has ended, invoke the finish operation and check for any errors.
if tk.Kind == TokenKind_MapClose {
return ma.Finish()
}
// Recurse to assemble the key.
*budget-- // Decrement budget by at least one for each key. The key content may also cause further decrements.
if err = tokenAssemble(ma.AssembleKey(), tk, tr, budget); err != nil {
return err
}
// Recurse to assemble the value.
// (We don't really care to peek this token, but do so anyway to keep the calling convention regular.)
tk, err = tr(budget)
if err != nil {
return err
}
*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
if err = tokenAssemble(ma.AssembleValue(), tk, tr, budget); err != nil {
return err
}
// Continue around the loop, to encounter either the next entry or the end of the map.
}
case TokenKind_MapClose:
return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
case TokenKind_ListOpen:
if tk.Length > 0 && *budget < tk.Length { // Pre-check budget: at least one decrement estimated for each entry.
return codec.ErrBudgetExhausted{}
}
la, err := na.BeginList(tk.Length)
if err != nil {
return err
}
for {
// Peek one token. We need to see if the list is about to end or not.
tk, err = tr(budget)
if err != nil {
return err
}
// If the list has ended, invoke the finish operation and check for any errors.
if tk.Kind == TokenKind_ListClose {
return la.Finish()
}
// Recurse to assemble the value.
*budget-- // Decrement budget by at least one for each value. The value content may also cause further decrements.
if err = tokenAssemble(la.AssembleValue(), tk, tr, budget); err != nil {
return err
}
// Continue around the loop, to encounter either the next value or the end of the list.
}
case TokenKind_ListClose:
return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
case TokenKind_Null:
return na.AssignNull()
case TokenKind_Bool:
*budget--
return na.AssignBool(tk.Bool)
case TokenKind_Int:
*budget--
return na.AssignInt(int(tk.Int))
case TokenKind_Float:
*budget--
return na.AssignFloat(tk.Float)
case TokenKind_String:
*budget -= len(tk.Str)
return na.AssignString(tk.Str)
case TokenKind_Bytes:
*budget -= len(tk.Bytes)
return na.AssignBytes(tk.Bytes)
case TokenKind_Link:
*budget--
return na.AssignLink(tk.Link)
default:
panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
}
}
// --- the stepwise assembler system (more complicated; has a userland stack) is below -->
type TokenAssembler struct {
// This structure is designed to be embeddable. Use Initialize when doing so.
stk assemblerStack // a stack tracking the assembler in use at each level of recursion.
budget int64
}
type assemblerStackRow struct {
state uint8 // 0: assign this node; 1: continue list; 2: continue map with key; 3: continue map with value.
na ipld.NodeAssembler // Always present.
la ipld.ListAssembler // At most one of these is present.
ma ipld.MapAssembler // At most one of these is present.
}
type assemblerStack []assemblerStackRow
func (stk assemblerStack) Tip() *assemblerStackRow {
return &stk[len(stk)-1]
}
func (stk *assemblerStack) Push(na ipld.NodeAssembler) {
*stk = append(*stk, assemblerStackRow{na: na})
}
func (stk *assemblerStack) Pop() {
if len(*stk) == 0 {
return
}
*stk = (*stk)[0 : len(*stk)-1]
}
func (ta *TokenAssembler) Initialize(na ipld.NodeAssembler, budget int64) {
if ta.stk == nil {
ta.stk = make(assemblerStack, 0, 10)
} else {
ta.stk = ta.stk[0:0]
}
ta.stk.Push(na)
ta.budget = budget
}
// Process takes a Token pointer as an argument.
// (Notice how this function happens to match the definition of the visitFn that's usable as an argument to TokenWalk.)
// The token argument can be understood to be "borrowed" for the duration of the Process call, but will not be mutated.
// The use of a pointer here is so that a single Token can be reused by multiple calls, avoiding unnecessary allocations.
//
// Note that Process does very little sanity checking of token sequences itself,
// mostly handing information to the NodeAssemblers directly,
// which presumably will reject the data if it is out of line.
// The NodeAssembler this TokenAssembler is wrapping should already be enforcing the relevant logical rules,
// so it is not useful for TokenAssembler.Process to attempt to duplicate those checks;
// TokenAssembler.Process will also return any errors from the NodeAssembler without attempting to enforce a pattern on those errors.
// In particular, TokenAssembler.Process does not check if every MapOpen is paired with a MapClose;
// it does not check if every ListOpen is paired with a ListClose;
// and it does not check if the token stream is continuing after all open recursives have been closed.
// TODO: review this documentation; more of these checks turn out necessary anyway than originally expected.
func (ta *TokenAssembler) Process(tk *Token) (err error) {
if len(ta.stk) == 0 {
return io.EOF
}
tip := ta.stk.Tip()
switch tip.state {
case 0:
switch tk.Kind {
case TokenKind_MapOpen:
tip.ma, err = tip.na.BeginMap(tk.Length)
tip.state = 2
return err
case TokenKind_MapClose:
// Mostly we try to just forward things, but this one we can't avoid checking: tip.ma would be nil, so there's no reasonable target to forward to.
return ErrMalformedTokenSequence{"map close token encountered while not in the middle of a map"}
case TokenKind_ListOpen:
tip.la, err = tip.na.BeginList(tk.Length)
tip.state = 1
return err
case TokenKind_ListClose:
// Mostly we try to just forward things, but this one we can't avoid checking: tip.la would be nil, so there's no reasonable target to forward to.
return ErrMalformedTokenSequence{"list close token encountered while not in the middle of a list"}
case TokenKind_Null:
err = tip.na.AssignNull()
ta.stk.Pop()
return err
case TokenKind_Bool:
err = tip.na.AssignBool(tk.Bool)
ta.stk.Pop()
return err
case TokenKind_Int:
err = tip.na.AssignInt(int(tk.Int)) // TODO: upgrade all of ipld to use high precision int consistently
ta.stk.Pop()
return err
case TokenKind_Float:
err = tip.na.AssignFloat(tk.Float)
ta.stk.Pop()
return err
case TokenKind_String:
err = tip.na.AssignString(tk.Str)
ta.stk.Pop()
return err
case TokenKind_Bytes:
err = tip.na.AssignBytes(tk.Bytes)
ta.stk.Pop()
return err
case TokenKind_Link:
err = tip.na.AssignLink(tk.Link)
ta.stk.Pop()
return err
default:
panic(fmt.Errorf("unrecognized token kind (%q?)", tk.Kind))
}
return nil
case 1:
if tk.Kind == TokenKind_ListClose {
err = tip.la.Finish()
ta.stk.Pop()
return err
}
ta.stk.Push(tip.la.AssembleValue())
return ta.Process(tk)
case 2:
if tk.Kind == TokenKind_MapClose {
err = tip.ma.Finish()
ta.stk.Pop()
return err
}
tip.state = 3
ta.stk.Push(tip.ma.AssembleKey())
return ta.Process(tk)
case 3:
tip.state = 2
ta.stk.Push(tip.ma.AssembleValue())
return ta.Process(tk)
default:
panic("unreachable")
}
}
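// Example (illustrative): the stepwise dual of TokenAssemble -- a caller with
// its own read loop pumps tokens into a TokenAssembler one at a time.
//
//	nb := basicnode.Prototype.Any.NewBuilder()
//	var ta TokenAssembler
//	ta.Initialize(nb, 1<<10)
//	for i := range toks { // 'toks' as in the TokenAssemble example above.
//		if err := ta.Process(&toks[i]); err != nil {
//			// handle the error.
//		}
//	}
//	n := nb.Build()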
package codectools
import (
"io"
"testing"
. "github.com/warpfork/go-wish"
)
func TestTokenAssemble(t *testing.T) {
for _, tcase := range tokenFixtures {
nb := tcase.value.Prototype().NewBuilder()
var readerOffset int
err := TokenAssemble(nb, func(budget *int) (*Token, error) {
if readerOffset >= len(tcase.sequence) {
return nil, io.EOF
}
readerOffset++
return &tcase.sequence[readerOffset-1], nil
}, 1<<10)
if err != nil {
t.Error(err)
}
Wish(t, nb.Build(), ShouldEqual, tcase.value)
}
}
func TestTokenAssembler(t *testing.T) {
for _, tcase := range tokenFixtures {
nb := tcase.value.Prototype().NewBuilder()
var ta TokenAssembler
ta.Initialize(nb, 1<<10)
for _, tk := range tcase.sequence {
err := ta.Process(&tk)
Wish(t, err, ShouldEqual, nil)
}
Wish(t, nb.Build(), ShouldEqual, tcase.value)
}
}
package codectools
import (
"errors"
"fmt"
"io"
"github.com/ipld/go-ipld-prime"
)
// TokenWalk walks an ipld Node and repeatedly calls the visitFn,
// calling it once for every "token" yielded by the walk.
// Every map and list is yielded as a token at their beginning,
// and another token when they're finished;
// every scalar value (strings, bools, bytes, ints, etc) is yielded as a single token.
//
// The token pointer given to the visitFn will be identical on every call,
// but the data it contains will vary.
// The token may contain invalid data that is leftover from previous calls
// in some of its union fields; correct behavior requires looking at the
// token's Kind field before handling any of its other fields.
//
// If any error is returned by the visitFn, it will cause the walk to halt,
// and TokenWalk will return that error.
// However, if the error is the value TokenWalkSkip, and it's been returned
// when visitFn was called with a MapOpen or ListOpen token, the walk will
// skip forward over that entire map or list, and continue (with the
// next token being the close token that complements the open token).
// Returning a TokenWalkSkip when the token was any of the scalar kinds
// (e.g. anything other than a MapOpen or a ListOpen) has no effect.
//
// TokenAssembler is the rough dual of TokenWalk.
func TokenWalk(n ipld.Node, visitFn func(tk *Token) error) error {
// TokenWalk would be trivial to implement over NodeTokenizer,
// but we do a distinct implementation here because NodeTokenizer's resumable implementation means it needs a user-space stack,
// and to reuse that would require allocations which this method (since it's not resumable in the same way) can easily avoid (or at least, keep on the stack).
var tk Token // Allocated once here, and reused for every visit during the walk.
return tokenWalk(&tk, n, visitFn)
}
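// Example (illustrative): printing every token of a node, while skipping the
// contents of any nested list.
//
//	err := TokenWalk(n, func(tk *Token) error {
//		fmt.Println(tk)
//		if tk.Kind == TokenKind_ListOpen {
//			return TokenWalkSkip // jump ahead to the matching ListClose token.
//		}
//		return nil
//	})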
func tokenWalk(tk *Token, n ipld.Node, visitFn func(*Token) error) error {
switch n.ReprKind() {
case ipld.ReprKind_Map:
tk.Kind = TokenKind_MapOpen
tk.Length = n.Length()
tk.Node = n
switch err := visitFn(tk); err {
case nil:
mitr := n.MapIterator()
for !mitr.Done() {
k, v, err := mitr.Next()
if err != nil {
return err
}
if err := tokenWalk(tk, k, visitFn); err != nil {
return err
}
if err := tokenWalk(tk, v, visitFn); err != nil {
return err
}
}
case TokenWalkSkip:
// The visitor asked to skip this map's contents; proceed directly to the close token.
default:
return err
}
tk.Kind = TokenKind_MapClose
tk.Node = n
return visitFn(tk)
case ipld.ReprKind_List:
tk.Kind = TokenKind_ListOpen
tk.Length = n.Length()
tk.Node = n
switch err := visitFn(tk); err {
case nil:
litr := n.ListIterator()
for !litr.Done() {
_, v, err := litr.Next()
if err != nil {
return err
}
if err := tokenWalk(tk, v, visitFn); err != nil {
return err
}
}
case TokenWalkSkip:
// The visitor asked to skip this list's contents; proceed directly to the close token.
default:
return err
}
tk.Kind = TokenKind_ListClose
tk.Node = n
return visitFn(tk)
case ipld.ReprKind_Null:
tk.Kind = TokenKind_Null
return visitFn(tk)
case ipld.ReprKind_Bool:
tk.Kind = TokenKind_Bool
tk.Bool, _ = n.AsBool()
return visitFn(tk)
case ipld.ReprKind_Int:
tk.Kind = TokenKind_Int
i, _ := n.AsInt()
tk.Int = int64(i) // TODO: upgrade all of ipld to use high precision int consistently
return visitFn(tk)
case ipld.ReprKind_Float:
tk.Kind = TokenKind_Float
tk.Float, _ = n.AsFloat()
return visitFn(tk)
case ipld.ReprKind_String:
tk.Kind = TokenKind_String
tk.Str, _ = n.AsString()
return visitFn(tk)
case ipld.ReprKind_Bytes:
tk.Kind = TokenKind_Bytes
tk.Bytes, _ = n.AsBytes()
return visitFn(tk)
case ipld.ReprKind_Link:
tk.Kind = TokenKind_Link
tk.Link, _ = n.AsLink()
return visitFn(tk)
default:
panic(fmt.Errorf("unrecognized node kind (%q?)", n.ReprKind()))
}
return nil
}
var TokenWalkSkip = errors.New("token walk: skip")
// --- the stepwise token producer system (more complicated; has a userland stack) is below -->
// A TokenReader can be produced from any ipld.Node using NodeTokenizer.
// TokenReaders are also commonly implemented by codec packages,
// wherein they're created over a serial data stream and tokenize that stream when pumped.
//
// TokenReader implementations are encouraged to yield the same token pointer repeatedly,
// just varying the contents of the value, in order to avoid unnecessary allocations.
//
// A 'budget' parameter must be provided to a TokenReader as a pointer to an integer.
// The TokenReader should limit how much memory it uses according to the budget remaining.
// (The budget is considered to be roughly in units of bytes, but can be treated as an approximation.)
// The budget should primarily be managed by the caller of the TokenReader
// (e.g., after the TokenReader returns a 20 byte string, the caller should decrement the budget by 20),
// but a TokenReader may also do its own decrements to the budget if some operations are particularly costly and the TokenReader wants this to be accounted for.
// The budget may be ignored if the TokenReader is just yielding access to already in-memory information;
// the main intent of the budget is to avoid resource exhaustion when bringing new data into program memory.
//
type TokenReader func(budget *int) (next *Token, err error)
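// Example (an illustrative sketch): a TokenReader that charges the budget itself
// for the string bytes it brings into memory. The readNextFromStream helper is
// hypothetical. (When paired with a consumer like TokenAssemble, which already
// decrements the budget for string content, a reader would typically leave that
// accounting to the caller, per the convention described above.)
//
//	var tr TokenReader = func(budget *int) (*Token, error) {
//		tk, err := readNextFromStream()
//		if err != nil {
//			return nil, err
//		}
//		if tk.Kind == TokenKind_String {
//			*budget -= len(tk.Str)
//		}
//		if *budget < 0 {
//			return nil, codec.ErrBudgetExhausted{}
//		}
//		return tk, nil
//	}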
type NodeTokenizer struct {
// This structure is designed to be embeddable. Use Initialize when doing so.
tk Token // We embed this to avoid allocations; we'll be repeatedly yielding a pointer to this piece of memory.
stk nodeTokenizerStack
}
func (nt *NodeTokenizer) Initialize(n ipld.Node) {
if nt.stk == nil {
nt.stk = make(nodeTokenizerStack, 0, 10)
} else {
nt.stk = nt.stk[0:0]
}
nt.stk.Push(n)
}
type nodeTokenizerStackRow struct {
state uint8 // 0: start this node; 1: continue list; 2: continue map with key; 3: continue map with value.
n ipld.Node // Always present.
litr ipld.ListIterator // At most one of these is present.
mitr ipld.MapIterator // At most one of these is present.
mval ipld.Node // The value to resume at when in state 3.
}
type nodeTokenizerStack []nodeTokenizerStackRow
func (stk nodeTokenizerStack) Tip() *nodeTokenizerStackRow {
return &stk[len(stk)-1]
}
func (stk *nodeTokenizerStack) Push(n ipld.Node) {
*stk = append(*stk, nodeTokenizerStackRow{n: n})
}
func (stk *nodeTokenizerStack) Pop() {
if len(*stk) == 0 {
return
}
*stk = (*stk)[0 : len(*stk)-1]
}
// ReadToken produces the next token in the walk.
// Note that its signature does not exactly match TokenReader (it takes no budget parameter,
// since the data here is already in memory); wrap it in a trivial closure to use it
// anywhere a TokenReader is required (see the example sketch after this function).
func (nt *NodeTokenizer) ReadToken() (next *Token, err error) {
// How stack depth works:
// - finding that you're starting to handle a map or list leaves it the same;
// - before recursing to handle a child key or value, push stack;
// - any time you finish something, whether scalar or recursive, pop stack.
// This could be written differently: in particular,
// scalar leaves could be handled without increasing stack depth by that last increment.
// However, doing so would make for more complicated code.
// Maybe worth it; PRs welcome; benchmarks first.
if len(nt.stk) == 0 {
return nil, io.EOF
}
tip := nt.stk.Tip()
switch tip.state {
case 0:
switch tip.n.ReprKind() {
case ipld.ReprKind_Map:
nt.tk.Kind = TokenKind_MapOpen
nt.tk.Length = tip.n.Length()
nt.tk.Node = tip.n
tip.state = 2
tip.mitr = tip.n.MapIterator()
return &nt.tk, nil
case ipld.ReprKind_List:
nt.tk.Kind = TokenKind_ListOpen
nt.tk.Length = tip.n.Length()
nt.tk.Node = tip.n
tip.state = 1
tip.litr = tip.n.ListIterator()
return &nt.tk, nil
case ipld.ReprKind_Null:
nt.tk.Kind = TokenKind_Null
nt.stk.Pop()
return &nt.tk, nil
case ipld.ReprKind_Bool:
nt.tk.Kind = TokenKind_Bool
nt.tk.Bool, _ = tip.n.AsBool()
nt.stk.Pop()
return &nt.tk, nil
case ipld.ReprKind_Int:
nt.tk.Kind = TokenKind_Int
i, _ := tip.n.AsInt()
nt.tk.Int = int64(i) // TODO: upgrade all of ipld to use high precision int consistently
nt.stk.Pop()
return &nt.tk, nil
case ipld.ReprKind_Float:
nt.tk.Kind = TokenKind_Float
nt.tk.Float, _ = tip.n.AsFloat()
nt.stk.Pop()
return &nt.tk, nil
case ipld.ReprKind_String:
nt.tk.Kind = TokenKind_String
nt.tk.Str, _ = tip.n.AsString()
nt.stk.Pop()
return &nt.tk, nil
case ipld.ReprKind_Bytes:
nt.tk.Kind = TokenKind_Bytes
nt.tk.Bytes, _ = tip.n.AsBytes()
nt.stk.Pop()
return &nt.tk, nil
case ipld.ReprKind_Link:
nt.tk.Kind = TokenKind_Link
nt.tk.Link, _ = tip.n.AsLink()
nt.stk.Pop()
return &nt.tk, nil
default:
panic(fmt.Errorf("unrecognized node kind (%q?)", tip.n.ReprKind()))
}
case 1:
if tip.litr.Done() {
nt.tk.Kind = TokenKind_ListClose
nt.tk.Node = tip.n
nt.stk.Pop()
return &nt.tk, nil
}
_, v, err := tip.litr.Next()
if err != nil {
return nil, err
}
nt.stk.Push(v)
return nt.ReadToken()
case 2:
if tip.mitr.Done() {
nt.tk.Kind = TokenKind_MapClose
nt.tk.Node = tip.n
nt.stk.Pop()
return &nt.tk, nil
}
k, v, err := tip.mitr.Next()
if err != nil {
return nil, err
}
tip.mval = v
tip.state = 3
nt.stk.Push(k)
return nt.ReadToken()
case 3:
tip.state = 2
nt.stk.Push(tip.mval)
return nt.ReadToken()
default:
panic("unreachable")
}
}
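// Example (illustrative): adapting ReadToken to the TokenReader signature with
// a trivial closure. The budget parameter can be ignored here because the data
// is already in memory.
//
//	var nt NodeTokenizer
//	nt.Initialize(n)
//	var tr TokenReader = func(budget *int) (*Token, error) {
//		return nt.ReadToken()
//	}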
package codectools
import (
"io"
"testing"
. "github.com/warpfork/go-wish"
)
func TestTokenWalk(t *testing.T) {
for _, tcase := range tokenFixtures {
var result []Token
err := TokenWalk(tcase.value, func(tk *Token) error {
result = append(result, *tk)
return nil
})
if err != nil {
t.Error(err)
}
Wish(t, StringifyTokenSequence(result), ShouldEqual, StringifyTokenSequence(tcase.sequence))
}
}
func TestNodeTokenizer(t *testing.T) {
for _, tcase := range tokenFixtures {
var nt NodeTokenizer
var result []Token
nt.Initialize(tcase.value)
for {
tk, err := nt.ReadToken()
if err == nil {
result = append(result, *tk)
} else if err == io.EOF {
break
} else {
t.Error(err)
break
}
}
Wish(t, StringifyTokenSequence(result), ShouldEqual, StringifyTokenSequence(tcase.sequence))
}
}
package codectools
import (
"github.com/ipld/go-ipld-prime"
"github.com/ipld/go-ipld-prime/fluent"
"github.com/ipld/go-ipld-prime/must"
basicnode "github.com/ipld/go-ipld-prime/node/basic"
)
var tokenFixtures = []struct {
value ipld.Node
sequence []Token
}{
{
value: must.Node(fluent.Reflect(basicnode.Prototype.Any,
"a scalar",
)),
sequence: []Token{
{Kind: TokenKind_String, Str: "a scalar"},
},
},
{
value: must.Node(fluent.Reflect(basicnode.Prototype.Any,
map[string]interface{}{
"a": "b",
"c": "d",
},
)),
sequence: []Token{
{Kind: TokenKind_MapOpen, Length: 2},
/**/ {Kind: TokenKind_String, Str: "a"}, {Kind: TokenKind_String, Str: "b"},
/**/ {Kind: TokenKind_String, Str: "c"}, {Kind: TokenKind_String, Str: "d"},
{Kind: TokenKind_MapClose},
},
},
{
value: must.Node(fluent.Reflect(basicnode.Prototype.Any,
map[string]interface{}{
"a": 1,
"b": map[string]interface{}{
"c": "d",
},
},
)),
sequence: []Token{
{Kind: TokenKind_MapOpen, Length: 2},
/**/ {Kind: TokenKind_String, Str: "a"}, {Kind: TokenKind_Int, Int: 1},
/**/ {Kind: TokenKind_String, Str: "b"}, {Kind: TokenKind_MapOpen, Length: 1},
/**/ /**/ {Kind: TokenKind_String, Str: "c"}, {Kind: TokenKind_String, Str: "d"},
/**/ {Kind: TokenKind_MapClose},
{Kind: TokenKind_MapClose},
},
},
{
value: must.Node(fluent.Reflect(basicnode.Prototype.Any,
[]interface{}{
"a",
"b",
"c",
},
)),
sequence: []Token{
{Kind: TokenKind_ListOpen, Length: 3},
/**/ {Kind: TokenKind_String, Str: "a"},
/**/ {Kind: TokenKind_String, Str: "b"},
/**/ {Kind: TokenKind_String, Str: "c"},
{Kind: TokenKind_ListClose},
},
},
}
package codectools
import (
"strings"
)
// Normalize sets any value in the token to its zero value if it's not applicable for the token's kind.
// E.g., if the token kind is string, then the float, bytes, and other value fields are all zeroed.
// Path and offset progress information is left unmodified.
// This is sometimes helpful in writing test fixtures and equality assertions.
func (tk *Token) Normalize() {
if tk.Kind != TokenKind_MapOpen && tk.Kind != TokenKind_ListOpen {
tk.Length = 0
}
if tk.Kind != TokenKind_Bool {
tk.Bool = false
}
if tk.Kind != TokenKind_Int {
tk.Int = 0
}
if tk.Kind != TokenKind_Float {
tk.Float = 0
}
if tk.Kind != TokenKind_String {
tk.Str = ""
}
if tk.Kind != TokenKind_Bytes {
tk.Bytes = nil
}
if tk.Kind != TokenKind_Link {
tk.Link = nil
}
}
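// Example (illustrative): token producers reuse one Token and leave stale data
// in inactive union fields, so normalizing captured copies before a deep
// equality check against hand-written fixtures avoids spurious mismatches.
//
//	for i := range result {
//		result[i].Normalize()
//	}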
// StringifyTokenSequence is a utility function that's often handy for testing.
// (Doing a diff on strings of tokens gives very good reports for minimal effort.)
func StringifyTokenSequence(seq []Token) string {
var sb strings.Builder
for _, tk := range seq {
sb.WriteString(tk.String())
sb.WriteByte('\n')
}
return sb.String()
}