Commit d3ddbfee authored by Eric Myhre's avatar Eric Myhre

Port codecs; add benchmarks.

Porting codecs to the new NodeAssembler interfaces was straightforward.

The new codecs exist in the nodesolution "research" dirs for now,
coexisting with the soon-to-be-legacy encoding package.
This means we can see benchmarks of both the old and new designs within
this commit.  (We'll probably give up on this shortly -- when dealing
with the traversal package too, it's gonna stop being reasonable -- but
for now it's still possible and provides interesting information.)

And how *is* that performance, you ask?

Peachy.

Ballpark answers for marshalling:

- 1079ns/op for the new Node
- 1435ns/op for the old Node
- 1559ns/op for stdlib json marshal of a native map.

Roughly 1.44x the throughput of stdlib json (about 31% less time per operation) is pretty acceptable.
(Will more intense codegen beat that?  Oh for sure.  But this is
*without any codegen*, so this is quite satisfactory.)

Note that much of that time left is probably dominated by
serialization-related allocations rather than the node traversal.
I didn't dive into the pprofs to verify that yet, though.
This picture of the overall act of marshalling is nice to have
since it's a practical end-to-end user story.

This test is also on a very small piece of data, and I expect
the improvements will be even bigger on larger or
more deeply recursive structures.

And lest this be skimmed over: the excellence of doing better than
stdlib's json **while having pluggable codecs** cannot be overstated.

Pretty happy with this.

How's unmarshal?  Eh.  About the same as before.  Remember, we chose
*not* to do a lot of amortizations in the new 'basicnode'
implementations, because although we *could* (and it's quite clear how
to do so), the increase in memory size we'd face since go doesn't allow
unions was deemed too large of a constant factor multiplier.
We *will* see these improvements in codegen, and we can also make
variants of 'basicnode' that do these amortizations in the future.

Doing a lot of thinking about how benchmarks and tests will be managed
as they continue to grow in count and in variation of semantic targets.
Might have to write some tooling around it.  We'll see.
parent 530ccd68
package codec
import (
"fmt"
"github.com/polydawn/refmt/shared"
"github.com/polydawn/refmt/tok"
ipld "github.com/ipld/go-ipld-prime/_rsrch/nodesolution"
)
// FUTURE there are very open questions on how to handle detection and special-track'ing for advLayout nodes when we get to that feature.
// Marshal walks a node and emits it as a stream of refmt tokens.
// Pairing it with a refmt TokenSink for cbor or for json yields that encoding.
//
// Every data model kind is handled except links: those are only supported
// when the nodes are typed and carry type info that clarifies how links
// should be encoded. (Use the dag-cbor or dag-json formats when links are of
// CID implementation and must be encoded in a schemafree way.)
//
// An error is returned for invalid (undefined) nodes, for links, and for any
// failure reported by the node itself or by the sink.
func Marshal(n ipld.Node, sink shared.TokenSink) error {
	// A single token is reused for every Step issued at this recursion level.
	var tk tok.Token
	switch n.ReprKind() {
	case ipld.ReprKind_Invalid:
		return fmt.Errorf("cannot traverse a node that is undefined")
	case ipld.ReprKind_Link:
		return fmt.Errorf("link emission not supported by this codec without a schema! (maybe you want dag-cbor or dag-json)")
	case ipld.ReprKind_Null:
		tk.Type = tok.TNull
	case ipld.ReprKind_Map:
		// Open the map, announcing its length up front.
		tk.Type, tk.Length = tok.TMapOpen, n.Length()
		if _, err := sink.Step(&tk); err != nil {
			return err
		}
		// Each entry is a string key token followed by the recursively
		// marshalled value.
		entries := n.MapIterator()
		for !entries.Done() {
			key, val, err := entries.Next()
			if err != nil {
				return err
			}
			keyStr, err := key.AsString()
			if err != nil {
				return err
			}
			tk.Type, tk.Str = tok.TString, keyStr
			if _, err := sink.Step(&tk); err != nil {
				return err
			}
			if err := Marshal(val, sink); err != nil {
				return err
			}
		}
		// The closing token is emitted by the shared Step below.
		tk.Type = tok.TMapClose
	case ipld.ReprKind_List:
		// Open the list, announcing its length up front.
		count := n.Length()
		tk.Type, tk.Length = tok.TArrOpen, count
		if _, err := sink.Step(&tk); err != nil {
			return err
		}
		// Elements are marshalled recursively, in index order.
		for idx := 0; idx < count; idx++ {
			elem, err := n.LookupIndex(idx)
			if err != nil {
				return err
			}
			if err := Marshal(elem, sink); err != nil {
				return err
			}
		}
		// The closing token is emitted by the shared Step below.
		tk.Type = tok.TArrClose
	case ipld.ReprKind_Bool:
		val, err := n.AsBool()
		if err != nil {
			return err
		}
		tk.Type, tk.Bool = tok.TBool, val
	case ipld.ReprKind_Int:
		val, err := n.AsInt()
		if err != nil {
			return err
		}
		tk.Type, tk.Int = tok.TInt, int64(val)
	case ipld.ReprKind_Float:
		val, err := n.AsFloat()
		if err != nil {
			return err
		}
		tk.Type, tk.Float64 = tok.TFloat64, val
	case ipld.ReprKind_String:
		val, err := n.AsString()
		if err != nil {
			return err
		}
		tk.Type, tk.Str = tok.TString, val
	case ipld.ReprKind_Bytes:
		val, err := n.AsBytes()
		if err != nil {
			return err
		}
		tk.Type, tk.Bytes = tok.TBytes, val
	default:
		panic("unreachable")
	}
	// Scalars, and the closing token of maps and lists, all finish with
	// exactly one Step of the prepared token.
	_, err := sink.Step(&tk)
	return err
}
package codec
import (
"fmt"
"math"
"github.com/polydawn/refmt/shared"
"github.com/polydawn/refmt/tok"
ipld "github.com/ipld/go-ipld-prime/_rsrch/nodesolution"
)
// wishlist: if we could reconstruct the ipld.Path of an error while
// *unwinding* from that error... that'd be nice.
// (trying to build it proactively would waste tons of allocs on the happy path.)
// we can do this; it just requires well-typed errors and a bunch of work.
// Tests for all this are in the ipld.Node impl tests!
// They're effectively doing double duty: testing the builders, too.
// (Is that sensible? Should it be refactored? Not sure; maybe!)
// Unmarshal provides a very general tokens-to-node unmarshalling feature.
// It can handle either cbor or json by being combined with a refmt TokenSource.
//
// The unmarshalled data is fed to the given NodeAssembler, which accumulates it;
// at the end, any error is returned from the Unmarshal method,
// and the user can pick up the finished Node from wherever their assembler has it.
// Typical usage might look like the following:
//
//	nb := basicnode.Style__Any{}.NewBuilder()
//	err := codec.Unmarshal(nb, json.NewDecoder(reader))
//	n := nb.Build()
//
// It is valid for all the data model types except links, which are only
// supported if the nodes are typed and provide additional information
// to clarify how links should be decoded through their type info.
// (The dag-cbor and dag-json formats can be used if links are of CID
// implementation and need to be decoded in a schemafree way.)
//
// An error is returned if the token source fails, if the token stream ends
// before a complete value was seen, or if the assembler rejects the data.
func Unmarshal(na ipld.NodeAssembler, tokSrc shared.TokenSource) error {
	var tk tok.Token
	// Prime the first token; the recursive worker expects to start with one in hand.
	done, err := tokSrc.Step(&tk)
	if err != nil {
		return err
	}
	// A source that reports completion without having yielded a value token
	// means the input was truncated. Report it as an error: malformed input
	// must not crash the caller.
	if done && !tk.Type.IsValue() {
		return fmt.Errorf("unexpected eof")
	}
	return unmarshal(na, tokSrc, &tk)
}
// TODO, yeah, you need a method that does a better oneliner, and that Unmarshal thing above needs a longer name to correspond to its windiness.
// unmarshal is the recursive worker behind Unmarshal.
// It starts with the first token already primed in tk. This is necessary to
// get recursion to flow right without a peek+unpeek system: a list has to read
// a token to discover whether it is a close or the start of the next element,
// and in the latter case hands that same token down.
//
// It returns any error from the token source or from the assembler, and
// reports errors of its own for structurally unexpected tokens, for
// containers whose observed length disagrees with their declared length, and
// for integers that do not fit in the platform's int type.
func unmarshal(na ipld.NodeAssembler, tokSrc shared.TokenSource, tk *tok.Token) error {
	// FUTURE: check for typed.NodeBuilder that's going to parse a Link (they can slurp any token kind they want).
	switch tk.Type {
	case tok.TMapOpen:
		// A length of -1 means the length was not declared; in that case
		// math.MaxInt32 is used as a "no limit" sentinel and nothing is preallocated.
		expectLen := tk.Length
		allocLen := tk.Length
		if tk.Length == -1 {
			expectLen = math.MaxInt32
			allocLen = 0
		}
		ma, err := na.BeginMap(allocLen)
		if err != nil {
			return err
		}
		observedLen := 0
		for {
			_, err := tokSrc.Step(tk)
			if err != nil {
				return err
			}
			switch tk.Type {
			case tok.TMapClose:
				if expectLen != math.MaxInt32 && observedLen != expectLen {
					return fmt.Errorf("unexpected mapClose before declared length")
				}
				return ma.Finish()
			case tok.TString:
				// continue
			default:
				return fmt.Errorf("unexpected %s token while expecting map key", tk.Type)
			}
			observedLen++
			if observedLen > expectLen {
				return fmt.Errorf("unexpected continuation of map elements beyond declared length")
			}
			mva, err := ma.AssembleDirectly(tk.Str)
			if err != nil { // return in error if the key was rejected
				return err
			}
			// The value's first token has not been read yet, so recurse
			// through the entry point that primes it.
			err = Unmarshal(mva, tokSrc)
			if err != nil { // return in error if some part of the recursion errored
				return err
			}
		}
	case tok.TMapClose:
		return fmt.Errorf("unexpected mapClose token")
	case tok.TArrOpen:
		// Same length conventions as for maps above.
		expectLen := tk.Length
		allocLen := tk.Length
		if tk.Length == -1 {
			expectLen = math.MaxInt32
			allocLen = 0
		}
		la, err := na.BeginList(allocLen)
		if err != nil {
			return err
		}
		observedLen := 0
		for {
			_, err := tokSrc.Step(tk)
			if err != nil {
				return err
			}
			switch tk.Type {
			case tok.TArrClose:
				if expectLen != math.MaxInt32 && observedLen != expectLen {
					return fmt.Errorf("unexpected arrClose before declared length")
				}
				return la.Finish()
			default:
				observedLen++
				if observedLen > expectLen {
					return fmt.Errorf("unexpected continuation of array elements beyond declared length")
				}
				// The token just read is the element's first token; pass it down already primed.
				err := unmarshal(la.AssembleValue(), tokSrc, tk)
				if err != nil { // return in error if some part of the recursion errored
					return err
				}
			}
		}
	case tok.TArrClose:
		return fmt.Errorf("unexpected arrClose token")
	case tok.TNull:
		return na.AssignNull()
	case tok.TString:
		return na.AssignString(tk.Str)
	case tok.TBytes:
		return na.AssignBytes(tk.Bytes)
	case tok.TBool:
		return na.AssignBool(tk.Bool)
	case tok.TInt:
		// int may be narrower than int64 (e.g. on 32-bit platforms); a
		// round-trip mismatch means the conversion truncated the value.
		v := int(tk.Int)
		if int64(v) != tk.Int {
			return fmt.Errorf("integer %d overflows the platform int type", tk.Int)
		}
		return na.AssignInt(v)
	case tok.TUint:
		// Values above the largest int would wrap around to negative numbers.
		const maxInt = int(^uint(0) >> 1)
		if tk.Uint > uint64(maxInt) {
			return fmt.Errorf("unsigned integer %d overflows the platform int type", tk.Uint)
		}
		return na.AssignInt(int(tk.Uint))
	case tok.TFloat64:
		return na.AssignFloat(tk.Float64)
	default:
		panic("unreachable")
	}
}
......@@ -30,3 +30,12 @@ func BenchmarkMapStrInt_25n_AssembleDirectly(b *testing.B) {
// BenchmarkMapStrInt_25n_Iteration runs the shared
// tests.SpecBenchmarkMapStrInt_25n_Iteration spec against Style__Map.
func BenchmarkMapStrInt_25n_Iteration(b *testing.B) {
tests.SpecBenchmarkMapStrInt_25n_Iteration(b, Style__Map{})
}
// BenchmarkUnmarshalMapStrInt_3n runs the shared
// tests.SpecBenchmarkUnmarshalMapStrInt_3n spec against Style__Map.
func BenchmarkUnmarshalMapStrInt_3n(b *testing.B) {
tests.SpecBenchmarkUnmarshalMapStrInt_3n(b, Style__Map{})
}
// BenchmarkMarshalMapStrInt_3n runs the shared
// tests.SpecBenchmarkMarshalMapStrInt_3n spec against Style__Map.
func BenchmarkMarshalMapStrInt_3n(b *testing.B) {
tests.SpecBenchmarkMarshalMapStrInt_3n(b, Style__Map{})
}
......@@ -28,6 +28,15 @@ func BenchmarkMapStrInt_3n_BaselineJsonUnmarshalMapSimpleKeys(b *testing.B) {
}
}
// BenchmarkMapStrInt_3n_BaselineJsonMarshalMapSimpleKeys measures json.Marshal
// on a native three-entry map[string]int, giving a baseline figure to hold the
// node-based marshalling benchmarks against.
func BenchmarkMapStrInt_3n_BaselineJsonMarshalMapSimpleKeys(b *testing.B) {
	fixture := map[string]int{"whee": 1, "woot": 2, "waga": 3}
	for iter := 0; iter < b.N; iter++ {
		encoded, err := json.Marshal(fixture)
		must.NotError(err)
		// Store the output in the package-level sink so the result is used.
		sink = encoded
	}
}
// NOTE(review): sink_s and sink_i are package-level variables whose uses fall
// outside this hunk; presumably they are typed sinks that benchmarks assign
// results to — confirm against the rest of the file.
var sink_s string
var sink_i int
......
package tests
import (
"bytes"
"testing"
refmtjson "github.com/polydawn/refmt/json"
ipld "github.com/ipld/go-ipld-prime/_rsrch/nodesolution"
"github.com/ipld/go-ipld-prime/_rsrch/nodesolution/codec"
"github.com/ipld/go-ipld-prime/must"
)
// All of the marshalling and unmarshalling benchmark specs use JSON.
// This does mean we're measuring a bunch of stuff that has nothing to do
// with the core operations of the Node/NodeBuilder interface.
// We do this so that:
// - we get a reasonable picture of how much time is spent in the IPLD Data Model
// versus how much time is spent in the serialization efforts;
// - we can make direct comparisons to the standard library json marshalling
// and unmarshalling, thus having a back-of-the-envelope baseline to compare.
// SpecBenchmarkMarshalMapStrInt_3n measures JSON marshalling of a three-entry
// string-to-int map held in a node produced by the given NodeStyle.
//
// The node is decoded from a JSON fixture before the timer is reset, so only
// the codec.Marshal calls (each into a fresh buffer) are timed. The error of
// the final iteration is checked after the loop and causes a panic if non-nil.
func SpecBenchmarkMarshalMapStrInt_3n(b *testing.B, ns ipld.NodeStyle) {
	// Setup: build the fixture node once, outside the timed region.
	builder := ns.NewBuilder()
	fixture := bytes.NewBufferString(`{"whee":1,"woot":2,"waga":3}`)
	must.NotError(codec.Unmarshal(builder, refmtjson.NewDecoder(fixture)))
	node := builder.Build()

	b.ResetTimer()
	var lastErr error
	for iter := 0; iter < b.N; iter++ {
		var out bytes.Buffer
		lastErr = codec.Marshal(node, refmtjson.NewEncoder(&out, refmtjson.EncodeOptions{}))
		sink = out
	}
	if lastErr != nil {
		panic(lastErr)
	}
}
package tests
import (
"bytes"
"testing"
refmtjson "github.com/polydawn/refmt/json"
ipld "github.com/ipld/go-ipld-prime/_rsrch/nodesolution"
"github.com/ipld/go-ipld-prime/_rsrch/nodesolution/codec"
)
// All of the marshalling and unmarshalling benchmark specs use JSON.
// This does mean we're measuring a bunch of stuff that has nothing to do
// with the core operations of the Node/NodeBuilder interface.
// We do this so that:
// - we get a reasonable picture of how much time is spent in the IPLD Data Model
// versus how much time is spent in the serialization efforts;
// - we can make direct comparisons to the standard library json marshalling
// and unmarshalling, thus having a back-of-the-envelope baseline to compare.
// SpecBenchmarkUnmarshalMapStrInt_3n measures JSON unmarshalling of a
// three-entry string-to-int map into a node built by the given NodeStyle.
//
// Every iteration creates a fresh builder and decoder, unmarshals the fixture,
// and stores the built node in the package-level sink. The error is checked
// before Build is called, so a failed decode panics with the real error
// immediately instead of calling Build on a builder whose assembly failed, and
// an error from an early iteration can no longer be overwritten by a later one.
func SpecBenchmarkUnmarshalMapStrInt_3n(b *testing.B, ns ipld.NodeStyle) {
	for i := 0; i < b.N; i++ {
		nb := ns.NewBuilder()
		err := codec.Unmarshal(nb, refmtjson.NewDecoder(bytes.NewBufferString(`{"whee":1,"woot":2,"waga":3}`)))
		if err != nil {
			panic(err)
		}
		sink = nb.Build()
	}
}
......@@ -5,6 +5,7 @@ import (
"testing"
ipld "github.com/ipld/go-ipld-prime"
"github.com/ipld/go-ipld-prime/tests"
)
// sink is a package-level variable of empty-interface type.
// NOTE(review): its assignments fall outside this hunk; presumably benchmarks
// store results here so the measured work is observably used — confirm.
var sink interface{}
......@@ -83,6 +84,16 @@ func BenchmarkMap25nGenericMapIterationSimpleKeys(b *testing.B) {
}
}
// benchmarks covering encoding -->
// BenchmarkUnmarshalMapStrInt_3n runs the shared
// tests.SpecBenchmarkUnmarshalMapStrInt_3n spec with this package's NodeBuilder().
func BenchmarkUnmarshalMapStrInt_3n(b *testing.B) {
tests.SpecBenchmarkUnmarshalMapStrInt_3n(b, NodeBuilder())
}
// BenchmarkMarshalMapStrInt_3n runs the shared
// tests.SpecBenchmarkMarshalMapStrInt_3n spec with this package's NodeBuilder().
func BenchmarkMarshalMapStrInt_3n(b *testing.B) {
tests.SpecBenchmarkMarshalMapStrInt_3n(b, NodeBuilder())
}
// copy of helper functions from must package, because import cycles, sigh -->
func mustNotError(e error) {
......
package tests
import (
"bytes"
"testing"
refmtjson "github.com/polydawn/refmt/json"
ipld "github.com/ipld/go-ipld-prime"
"github.com/ipld/go-ipld-prime/encoding"
)
// All of the marshalling and unmarshalling benchmark specs use JSON.
// This does mean we're measuring a bunch of stuff that has nothing to do
// with the core operations of the Node/NodeBuilder interface.
// We do this so that:
// - we get a reasonable picture of how much time is spent in the IPLD Data Model
// versus how much time is spent in the serialization efforts;
// - we can make direct comparisons to the standard library json marshalling
// and unmarshalling, thus having a back-of-the-envelope baseline to compare.
// SpecBenchmarkMarshalMapStrInt_3n measures JSON marshalling, through the
// encoding package, of a three-entry string-to-int map node.
//
// The node is decoded from a JSON fixture with the given NodeBuilder before
// the timer is reset, so only the encoding.Marshal calls (each into a fresh
// buffer) are timed. A setup failure panics at once; the error of the final
// iteration is checked after the loop and panics if non-nil.
func SpecBenchmarkMarshalMapStrInt_3n(b *testing.B, nb ipld.NodeBuilder) {
	// Setup: build the fixture node once, outside the timed region.
	fixture := bytes.NewBufferString(`{"whee":1,"woot":2,"waga":3}`)
	node, err := encoding.Unmarshal(nb, refmtjson.NewDecoder(fixture))
	if err != nil {
		panic(err)
	}

	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		var out bytes.Buffer
		err = encoding.Marshal(node, refmtjson.NewEncoder(&out, refmtjson.EncodeOptions{}))
		sink = out
	}
	if err != nil {
		panic(err)
	}
}
package tests
import (
"bytes"
"testing"
refmtjson "github.com/polydawn/refmt/json"
ipld "github.com/ipld/go-ipld-prime"
"github.com/ipld/go-ipld-prime/encoding"
)
// All of the marshalling and unmarshalling benchmark specs use JSON.
// This does mean we're measuring a bunch of stuff that has nothing to do
// with the core operations of the Node/NodeBuilder interface.
// We do this so that:
// - we get a reasonable picture of how much time is spent in the IPLD Data Model
// versus how much time is spent in the serialization efforts;
// - we can make direct comparisons to the standard library json marshalling
// and unmarshalling, thus having a back-of-the-envelope baseline to compare.
// sink is a package-level variable that the benchmark specs in this package
// assign their results to (presumably so the measured work cannot be
// discarded as unused — confirm).
var sink interface{}
// SpecBenchmarkUnmarshalMapStrInt_3n measures JSON unmarshalling, through the
// encoding package, of a three-entry string-to-int map using the given
// NodeBuilder.
//
// Every iteration decodes the fixture from a fresh buffer and stores the
// resulting node in the package-level sink. The error of the final iteration
// is checked after the loop and causes a panic if non-nil.
func SpecBenchmarkUnmarshalMapStrInt_3n(b *testing.B, nb ipld.NodeBuilder) {
	var lastErr error
	for iter := 0; iter < b.N; iter++ {
		src := refmtjson.NewDecoder(bytes.NewBufferString(`{"whee":1,"woot":2,"waga":3}`))
		sink, lastErr = encoding.Unmarshal(nb, src)
	}
	if lastErr != nil {
		panic(lastErr)
	}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment