diff --git a/v2/bench_test.go b/v2/bench_test.go
index 83adc346a90ecb72982d03ae3f509c9676e10276..16ae9337839bb63bc622724017184ce7fe28bac1 100644
--- a/v2/bench_test.go
+++ b/v2/bench_test.go
@@ -2,12 +2,20 @@ package car_test
 
 import (
 	"io"
+	"math/rand"
 	"os"
+	"path/filepath"
 	"testing"
 
+	"github.com/ipfs/go-cid"
+	"github.com/ipfs/go-merkledag"
+	"github.com/ipld/go-car/v2/blockstore"
+
 	carv2 "github.com/ipld/go-car/v2"
 )
 
+var rng = rand.New(rand.NewSource(1413))
+
 // BenchmarkReadBlocks instantiates a BlockReader, and iterates over all blocks.
 // It essentially looks at the contents of any CARv1 or CARv2 file.
 // Note that this also uses internal carv1.ReadHeader underneath.
@@ -47,3 +55,118 @@ func BenchmarkReadBlocks(b *testing.B) {
 		}
 	})
 }
+
+// BenchmarkExtractV1File extracts inner CARv1 payload from a sample CARv2 file using ExtractV1File.
+func BenchmarkExtractV1File(b *testing.B) {
+	path := filepath.Join(b.TempDir(), "bench-large-v2.car")
+	generateRandomCarV2File(b, path, 10*1024*1024) // 10 MiB
+	defer os.Remove(path)
+
+	info, err := os.Stat(path)
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(info.Size())
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		// Each worker goroutine extracts into its own destination file.
+		dstPath := filepath.Join(b.TempDir(), "destination.car")
+		for pb.Next() {
+			// Keep the error local to this worker: assigning to the enclosing err from
+			// several workers is a data race. Report it with b.Error and stop this worker,
+			// because b.Fatal must only be called from the goroutine running the benchmark.
+			if err := carv2.ExtractV1File(path, dstPath); err != nil {
+				b.Error(err)
+				return
+			}
+			_ = os.Remove(dstPath)
+		}
+	})
+}
+
+// BenchmarkExtractV1UsingReader extracts inner CARv1 payload from a sample CARv2 file using Reader
+// API. This benchmark is implemented to be used as a comparison in conjunction with
+// BenchmarkExtractV1File.
+func BenchmarkExtractV1UsingReader(b *testing.B) {
+	path := filepath.Join(b.TempDir(), "bench-large-v2.car")
+	generateRandomCarV2File(b, path, 10*1024*1024) // 10 MiB
+	defer os.Remove(path)
+
+	info, err := os.Stat(path)
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(info.Size())
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		dstPath := filepath.Join(b.TempDir(), "destination.car")
+		for pb.Next() {
+			// b.Error instead of b.Fatal: FailNow must only be called from the goroutine
+			// running the benchmark, not from RunParallel workers.
+			if err := extractV1UsingReader(path, dstPath); err != nil {
+				b.Error(err)
+				return
+			}
+		}
+	})
+}
+
+// extractV1UsingReader copies the data payload of the CARv2 file at srcPath into dstPath via the
+// Reader API, closing both the reader and the destination file before returning.
+func extractV1UsingReader(srcPath, dstPath string) (err error) {
+	reader, err := carv2.OpenReader(srcPath)
+	if err != nil {
+		return err
+	}
+	// Ignore close error since only reading from reader.
+	defer reader.Close()
+
+	dst, err := os.Create(dstPath)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		// Surface the close error unless an earlier error is already being returned.
+		if cerr := dst.Close(); err == nil {
+			err = cerr
+		}
+	}()
+
+	_, err = io.Copy(dst, reader.DataReader())
+	return err
+}
+
+// generateRandomCarV2File writes a CARv2 file with no roots at path, putting random 1 KiB raw
+// blocks until at least minTotalBlockSize bytes of block data have been written.
+func generateRandomCarV2File(b *testing.B, path string, minTotalBlockSize int) {
+	b.Helper()
+	bs, err := blockstore.OpenReadWrite(path, []cid.Cid{})
+	if err != nil {
+		b.Fatal(err)
+	}
+	// Defer finalization only once the open has succeeded; deferring it before the error check
+	// would call Finalize on a blockstore that is presumably nil when the open fails.
+	defer func() {
+		if err := bs.Finalize(); err != nil {
+			b.Fatal(err)
+		}
+	}()
+	buf := make([]byte, 1024)
+	var totalBlockSize int
+	for totalBlockSize < minTotalBlockSize {
+		size, err := rng.Read(buf)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		blk := merkledag.NewRawNode(buf)
+		if err := bs.Put(blk); err != nil {
+			b.Fatal(err)
+		}
+		totalBlockSize += size
+	}
+}
diff --git a/v2/writer.go b/v2/writer.go
index 40004648e0f04df609854d3529d48aae0ed4f3d3..91b5340163554e6dc3cf4199e77f81c2b873c082 100644
--- a/v2/writer.go
+++ b/v2/writer.go
@@ -1,6 +1,8 @@
 package car
 
 import (
+	"errors"
+	"fmt"
 	"io"
 	"os"
 
@@ -9,6 +11,9 @@ import (
 	"github.com/ipld/go-car/v2/index"
 )
 
+// ErrAlreadyV1 signals that the given payload is already in CARv1 format.
+var ErrAlreadyV1 = errors.New("already a CARv1")
+
 // WrapV1File is a wrapper around WrapV1 that takes filesystem paths.
 // The source path is assumed to exist, and the destination path is overwritten.
 // Note that the destination path might still be created even if an error
@@ -79,6 +84,109 @@ func WrapV1(src io.ReadSeeker, dst io.Writer) error {
 	return nil
 }
 
+// ExtractV1File takes a CARv2 file and extracts its CARv1 data payload, unmodified.
+// The resulting CARv1 file will not include any data payload padding that may be present in the
+// CARv2 srcPath.
+// If srcPath is already a CARv1 file, ErrAlreadyV1 is returned.
+// The srcPath is assumed to exist, and the destination path is created if it does not exist.
+// Note that the destination path might still be created even if an error
+// occurred.
+// If srcPath and dstPath are the same, then the dstPath is converted, in-place, to CARv1.
+func ExtractV1File(srcPath, dstPath string) (err error) {
+	src, err := os.Open(srcPath)
+	if err != nil {
+		return err
+	}
+
+	// Ignore close error since only reading from src.
+	defer src.Close()
+
+	// Detect CAR version; only version 2 sources can be extracted.
+	version, err := ReadVersion(src)
+	if err != nil {
+		return err
+	}
+	if version == 1 {
+		return ErrAlreadyV1
+	}
+	if version != 2 {
+		return fmt.Errorf("invalid source version: %v", version)
+	}
+
+	// Read CARv2 header to locate data payload.
+	var v2h Header
+	if _, err := v2h.ReadFrom(src); err != nil {
+		return err
+	}
+
+	// TODO consider extracting this into Header.Validate since it is also implemented in BlockReader.
+	// Validate header: the payload must start after the pragma and header, and must not be empty.
+	dataOffset := int64(v2h.DataOffset)
+	if dataOffset < PragmaSize+HeaderSize {
+		return fmt.Errorf("invalid data payload offset: %v", dataOffset)
+	}
+	dataSize := int64(v2h.DataSize)
+	if dataSize <= 0 {
+		return fmt.Errorf("invalid data payload size: %v", dataSize)
+	}
+
+	// Seek to the point where the data payload starts.
+	if _, err := src.Seek(dataOffset, io.SeekStart); err != nil {
+		return err
+	}
+
+	// Open destination as late as possible to minimise unintended file creation in case an error
+	// occurs earlier.
+	// Note, we explicitly do not use os.O_TRUNC here so that we can support in-place extraction.
+	// Otherwise, truncation of an existing file will wipe the data we would be reading from if
+	// source and destination paths are the same.
+	// Later, we do truncate the file to the right size to assert there are no trailing extra bytes.
+	dst, err := os.OpenFile(dstPath, os.O_CREATE|os.O_WRONLY, 0o666)
+	if err != nil {
+		return err
+	}
+
+	defer func() {
+		// Close destination and surface its error only if no earlier error is being returned.
+		cerr := dst.Close()
+		if err == nil {
+			err = cerr
+		}
+	}()
+
+	// Copy data payload over, expecting to write exactly the right number of bytes.
+	// Note that we explicitly use io.CopyN using file descriptors to leverage the SDK's efficient
+	// byte copy which should stay out of userland.
+	// There are two benchmarks to measure this: BenchmarkExtractV1File vs. BenchmarkExtractV1UsingReader
+	written, err := io.CopyN(dst, src, dataSize)
+	if err != nil {
+		return err
+	}
+	if written != dataSize {
+		return fmt.Errorf("expected to write exactly %v but wrote %v", dataSize, written)
+	}
+
+	// Check that the size of the destination file matches the expected size.
+	// If it is bigger, truncate it.
+	// Note, we need to truncate:
+	// - if file is changed in-place, i.e. src and dst paths are the same then index or padding
+	//   could be present after the data payload.
+	// - if an existing file is passed as destination which is different from source and is larger
+	//   than the data payload size.
+	// In general, we want to guarantee that this function produces a correct CARv1 payload in the
+	// destination.
+	stat, err := dst.Stat()
+	if err != nil {
+		return err
+	}
+	if stat.Size() > dataSize {
+		// Truncate to the expected size to assure the resulting file is a correctly sized CARv1.
+		err = dst.Truncate(written)
+	}
+
+	return err
+}
+
 // AttachIndex attaches a given index to an existing CARv2 file at given path and offset.
func AttachIndex(path string, idx index.Index, offset uint64) error { // TODO: instead of offset, maybe take padding? diff --git a/v2/writer_test.go b/v2/writer_test.go index 3cf119cee032a4e97fc5c7984f278e47e58e8b73..c35beb4a425bbc6a5fa9184320c2d2fa5d2b69b3 100644 --- a/v2/writer_test.go +++ b/v2/writer_test.go @@ -59,6 +59,49 @@ func TestWrapV1(t *testing.T) { require.Equal(t, wantIdx, gotIdx) } +func TestExtractV1(t *testing.T) { + // Produce a CARv1 file to test. + dagSvc := dstest.Mock() + v1Src := filepath.Join(t.TempDir(), "original-test-v1.car") + v1f, err := os.Create(v1Src) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, v1f.Close()) }) + require.NoError(t, carv1.WriteCar(context.Background(), dagSvc, generateRootCid(t, dagSvc), v1f)) + _, err = v1f.Seek(0, io.SeekStart) + require.NoError(t, err) + wantV1, err := ioutil.ReadAll(v1f) + require.NoError(t, err) + + // Wrap the produced CARv1 into a CARv2 to use for testing. + v2path := filepath.Join(t.TempDir(), "wrapped-for-extract-test-v2.car") + require.NoError(t, WrapV1File(v1Src, v2path)) + + // Assert extract from CARv2 file is as expected. 
+ dstPath := filepath.Join(t.TempDir(), "extract-file-test-v1.car") + require.NoError(t, ExtractV1File(v2path, dstPath)) + gotFromFile, err := ioutil.ReadFile(dstPath) + require.NoError(t, err) + require.Equal(t, wantV1, gotFromFile) + + // Assert extract from CARv2 file in-place is as expected + require.NoError(t, ExtractV1File(v2path, v2path)) + gotFromInPlaceFile, err := ioutil.ReadFile(v2path) + require.NoError(t, err) + require.Equal(t, wantV1, gotFromInPlaceFile) +} + +func TestExtractV1WithUnknownVersionIsError(t *testing.T) { + dstPath := filepath.Join(t.TempDir(), "extract-dst-file-test-v42.car") + err := ExtractV1File("testdata/sample-rootless-v42.car", dstPath) + require.EqualError(t, err, "invalid source version: 42") +} + +func TestExtractV1FromACarV1IsError(t *testing.T) { + dstPath := filepath.Join(t.TempDir(), "extract-dst-file-test-v1.car") + err := ExtractV1File("testdata/sample-v1.car", dstPath) + require.Equal(t, ErrAlreadyV1, err) +} + func generateRootCid(t *testing.T, adder format.NodeAdder) []cid.Cid { // TODO convert this into a utility testing lib that takes an rng and generates a random DAG with some threshold for depth/breadth. this := merkledag.NewRawNode([]byte("fish"))