Commit 81137942 authored by Masih H. Derkani

Implement utility to extract CARv1 from a CARv2

Implement `ExtractV1File` where the function takes path to a CARv2 file
and efficiently extracts its inner CARv1 payload. Note, the
implementation only supports CARv2 as input and returns a dedicated
error if the supplied input is already in CARv1 format.

Implement benchmarks comparing extraction using `Reader` vs
`ExtractV1File`.

Implement tests that assert in-place extraction, as well as the handling of
invalid input and of both CARv1 and CARv2 input.

Fixes #207
parent 039ddc7c
......@@ -2,12 +2,20 @@ package car_test
import (
"io"
"math/rand"
"os"
"path/filepath"
"testing"
"github.com/ipfs/go-cid"
"github.com/ipfs/go-merkledag"
"github.com/ipld/go-car/v2/blockstore"
carv2 "github.com/ipld/go-car/v2"
)
// rng is the pseudo-random source used to fill generated benchmark blocks. It is seeded with a
// fixed value, so every run draws the same byte sequence.
var rng = rand.New(rand.NewSource(1413))
// BenchmarkReadBlocks instantiates a BlockReader, and iterates over all blocks.
// It essentially looks at the contents of any CARv1 or CARv2 file.
// Note that this also uses internal carv1.ReadHeader underneath.
......@@ -47,3 +55,93 @@ func BenchmarkReadBlocks(b *testing.B) {
}
})
}
// BenchmarkExtractV1File extracts inner CARv1 payload from a sample CARv2 file using ExtractV1File.
// Each parallel worker extracts into its own destination file so workers never write to the same
// path.
func BenchmarkExtractV1File(b *testing.B) {
	// The file lives under b.TempDir, which the testing package removes once the benchmark
	// completes; no explicit removal of path is needed.
	path := filepath.Join(b.TempDir(), "bench-large-v2.car")
	generateRandomCarV2File(b, path, 10*1024*1024) // 10 MiB

	info, err := os.Stat(path)
	if err != nil {
		b.Fatal(err)
	}
	b.SetBytes(info.Size())
	b.ReportAllocs()
	b.ResetTimer()

	b.RunParallel(func(pb *testing.PB) {
		dstPath := filepath.Join(b.TempDir(), "destination.car")
		for pb.Next() {
			// Use a goroutine-local error: workers must not share a variable, and FailNow/Fatal
			// may only be called from the goroutine running the benchmark function. Report with
			// Error and stop this worker instead.
			if err := carv2.ExtractV1File(path, dstPath); err != nil {
				b.Error(err)
				return
			}
			// Best-effort removal so the next iteration creates the destination afresh; a failure
			// here does not affect the measurement and is deliberately ignored.
			_ = os.Remove(dstPath)
		}
	})
}
// BenchmarkExtractV1UsingReader extracts inner CARv1 payload from a sample CARv2 file using Reader
// API. This benchmark is implemented to be used as a comparison in conjunction with
// BenchmarkExtractV1File.
func BenchmarkExtractV1UsingReader(b *testing.B) {
	// The file lives under b.TempDir, which the testing package removes once the benchmark
	// completes; no explicit removal of path is needed.
	path := filepath.Join(b.TempDir(), "bench-large-v2.car")
	generateRandomCarV2File(b, path, 10*1024*1024) // 10 MiB

	info, err := os.Stat(path)
	if err != nil {
		b.Fatal(err)
	}
	b.SetBytes(info.Size())
	b.ReportAllocs()
	b.ResetTimer()

	b.RunParallel(func(pb *testing.PB) {
		dstPath := filepath.Join(b.TempDir(), "destination.car")

		// extract performs a single extraction and releases both the destination file and the
		// reader on every path, so iterations do not accumulate open handles.
		extract := func() (err error) {
			dst, err := os.Create(dstPath)
			if err != nil {
				return err
			}
			defer func() {
				if cerr := dst.Close(); err == nil {
					err = cerr
				}
			}()

			reader, err := carv2.OpenReader(path)
			if err != nil {
				return err
			}
			defer func() {
				if cerr := reader.Close(); err == nil {
					err = cerr
				}
			}()

			_, err = io.Copy(dst, reader.DataReader())
			return err
		}

		for pb.Next() {
			// FailNow/Fatal may only be called from the goroutine running the benchmark function;
			// report with Error and stop this worker instead.
			if err := extract(); err != nil {
				b.Error(err)
				return
			}
		}
	})
}
// generateRandomCarV2File writes a CARv2 file with no roots at path, filling it with raw blocks of
// random bytes drawn from rng until at least minTotalBlockSize bytes of block data have been put.
// Any failure aborts the benchmark via b.Fatal.
func generateRandomCarV2File(b *testing.B, path string, minTotalBlockSize int) {
	b.Helper()

	bs, err := blockstore.OpenReadWrite(path, []cid.Cid{})
	// Check the error before registering the deferred Finalize: b.Fatal unwinds through deferred
	// calls, and finalizing a blockstore that failed to open must not be attempted.
	if err != nil {
		b.Fatal(err)
	}
	defer func() {
		if err := bs.Finalize(); err != nil {
			b.Fatal(err)
		}
	}()

	buf := make([]byte, 1024)
	var totalBlockSize int
	for totalBlockSize < minTotalBlockSize {
		size, err := rng.Read(buf)
		if err != nil {
			b.Fatal(err)
		}
		blk := merkledag.NewRawNode(buf)
		if err := bs.Put(blk); err != nil {
			b.Fatal(err)
		}
		totalBlockSize += size
	}
}
package car
import (
"errors"
"fmt"
"io"
"os"
......@@ -9,6 +11,9 @@ import (
"github.com/ipld/go-car/v2/index"
)
// ErrAlreadyV1 signals that the given payload is already in CARv1 format.
// ExtractV1File returns it when the detected version of the source is 1.
var ErrAlreadyV1 = errors.New("already a CARv1")
// WrapV1File is a wrapper around WrapV1 that takes filesystem paths.
// The source path is assumed to exist, and the destination path is overwritten.
// Note that the destination path might still be created even if an error
......@@ -79,6 +84,109 @@ func WrapV1(src io.ReadSeeker, dst io.Writer) error {
return nil
}
// ExtractV1File takes a CARv2 file and extracts its CARv1 data payload, unmodified.
// The resulting CARv1 file will not include any data payload padding that may be present in the
// CARv2 srcPath.
// If srcPath represents a CARv1, ErrAlreadyV1 is returned; any version other than 1 or 2 results
// in an "invalid source version" error.
// The srcPath is assumed to exist, and the destination path is created if it does not exist.
// Note that the destination path might still be created even if an error
// occurred.
// If srcPath and dstPath are the same, then the dstPath is converted, in-place, to CARv1.
func ExtractV1File(srcPath, dstPath string) (err error) {
	src, err := os.Open(srcPath)
	if err != nil {
		return err
	}
	// Ignore close error since only reading from src.
	defer src.Close()

	// Detect CAR version; only CARv2 input is accepted.
	version, err := ReadVersion(src)
	if err != nil {
		return err
	}
	switch version {
	case 2:
		// Supported; carry on.
	case 1:
		return ErrAlreadyV1
	default:
		return fmt.Errorf("invalid source version: %v", version)
	}

	// Read CARv2 header to locate data payload.
	var v2h Header
	if _, err := v2h.ReadFrom(src); err != nil {
		return err
	}

	// TODO consider extracting this into Header.Validate since it is also implemented in BlockReader.
	// Validate header: the payload cannot start before the end of the pragma and header, and it
	// must be non-empty. The signed conversions also reject values that do not fit in an int64,
	// since those come out negative.
	dataOffset := int64(v2h.DataOffset)
	if dataOffset < PragmaSize+HeaderSize {
		return fmt.Errorf("invalid data payload offset: %v", dataOffset)
	}
	dataSize := int64(v2h.DataSize)
	if dataSize <= 0 {
		return fmt.Errorf("invalid data payload size: %v", dataSize)
	}

	// Seek to the point where the data payload starts.
	if _, err := src.Seek(dataOffset, io.SeekStart); err != nil {
		return err
	}

	// Open destination as late as possible to minimise unintended file creation in case an error
	// occurs earlier.
	// Note, we explicitly do not use os.O_TRUNC here so that we can support in-place extraction.
	// Otherwise, truncation of an existing file will wipe the data we would be reading from if
	// source and destination paths are the same.
	// Later, we do truncate the file to the right size to assert there are no trailing extra bytes.
	dst, err := os.OpenFile(dstPath, os.O_CREATE|os.O_WRONLY, 0o666)
	if err != nil {
		return err
	}
	defer func() {
		// Close destination and surface the close error only if nothing else failed first.
		if cerr := dst.Close(); err == nil {
			err = cerr
		}
	}()

	// Copy data payload over, expecting to write exactly the right number of bytes.
	// Note that we explicitly use io.CopyN using file descriptors to leverage the SDK's efficient
	// byte copy which should stay out of userland.
	// There are two benchmarks to measure this: BenchmarkExtractV1File vs. BenchmarkExtractV1UsingReader
	//
	// io.CopyN reports a nil error if and only if exactly dataSize bytes were copied, so no
	// separate check of the written count is needed.
	if _, err = io.CopyN(dst, src, dataSize); err != nil {
		return err
	}

	// Check that the size of the destination file matches the expected size.
	// If bigger, truncate.
	// Note, we need to truncate:
	// - if file is changed in-place, i.e. src and dst paths are the same then index or padding
	//   could be present after the data payload.
	// - if an existing file is passed as destination which is different from source and is larger
	//   than the data payload size.
	// In general, we want to guarantee that this function produces a correct CARv1 payload in
	// destination.
	stat, err := dst.Stat()
	if err != nil {
		return err
	}
	if stat.Size() > dataSize {
		// Truncate to the expected size to assure the resulting file is a correctly sized CARv1.
		return dst.Truncate(dataSize)
	}
	return nil
}
// AttachIndex attaches a given index to an existing CARv2 file at given path and offset.
func AttachIndex(path string, idx index.Index, offset uint64) error {
// TODO: instead of offset, maybe take padding?
......
......@@ -59,6 +59,49 @@ func TestWrapV1(t *testing.T) {
require.Equal(t, wantIdx, gotIdx)
}
// TestExtractV1 wraps a freshly written CARv1 into a CARv2 and checks that ExtractV1File gives back
// the original CARv1 bytes, both when extracting to a separate file and when extracting in-place.
func TestExtractV1(t *testing.T) {
	// Write the reference CARv1 to disk and keep its bytes as the expected extraction result.
	svc := dstest.Mock()
	v1Path := filepath.Join(t.TempDir(), "original-test-v1.car")
	v1File, err := os.Create(v1Path)
	require.NoError(t, err)
	t.Cleanup(func() { require.NoError(t, v1File.Close()) })

	roots := generateRootCid(t, svc)
	require.NoError(t, carv1.WriteCar(context.Background(), svc, roots, v1File))
	_, err = v1File.Seek(0, io.SeekStart)
	require.NoError(t, err)
	expected, err := ioutil.ReadAll(v1File)
	require.NoError(t, err)

	// requireContent asserts that the file at path holds exactly the reference CARv1 bytes.
	requireContent := func(path string) {
		actual, err := ioutil.ReadFile(path)
		require.NoError(t, err)
		require.Equal(t, expected, actual)
	}

	// The CARv2 under test is the reference CARv1 wrapped by WrapV1File.
	wrappedPath := filepath.Join(t.TempDir(), "wrapped-for-extract-test-v2.car")
	require.NoError(t, WrapV1File(v1Path, wrappedPath))

	// Extraction into a separate destination file.
	extractedPath := filepath.Join(t.TempDir(), "extract-file-test-v1.car")
	require.NoError(t, ExtractV1File(wrappedPath, extractedPath))
	requireContent(extractedPath)

	// In-place extraction: source and destination are the same path.
	require.NoError(t, ExtractV1File(wrappedPath, wrappedPath))
	requireContent(wrappedPath)
}
// TestExtractV1WithUnknownVersionIsError checks that a source declaring version 42 is rejected
// with the "invalid source version" error.
func TestExtractV1WithUnknownVersionIsError(t *testing.T) {
	const src = "testdata/sample-rootless-v42.car"
	dst := filepath.Join(t.TempDir(), "extract-dst-file-test-v42.car")

	got := ExtractV1File(src, dst)

	require.EqualError(t, got, "invalid source version: 42")
}
// TestExtractV1FromACarV1IsError checks that passing a CARv1 file as the source yields
// ErrAlreadyV1.
func TestExtractV1FromACarV1IsError(t *testing.T) {
	const src = "testdata/sample-v1.car"
	dst := filepath.Join(t.TempDir(), "extract-dst-file-test-v1.car")

	got := ExtractV1File(src, dst)

	require.Equal(t, ErrAlreadyV1, got)
}
func generateRootCid(t *testing.T, adder format.NodeAdder) []cid.Cid {
// TODO convert this into a utility testing lib that takes an rng and generates a random DAG with some threshold for depth/breadth.
this := merkledag.NewRawNode([]byte("fish"))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment