Commit c514a301 authored by Masih H. Derkani's avatar Masih H. Derkani

Document performance caveats of `ExtractV1File` and address comments

Since `CopyFileRange` performance is OS-dependant, we cannot guarantee
that `ExtractV1File` will keep copies out of user-space. For example, on
Linux with sufficiently old Kernel the current behaviour will fall back
to user-space copy. Document this on the function so that it is made
clear.

Improve benchmarking determinism and error messages.

The benchmark numbers that Daniel obtained on his laptop,
running Linux 5.13 on an i5-8350U with /tmp being tmpfs,
are as follows.
The "old" results are BenchmarkExtractV1UsingReader,
and the "new" are BenchmarkExtractV1File.

	name         old time/op    new time/op    delta
	ExtractV1-8    1.33ms ± 1%    1.11ms ± 2%  -16.48%  (p=0.000 n=8+8)

	name         old speed      new speed      delta
	ExtractV1-8  7.88GB/s ± 1%  9.43GB/s ± 2%  +19.74%  (p=0.000 n=8+8)

	name         old alloc/op   new alloc/op   delta
	ExtractV1-8    34.0kB ± 0%     1.0kB ± 0%  -97.09%  (p=0.000 n=8+8)

	name         old allocs/op  new allocs/op  delta
	ExtractV1-8      26.0 ± 0%      23.0 ± 0%  -11.54%  (p=0.000 n=8+8)

So, at least in the case where the filesystem is very fast,
we can see that the benefit is around 10-20%,
as well as fewer allocs thanks to not needing a user-space buffer.
The performance benefit will likely be smaller on slower disks.

For the Linux syscall logic, see:
- https://cs.opensource.google/go/go/+/refs/tags/go1.16.7:src/internal/poll/copy_file_range_linux.go;drc=refs%2Ftags%2Fgo1.16.7;l=54
parent 81137942
...@@ -14,8 +14,6 @@ import ( ...@@ -14,8 +14,6 @@ import (
carv2 "github.com/ipld/go-car/v2" carv2 "github.com/ipld/go-car/v2"
) )
var rng = rand.New(rand.NewSource(1413))
// BenchmarkReadBlocks instantiates a BlockReader, and iterates over all blocks. // BenchmarkReadBlocks instantiates a BlockReader, and iterates over all blocks.
// It essentially looks at the contents of any CARv1 or CARv2 file. // It essentially looks at the contents of any CARv1 or CARv2 file.
// Note that this also uses internal carv1.ReadHeader underneath. // Note that this also uses internal carv1.ReadHeader underneath.
...@@ -59,7 +57,7 @@ func BenchmarkReadBlocks(b *testing.B) { ...@@ -59,7 +57,7 @@ func BenchmarkReadBlocks(b *testing.B) {
// BenchmarkExtractV1File extracts inner CARv1 payload from a sample CARv2 file using ExtractV1File. // BenchmarkExtractV1File extracts inner CARv1 payload from a sample CARv2 file using ExtractV1File.
func BenchmarkExtractV1File(b *testing.B) { func BenchmarkExtractV1File(b *testing.B) {
path := filepath.Join(b.TempDir(), "bench-large-v2.car") path := filepath.Join(b.TempDir(), "bench-large-v2.car")
generateRandomCarV2File(b, path, 10*1024*1024) // 10 MiB generateRandomCarV2File(b, path, 10<<20) // 10 MiB
defer os.Remove(path) defer os.Remove(path)
info, err := os.Stat(path) info, err := os.Stat(path)
...@@ -87,7 +85,7 @@ func BenchmarkExtractV1File(b *testing.B) { ...@@ -87,7 +85,7 @@ func BenchmarkExtractV1File(b *testing.B) {
// BenchmarkExtractV1File. // BenchmarkExtractV1File.
func BenchmarkExtractV1UsingReader(b *testing.B) { func BenchmarkExtractV1UsingReader(b *testing.B) {
path := filepath.Join(b.TempDir(), "bench-large-v2.car") path := filepath.Join(b.TempDir(), "bench-large-v2.car")
generateRandomCarV2File(b, path, 10*1024*1024) // 10 MiB generateRandomCarV2File(b, path, 10<<20) // 10 MiB
defer os.Remove(path) defer os.Remove(path)
info, err := os.Stat(path) info, err := os.Stat(path)
...@@ -121,6 +119,8 @@ func BenchmarkExtractV1UsingReader(b *testing.B) { ...@@ -121,6 +119,8 @@ func BenchmarkExtractV1UsingReader(b *testing.B) {
} }
func generateRandomCarV2File(b *testing.B, path string, minTotalBlockSize int) { func generateRandomCarV2File(b *testing.B, path string, minTotalBlockSize int) {
// Use fixed RNG for determinism across benchmarks.
rng := rand.New(rand.NewSource(1413))
bs, err := blockstore.OpenReadWrite(path, []cid.Cid{}) bs, err := blockstore.OpenReadWrite(path, []cid.Cid{})
defer func() { defer func() {
if err := bs.Finalize(); err != nil { if err := bs.Finalize(); err != nil {
...@@ -130,7 +130,7 @@ func generateRandomCarV2File(b *testing.B, path string, minTotalBlockSize int) { ...@@ -130,7 +130,7 @@ func generateRandomCarV2File(b *testing.B, path string, minTotalBlockSize int) {
if err != nil { if err != nil {
b.Fatal(err) b.Fatal(err)
} }
buf := make([]byte, 1024) buf := make([]byte, 32<<10) // 32 KiB
var totalBlockSize int var totalBlockSize int
for totalBlockSize < minTotalBlockSize { for totalBlockSize < minTotalBlockSize {
size, err := rng.Read(buf) size, err := rng.Read(buf)
......
...@@ -92,6 +92,12 @@ func WrapV1(src io.ReadSeeker, dst io.Writer) error { ...@@ -92,6 +92,12 @@ func WrapV1(src io.ReadSeeker, dst io.Writer) error {
// Note that the destination path might still be created even if an error // Note that the destination path might still be created even if an error
// occurred. // occurred.
// If srcPath and dstPath are the same, then the dstPath is converted, in-place, to CARv1. // If srcPath and dstPath are the same, then the dstPath is converted, in-place, to CARv1.
//
// This function aims to extract the CARv1 payload as efficiently as possible.
// The method is best-effort and depends on your operating system;
// for example, it should use copy_file_range on recent Linux versions.
// This API should be preferred over copying directly via Reader.DataReader,
// as it should allow for better performance while always being at least as efficient.
func ExtractV1File(srcPath, dstPath string) (err error) { func ExtractV1File(srcPath, dstPath string) (err error) {
src, err := os.Open(srcPath) src, err := os.Open(srcPath)
if err != nil { if err != nil {
...@@ -110,7 +116,7 @@ func ExtractV1File(srcPath, dstPath string) (err error) { ...@@ -110,7 +116,7 @@ func ExtractV1File(srcPath, dstPath string) (err error) {
return ErrAlreadyV1 return ErrAlreadyV1
} }
if version != 2 { if version != 2 {
return fmt.Errorf("invalid source version: %v", version) return fmt.Errorf("source version must be 2; got: %d", version)
} }
// Read CARv2 header to locate data payload. // Read CARv2 header to locate data payload.
...@@ -123,11 +129,11 @@ func ExtractV1File(srcPath, dstPath string) (err error) { ...@@ -123,11 +129,11 @@ func ExtractV1File(srcPath, dstPath string) (err error) {
// Validate header // Validate header
dataOffset := int64(v2h.DataOffset) dataOffset := int64(v2h.DataOffset)
if dataOffset < PragmaSize+HeaderSize { if dataOffset < PragmaSize+HeaderSize {
return fmt.Errorf("invalid data payload offset: %v", dataOffset) return fmt.Errorf("invalid data payload offset: %d", dataOffset)
} }
dataSize := int64(v2h.DataSize) dataSize := int64(v2h.DataSize)
if dataSize <= 0 { if dataSize <= 0 {
return fmt.Errorf("invalid data payload size: %v", dataSize) return fmt.Errorf("invalid data payload size: %d", dataSize)
} }
// Seek to the point where the data payload starts // Seek to the point where the data payload starts
...@@ -163,7 +169,7 @@ func ExtractV1File(srcPath, dstPath string) (err error) { ...@@ -163,7 +169,7 @@ func ExtractV1File(srcPath, dstPath string) (err error) {
return err return err
} }
if written != dataSize { if written != dataSize {
return fmt.Errorf("expected to write exactly %v but wrote %v", dataSize, written) return fmt.Errorf("expected to write exactly %d but wrote %d", dataSize, written)
} }
// Check that the size destination file matches expected size. // Check that the size destination file matches expected size.
......
...@@ -93,7 +93,7 @@ func TestExtractV1(t *testing.T) { ...@@ -93,7 +93,7 @@ func TestExtractV1(t *testing.T) {
func TestExtractV1WithUnknownVersionIsError(t *testing.T) { func TestExtractV1WithUnknownVersionIsError(t *testing.T) {
dstPath := filepath.Join(t.TempDir(), "extract-dst-file-test-v42.car") dstPath := filepath.Join(t.TempDir(), "extract-dst-file-test-v42.car")
err := ExtractV1File("testdata/sample-rootless-v42.car", dstPath) err := ExtractV1File("testdata/sample-rootless-v42.car", dstPath)
require.EqualError(t, err, "invalid source version: 42") require.EqualError(t, err, "source version must be 2; got: 42")
} }
func TestExtractV1FromACarV1IsError(t *testing.T) { func TestExtractV1FromACarV1IsError(t *testing.T) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment