diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..4b86d7197f9a6f1a988c503defc8451392cd5e55 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,8 @@ +blank_issues_enabled: false +contact_links: + - name: Getting Help on IPFS + url: https://ipfs.io/help + about: All information about how and where to get help on IPFS. + - name: IPFS Official Forum + url: https://discuss.ipfs.io + about: Please post general questions, support requests, and discussions here. diff --git a/.github/ISSUE_TEMPLATE/open_an_issue.md b/.github/ISSUE_TEMPLATE/open_an_issue.md new file mode 100644 index 0000000000000000000000000000000000000000..4fcbd00aca0d513c2729d8df4afb5a01fdbe7d02 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/open_an_issue.md @@ -0,0 +1,19 @@ +--- +name: Open an issue +about: Only for actionable issues relevant to this repository. +title: '' +labels: need/triage +assignees: '' + +--- + diff --git a/.github/config.yml b/.github/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..ed26646a0f7cdda6cf10ede2b0b98cac89cf67b0 --- /dev/null +++ b/.github/config.yml @@ -0,0 +1,68 @@ +# Configuration for welcome - https://github.com/behaviorbot/welcome + +# Configuration for new-issue-welcome - https://github.com/behaviorbot/new-issue-welcome +# Comment to be posted to on first time issues +newIssueWelcomeComment: > + Thank you for submitting your first issue to this repository! A maintainer + will be here shortly to triage and review. + + In the meantime, please double-check that you have provided all the + necessary information to make this process easy! Any information that can + help save additional round trips is useful! We currently aim to give + initial feedback within **two business days**. If this does not happen, feel + free to leave a comment. + + Please keep an eye on how this issue will be labeled, as labels give an + overview of priorities, assignments and additional actions requested by the + maintainers: + + - "Priority" labels will show how urgent this is for the team. + - "Status" labels will show if this is ready to be worked on, blocked, or in progress. + - "Need" labels will indicate if additional input or analysis is required. + + Finally, remember to use https://discuss.ipfs.io if you just need general + support. + +# Configuration for new-pr-welcome - https://github.com/behaviorbot/new-pr-welcome +# Comment to be posted to on PRs from first time contributors in your repository +newPRWelcomeComment: > + Thank you for submitting this PR! + + A maintainer will be here shortly to review it. + + We are super grateful, but we are also overloaded! Help us by making sure + that: + + * The context for this PR is clear, with relevant discussion, decisions + and stakeholders linked/mentioned. + + * Your contribution itself is clear (code comments, self-review for the + rest) and in its best form. Follow the [code contribution + guidelines](https://github.com/ipfs/community/blob/master/CONTRIBUTING.md#code-contribution-guidelines) + if they apply. + + Getting other community members to do a review would be great help too on + complex PRs (you can ask in the chats/forums). If you are unsure about + something, just leave us a comment. + + Next steps: + + * A maintainer will triage and assign priority to this PR, commenting on + any missing things and potentially assigning a reviewer for high + priority items. + + * The PR gets reviews, discussed and approvals as needed. + + * The PR is merged by maintainers when it has been approved and comments addressed. + + We currently aim to provide initial feedback/triaging within **two business + days**. Please keep an eye on any labelling actions, as these will indicate + priorities and status of your contribution. + + We are very grateful for your contribution! + + +# Configuration for first-pr-merge - https://github.com/behaviorbot/first-pr-merge +# Comment to be posted to on pull requests merged by a first time user +# Currently disabled +#firstPRMergeComment: "" diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml new file mode 100644 index 0000000000000000000000000000000000000000..840618d88f2fdc33972facbed16a84929643bf6f --- /dev/null +++ b/.github/workflows/automerge.yml @@ -0,0 +1,27 @@ +# File managed by web3-bot. DO NOT EDIT. +# See https://github.com/protocol/.github/ for details. + +# Automatically merge pull requests opened by web3-bot, as soon as (and only if) all tests pass. +# This reduces the friction associated with updating with our workflows. + +on: [ pull_request ] + +jobs: + automerge: + if: github.event.pull_request.user.login == 'web3-bot' + runs-on: ubuntu-latest + steps: + - name: Wait on tests + uses: lewagon/wait-on-check-action@bafe56a6863672c681c3cf671f5e10b20abf2eaa # v0.2 + with: + ref: ${{ github.event.pull_request.head.sha }} + repo-token: ${{ secrets.GITHUB_TOKEN }} + wait-interval: 10 + running-workflow-name: 'automerge' # the name of this job + - name: Merge PR + uses: pascalgn/automerge-action@741c311a47881be9625932b0a0de1b0937aab1ae # v0.13.1 + env: + GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + MERGE_LABELS: "" + MERGE_METHOD: "squash" + MERGE_DELETE_BRANCH: true diff --git a/.github/workflows/go-check.yml b/.github/workflows/go-check.yml new file mode 100644 index 0000000000000000000000000000000000000000..862d49fc9f3443177b1d3e78e158f24340a3e980 --- /dev/null +++ b/.github/workflows/go-check.yml @@ -0,0 +1,41 @@ +# File managed by web3-bot. DO NOT EDIT. +# See https://github.com/protocol/.github/ for details. + +on: [push, pull_request] + +jobs: + unit: + runs-on: ubuntu-latest + name: Go checks + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-go@v2 + with: + go-version: "1.16.x" + - name: Install staticcheck + run: go install honnef.co/go/tools/cmd/staticcheck@be534f007836a777104a15f2456cd1fffd3ddee8 # v2020.2.2 + - name: Check that go.mod is tidy + run: | + go mod tidy + if [[ -n $(git ls-files --other --exclude-standard --directory -- go.sum) ]]; then + echo "go.sum was added by go mod tidy" + exit 1 + fi + git diff --exit-code -- go.sum go.mod + - name: gofmt + if: ${{ success() || failure() }} # run this step even if the previous one failed + run: | + out=$(gofmt -s -l .) + if [[ -n "$out" ]]; then + echo $out | awk '{print "::error file=" $0 ",line=0,col=0::File is not gofmt-ed."}' + exit 1 + fi + - name: go vet + if: ${{ success() || failure() }} # run this step even if the previous one failed + run: go vet ./... + - name: staticcheck + if: ${{ success() || failure() }} # run this step even if the previous one failed + run: | + set -o pipefail + staticcheck ./... | sed -e 's@\(.*\)\.go@./\1.go@g' + diff --git a/.github/workflows/go-test.yml b/.github/workflows/go-test.yml new file mode 100644 index 0000000000000000000000000000000000000000..9b384208624b2615489e028c4cec15efd1557ee3 --- /dev/null +++ b/.github/workflows/go-test.yml @@ -0,0 +1,38 @@ +# File managed by web3-bot. DO NOT EDIT. +# See https://github.com/protocol/.github/ for details. + +on: [push, pull_request] + +jobs: + unit: + strategy: + fail-fast: false + matrix: + os: [ "ubuntu", "windows", "macos" ] + go: [ "1.15.x", "1.16.x" ] + runs-on: ${{ matrix.os }}-latest + name: Unit tests (${{ matrix.os}}, Go ${{ matrix.go }}) + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-go@v2 + with: + go-version: ${{ matrix.go }} + - name: Go information + run: | + go version + go env + - name: Run tests + run: go test -v -coverprofile coverage.txt ./... + - name: Run tests (32 bit) + if: ${{ matrix.os != 'macos' }} # can't run 32 bit tests on OSX. + env: + GOARCH: 386 + run: go test -v ./... + - name: Run tests with race detector + if: ${{ matrix.os == 'ubuntu' }} # speed things up. Windows and OSX VMs are slow + run: go test -v -race ./... + - name: Upload coverage to Codecov + uses: codecov/codecov-action@967e2b38a85a62bd61be5529ada27ebc109948c2 # v1.4.1 + with: + file: coverage.txt + env_vars: OS=${{ matrix.os }}, GO=${{ matrix.go }} diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e4224df5b7a109818bddc7862137f495f2c2b82e --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 IPFS + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 2bb06ebc56a7c6d4fd41a3d0c682330b5025c7a8..7b1b5b2389ef60543549e62f77d421d01a118219 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,50 @@ -# go-dms3-chunker +# go-ipfs-chunker +[![](https://img.shields.io/badge/made%20by-Protocol%20Labs-blue.svg?style=flat-square)](http://ipn.io) +[![](https://img.shields.io/badge/project-IPFS-blue.svg?style=flat-square)](http://ipfs.io/) +[![standard-readme compliant](https://img.shields.io/badge/standard--readme-OK-green.svg?style=flat-square)](https://github.com/RichardLitt/standard-readme) +[![GoDoc](https://godoc.org/github.com/ipfs/go-ipfs-chunker?status.svg)](https://godoc.org/github.com/ipfs/go-ipfs-chunker) +[![Build Status](https://travis-ci.org/ipfs/go-ipfs-chunker.svg?branch=master)](https://travis-ci.org/ipfs/go-ipfs-chunker) + +> go-ipfs-chunker implements data Splitters for go-ipfs. + +`go-ipfs-chunker` provides the `Splitter` interface. IPFS splitters read data from a reader an create "chunks". These chunks are used to build the ipfs DAGs (Merkle Tree) and are the base unit to obtain the sums that ipfs uses to address content. + +The package provides a `SizeSplitter` which creates chunks of equal size and it is used by default in most cases, and a `rabin` fingerprint chunker. This chunker will attempt to split data in a way that the resulting blocks are the same when the data has repetitive patterns, thus optimizing the resulting DAGs. + +## Lead Maintainer + +[Steven Allen](https://github.com/Stebalien) + +## Table of Contents + +- [Install](#install) +- [Usage](#usage) +- [Contribute](#contribute) +- [License](#license) + +## Install + +`go-ipfs-chunker` works like a regular Go module: + +``` +> go get github.com/ipfs/go-ipfs-chunker +``` + +## Usage + +``` +import "github.com/ipfs/go-ipfs-chunker" +``` + +Check the [GoDoc documentation](https://godoc.org/github.com/ipfs/go-ipfs-chunker) + +## Contribute + +PRs accepted. + +Small note: If editing the README, please conform to the [standard-readme](https://github.com/RichardLitt/standard-readme) specification. + +## License + +MIT © Protocol Labs, Inc. diff --git a/benchmark_test.go b/benchmark_test.go new file mode 100644 index 0000000000000000000000000000000000000000..5069b06536e15084cb7ae68d2d52792719ed4b30 --- /dev/null +++ b/benchmark_test.go @@ -0,0 +1,59 @@ +package chunk + +import ( + "bytes" + "io" + "math/rand" + "testing" +) + +type newSplitter func(io.Reader) Splitter + +type bencSpec struct { + size int + name string +} + +var bSizes = []bencSpec{ + {1 << 10, "1K"}, + {1 << 20, "1M"}, + {16 << 20, "16M"}, + {100 << 20, "100M"}, +} + +func benchmarkChunker(b *testing.B, ns newSplitter) { + for _, s := range bSizes { + s := s + b.Run(s.name, func(b *testing.B) { + benchmarkChunkerSize(b, ns, s.size) + }) + } +} + +func benchmarkChunkerSize(b *testing.B, ns newSplitter, size int) { + rng := rand.New(rand.NewSource(1)) + data := make([]byte, size) + rng.Read(data) + + b.SetBytes(int64(size)) + b.ReportAllocs() + b.ResetTimer() + + var res uint64 + + for i := 0; i < b.N; i++ { + r := ns(bytes.NewReader(data)) + + for { + chunk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + b.Fatal(err) + } + res = res + uint64(len(chunk)) + } + } + Res = Res + res +} diff --git a/buzhash.go b/buzhash.go new file mode 100644 index 0000000000000000000000000000000000000000..83ab019dd48c257e9eb2199a56514646209f838e --- /dev/null +++ b/buzhash.go @@ -0,0 +1,151 @@ +package chunk + +import ( + "io" + "math/bits" + + pool "github.com/libp2p/go-buffer-pool" +) + +const ( + buzMin = 128 << 10 + buzMax = 512 << 10 + buzMask = 1<<17 - 1 +) + +type Buzhash struct { + r io.Reader + buf []byte + n int + + err error +} + +func NewBuzhash(r io.Reader) *Buzhash { + return &Buzhash{ + r: r, + buf: pool.Get(buzMax), + } +} + +func (b *Buzhash) Reader() io.Reader { + return b.r +} + +func (b *Buzhash) NextBytes() ([]byte, error) { + if b.err != nil { + return nil, b.err + } + + n, err := io.ReadFull(b.r, b.buf[b.n:]) + if err != nil { + if err == io.ErrUnexpectedEOF || err == io.EOF { + buffered := b.n + n + if buffered < buzMin { + b.err = io.EOF + // Read nothing? Don't return an empty block. + if buffered == 0 { + pool.Put(b.buf) + b.buf = nil + return nil, b.err + } + res := make([]byte, buffered) + copy(res, b.buf) + + pool.Put(b.buf) + b.buf = nil + return res, nil + } + } else { + b.err = err + pool.Put(b.buf) + b.buf = nil + return nil, err + } + } + + i := buzMin - 32 + + var state uint32 = 0 + + if buzMin > len(b.buf) { + panic("this is impossible") + } + + for ; i < buzMin; i++ { + state = bits.RotateLeft32(state, 1) + state = state ^ bytehash[b.buf[i]] + } + + { + max := b.n + n - 32 - 1 + + buf := b.buf + bufshf := b.buf[32:] + i = buzMin - 32 + _ = buf[max] + _ = bufshf[max] + + for ; i <= max; i++ { + if state&buzMask == 0 { + break + } + state = bits.RotateLeft32(state, 1) ^ + bytehash[buf[i]] ^ + bytehash[bufshf[i]] + } + i += 32 + } + + res := make([]byte, i) + copy(res, b.buf) + + b.n = copy(b.buf, b.buf[i:b.n+n]) + + return res, nil +} + +var bytehash = [256]uint32{ + 0x6236e7d5, 0x10279b0b, 0x72818182, 0xdc526514, 0x2fd41e3d, 0x777ef8c8, + 0x83ee5285, 0x2c8f3637, 0x2f049c1a, 0x57df9791, 0x9207151f, 0x9b544818, + 0x74eef658, 0x2028ca60, 0x0271d91a, 0x27ae587e, 0xecf9fa5f, 0x236e71cd, + 0xf43a8a2e, 0xbb13380, 0x9e57912c, 0x89a26cdb, 0x9fcf3d71, 0xa86da6f1, + 0x9c49f376, 0x346aecc7, 0xf094a9ee, 0xea99e9cb, 0xb01713c6, 0x88acffb, + 0x2960a0fb, 0x344a626c, 0x7ff22a46, 0x6d7a1aa5, 0x6a714916, 0x41d454ca, + 0x8325b830, 0xb65f563, 0x447fecca, 0xf9d0ea5e, 0xc1d9d3d4, 0xcb5ec574, + 0x55aae902, 0x86edc0e7, 0xd3a9e33, 0xe70dc1e1, 0xe3c5f639, 0x9b43140a, + 0xc6490ac5, 0x5e4030fb, 0x8e976dd5, 0xa87468ea, 0xf830ef6f, 0xcc1ed5a5, + 0x611f4e78, 0xddd11905, 0xf2613904, 0x566c67b9, 0x905a5ccc, 0x7b37b3a4, + 0x4b53898a, 0x6b8fd29d, 0xaad81575, 0x511be414, 0x3cfac1e7, 0x8029a179, + 0xd40efeda, 0x7380e02, 0xdc9beffd, 0x2d049082, 0x99bc7831, 0xff5002a8, + 0x21ce7646, 0x1cd049b, 0xf43994f, 0xc3c6c5a5, 0xbbda5f50, 0xec15ec7, + 0x9adb19b6, 0xc1e80b9, 0xb9b52968, 0xae162419, 0x2542b405, 0x91a42e9d, + 0x6be0f668, 0x6ed7a6b9, 0xbc2777b4, 0xe162ce56, 0x4266aad5, 0x60fdb704, + 0x66f832a5, 0x9595f6ca, 0xfee83ced, 0x55228d99, 0x12bf0e28, 0x66896459, + 0x789afda, 0x282baa8, 0x2367a343, 0x591491b0, 0x2ff1a4b1, 0x410739b6, + 0x9b7055a0, 0x2e0eb229, 0x24fc8252, 0x3327d3df, 0xb0782669, 0x1c62e069, + 0x7f503101, 0xf50593ae, 0xd9eb275d, 0xe00eb678, 0x5917ccde, 0x97b9660a, + 0xdd06202d, 0xed229e22, 0xa9c735bf, 0xd6316fe6, 0x6fc72e4c, 0x206dfa2, + 0xd6b15c5a, 0x69d87b49, 0x9c97745, 0x13445d61, 0x35a975aa, 0x859aa9b9, + 0x65380013, 0xd1fb6391, 0xc29255fd, 0x784a3b91, 0xb9e74c26, 0x63ce4d40, + 0xc07cbe9e, 0xe6e4529e, 0xfb3632f, 0x9438d9c9, 0x682f94a8, 0xf8fd4611, + 0x257ec1ed, 0x475ce3d6, 0x60ee2db1, 0x2afab002, 0x2b9e4878, 0x86b340de, + 0x1482fdca, 0xfe41b3bf, 0xd4a412b0, 0xe09db98c, 0xc1af5d53, 0x7e55e25f, + 0xd3346b38, 0xb7a12cbd, 0x9c6827ba, 0x71f78bee, 0x8c3a0f52, 0x150491b0, + 0xf26de912, 0x233e3a4e, 0xd309ebba, 0xa0a9e0ff, 0xca2b5921, 0xeeb9893c, + 0x33829e88, 0x9870cc2a, 0x23c4b9d0, 0xeba32ea3, 0xbdac4d22, 0x3bc8c44c, + 0x1e8d0397, 0xf9327735, 0x783b009f, 0xeb83742, 0x2621dc71, 0xed017d03, + 0x5c760aa1, 0x5a69814b, 0x96e3047f, 0xa93c9cde, 0x615c86f5, 0xb4322aa5, + 0x4225534d, 0xd2e2de3, 0xccfccc4b, 0xbac2a57, 0xf0a06d04, 0xbc78d737, + 0xf2d1f766, 0xf5a7953c, 0xbcdfda85, 0x5213b7d5, 0xbce8a328, 0xd38f5f18, + 0xdb094244, 0xfe571253, 0x317fa7ee, 0x4a324f43, 0x3ffc39d9, 0x51b3fa8e, + 0x7a4bee9f, 0x78bbc682, 0x9f5c0350, 0x2fe286c, 0x245ab686, 0xed6bf7d7, + 0xac4988a, 0x3fe010fa, 0xc65fe369, 0xa45749cb, 0x2b84e537, 0xde9ff363, + 0x20540f9a, 0xaa8c9b34, 0x5bc476b3, 0x1d574bd7, 0x929100ad, 0x4721de4d, + 0x27df1b05, 0x58b18546, 0xb7e76764, 0xdf904e58, 0x97af57a1, 0xbd4dc433, + 0xa6256dfd, 0xf63998f3, 0xf1e05833, 0xe20acf26, 0xf57fd9d6, 0x90300b4d, + 0x89df4290, 0x68d01cbc, 0xcf893ee3, 0xcc42a046, 0x778e181b, 0x67265c76, + 0xe981a4c4, 0x82991da1, 0x708f7294, 0xe6e2ae62, 0xfc441870, 0x95e1b0b6, + 0x445f825, 0x5a93b47f, 0x5e9cf4be, 0x84da71e7, 0x9d9582b0, 0x9bf835ef, + 0x591f61e2, 0x43325985, 0x5d2de32e, 0x8d8fbf0f, 0x95b30f38, 0x7ad5b6e, + 0x4e934edf, 0x3cd4990e, 0x9053e259, 0x5c41857d} diff --git a/buzhash_norace_test.go b/buzhash_norace_test.go new file mode 100644 index 0000000000000000000000000000000000000000..2565a4c5338fe3303e529df23871ee652e76bca2 --- /dev/null +++ b/buzhash_norace_test.go @@ -0,0 +1,14 @@ +//+build !race + +package chunk + +import ( + "testing" +) + +func TestFuzzBuzhashChunking(t *testing.T) { + buf := make([]byte, 1024*1024*16) + for i := 0; i < 100; i++ { + testBuzhashChunking(t, buf) + } +} diff --git a/buzhash_test.go b/buzhash_test.go new file mode 100644 index 0000000000000000000000000000000000000000..05ad7c380fec042c2a10a19a18293c868e8dcf73 --- /dev/null +++ b/buzhash_test.go @@ -0,0 +1,91 @@ +package chunk + +import ( + "bytes" + "io" + "testing" + + util "github.com/ipfs/go-ipfs-util" +) + +func testBuzhashChunking(t *testing.T, buf []byte) (chunkCount int) { + n, err := util.NewTimeSeededRand().Read(buf) + if n < len(buf) { + t.Fatalf("expected %d bytes, got %d", len(buf), n) + } + if err != nil { + t.Fatal(err) + } + + r := NewBuzhash(bytes.NewReader(buf)) + + var chunks [][]byte + + for { + chunk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + t.Fatal(err) + } + + chunks = append(chunks, chunk) + } + chunkCount += len(chunks) + + for i, chunk := range chunks { + if len(chunk) == 0 { + t.Fatalf("chunk %d/%d is empty", i+1, len(chunks)) + } + } + + for i, chunk := range chunks[:len(chunks)-1] { + if len(chunk) < buzMin { + t.Fatalf("chunk %d/%d is less than the minimum size", i+1, len(chunks)) + } + } + + unchunked := bytes.Join(chunks, nil) + if !bytes.Equal(unchunked, buf) { + t.Fatal("data was chunked incorrectly") + } + + return chunkCount +} + +func TestBuzhashChunking(t *testing.T) { + buf := make([]byte, 1024*1024*16) + count := testBuzhashChunking(t, buf) + t.Logf("average block size: %d\n", len(buf)/count) +} + +func TestBuzhashChunkReuse(t *testing.T) { + newBuzhash := func(r io.Reader) Splitter { + return NewBuzhash(r) + } + testReuse(t, newBuzhash) +} + +func BenchmarkBuzhash2(b *testing.B) { + benchmarkChunker(b, func(r io.Reader) Splitter { + return NewBuzhash(r) + }) +} + +func TestBuzhashBitsHashBias(t *testing.T) { + counts := make([]byte, 32) + for _, h := range bytehash { + for i := 0; i < 32; i++ { + if h&1 == 1 { + counts[i]++ + } + h = h >> 1 + } + } + for i, c := range counts { + if c != 128 { + t.Errorf("Bit balance in position %d broken, %d ones", i, c) + } + } +} diff --git a/gen/main.go b/gen/main.go new file mode 100644 index 0000000000000000000000000000000000000000..9d908544b8795b5983772430aeb92c67c68aa835 --- /dev/null +++ b/gen/main.go @@ -0,0 +1,33 @@ +// This file generates bytehash LUT +package main + +import ( + "fmt" + "math/rand" +) + +const nRounds = 200 + +func main() { + rnd := rand.New(rand.NewSource(0)) + + lut := make([]uint32, 256) + for i := 0; i < 256/2; i++ { + lut[i] = 1<<32 - 1 + } + + for r := 0; r < nRounds; r++ { + for b := uint32(0); b < 32; b++ { + mask := uint32(1) << b + nmask := ^mask + for i, j := range rnd.Perm(256) { + li := lut[i] + lj := lut[j] + lut[i] = li&nmask | (lj & mask) + lut[j] = lj&nmask | (li & mask) + } + } + } + + fmt.Printf("%#v", lut) +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000000000000000000000000000000000000..bf2f5aeeea5eaa9e4972404be2e77c81b578e5f0 --- /dev/null +++ b/go.mod @@ -0,0 +1,11 @@ +module github.com/ipfs/go-ipfs-chunker + +require ( + github.com/ipfs/go-block-format v0.0.2 + github.com/ipfs/go-ipfs-util v0.0.1 + github.com/ipfs/go-log v0.0.1 + github.com/libp2p/go-buffer-pool v0.0.2 + github.com/whyrusleeping/chunker v0.0.0-20181014151217-fe64bd25879f +) + +go 1.15 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000000000000000000000000000000000000..1e964c59be9573a58e239c9683ee1d57ab62ec51 --- /dev/null +++ b/go.sum @@ -0,0 +1,55 @@ +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE= +github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= +github.com/gxed/hashland/keccakpg v0.0.1 h1:wrk3uMNaMxbXiHibbPO4S0ymqJMm41WiudyFSs7UnsU= +github.com/gxed/hashland/keccakpg v0.0.1/go.mod h1:kRzw3HkwxFU1mpmPP8v1WyQzwdGfmKFJ6tItnhQ67kU= +github.com/gxed/hashland/murmur3 v0.0.1 h1:SheiaIt0sda5K+8FLz952/1iWS9zrnKsEJaOJu4ZbSc= +github.com/gxed/hashland/murmur3 v0.0.1/go.mod h1:KjXop02n4/ckmZSnY2+HKcLud/tcmvhST0bie/0lS48= +github.com/ipfs/go-block-format v0.0.2 h1:qPDvcP19izTjU8rgo6p7gTXZlkMkF5bz5G3fqIsSCPE= +github.com/ipfs/go-block-format v0.0.2/go.mod h1:AWR46JfpcObNfg3ok2JHDUfdiHRgWhJgCQF+KIgOPJY= +github.com/ipfs/go-cid v0.0.1 h1:GBjWPktLnNyX0JiQCNFpUuUSoMw5KMyqrsejHYlILBE= +github.com/ipfs/go-cid v0.0.1/go.mod h1:GHWU/WuQdMPmIosc4Yn1bcCT7dSeX4lBafM7iqUPQvM= +github.com/ipfs/go-ipfs-util v0.0.1 h1:Wz9bL2wB2YBJqggkA4dD7oSmqB4cAnpNbGrlHJulv50= +github.com/ipfs/go-ipfs-util v0.0.1/go.mod h1:spsl5z8KUnrve+73pOhSVZND1SIxPW5RyBCNzQxlJBc= +github.com/ipfs/go-log v0.0.1 h1:9XTUN/rW64BCG1YhPK9Hoy3q8nr4gOmHHBpgFdfw6Lc= +github.com/ipfs/go-log v0.0.1/go.mod h1:kL1d2/hzSpI0thNYjiKfjanbVNU+IIGA/WnNESY9leM= +github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/libp2p/go-buffer-pool v0.0.2 h1:QNK2iAFa8gjAe1SPz6mHSMuCcjs+X1wlHzeOSqcmlfs= +github.com/libp2p/go-buffer-pool v0.0.2/go.mod h1:MvaB6xw5vOrDl8rYZGLFdKAuk/hRoRZd1Vi32+RXyFM= +github.com/mattn/go-colorable v0.1.1 h1:G1f5SKeVxmagw/IyvzvtZE4Gybcc4Tr1tf7I8z0XgOg= +github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= +github.com/mattn/go-isatty v0.0.5 h1:tHXDdz1cpzGaovsTB+TVB8q90WEokoVmfMqoVcrLUgw= +github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1 h1:lYpkrQH5ajf0OXOcUbGjvZxxijuBwbbmlSxLiuofa+g= +github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1/go.mod h1:pD8RvIylQ358TN4wwqatJ8rNavkEINozVn9DtGI3dfQ= +github.com/minio/sha256-simd v0.0.0-20190131020904-2d45a736cd16 h1:5W7KhL8HVF3XCFOweFD3BNESdnO8ewyYTFT2R+/b8FQ= +github.com/minio/sha256-simd v0.0.0-20190131020904-2d45a736cd16/go.mod h1:2FMWW+8GMoPweT6+pI63m9YE3Lmw4J71hV56Chs1E/U= +github.com/mr-tron/base58 v1.1.0 h1:Y51FGVJ91WBqCEabAi5OPUz38eAx8DakuAm5svLcsfQ= +github.com/mr-tron/base58 v1.1.0/go.mod h1:xcD2VGqlgYjBdcBLw+TuYLr8afG+Hj8g2eTVqeSzSU8= +github.com/multiformats/go-base32 v0.0.3 h1:tw5+NhuwaOjJCC5Pp82QuXbrmLzWg7uxlMFp8Nq/kkI= +github.com/multiformats/go-base32 v0.0.3/go.mod h1:pLiuGC8y0QR3Ue4Zug5UzK9LjgbkL8NSQj0zQ5Nz/AA= +github.com/multiformats/go-multibase v0.0.1 h1:PN9/v21eLywrFWdFNsFKaU04kLJzuYzmrJR+ubhT9qA= +github.com/multiformats/go-multibase v0.0.1/go.mod h1:bja2MqRZ3ggyXtZSEDKpl0uO/gviWFaSteVbWT51qgs= +github.com/multiformats/go-multihash v0.0.1 h1:HHwN1K12I+XllBCrqKnhX949Orn4oawPkegHMu2vDqQ= +github.com/multiformats/go-multihash v0.0.1/go.mod h1:w/5tugSrLEbWqlcgJabL3oHFKTwfvkofsjW2Qa1ct4U= +github.com/opentracing/opentracing-go v1.0.2 h1:3jA2P6O1F9UOrWVpwrIo17pu01KWvNWg4X946/Y5Zwg= +github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/whyrusleeping/chunker v0.0.0-20181014151217-fe64bd25879f h1:jQa4QT2UP9WYv2nzyawpKMOCl+Z/jW7djv2/J50lj9E= +github.com/whyrusleeping/chunker v0.0.0-20181014151217-fe64bd25879f/go.mod h1:p9UJB6dDgdPgMJZs7UjUOdulKyRr9fqkS+6JKAInPy8= +github.com/whyrusleeping/go-logging v0.0.0-20170515211332-0457bb6b88fc h1:9lDbC6Rz4bwmou+oE6Dt4Cb2BGMur5eR/GYptkKUVHo= +github.com/whyrusleeping/go-logging v0.0.0-20170515211332-0457bb6b88fc/go.mod h1:bopw91TMyo8J3tvftk8xmU2kPmlrt4nScJQZU2hE5EM= +golang.org/x/crypto v0.0.0-20190211182817-74369b46fc67 h1:ng3VDlRp5/DHpSWl02R4rM9I+8M2rhmsuLwAMmkLQWE= +golang.org/x/crypto v0.0.0-20190211182817-74369b46fc67/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +golang.org/x/net v0.0.0-20190227160552-c95aed5357e7 h1:C2F/nMkR/9sfUTpvR3QrjBuTdvMUC/cFajkphs1YLQo= +golang.org/x/net v0.0.0-20190227160552-c95aed5357e7/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/sys v0.0.0-20190219092855-153ac476189d/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223 h1:DH4skfRX4EBpamg7iV4ZlCpblAHI6s6TDM39bFZumv8= +golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/parse.go b/parse.go new file mode 100644 index 0000000000000000000000000000000000000000..dee830489aa71ec58d32181f0081ad3057553f71 --- /dev/null +++ b/parse.go @@ -0,0 +1,114 @@ +package chunk + +import ( + "errors" + "fmt" + "io" + "strconv" + "strings" +) + +const ( + // DefaultBlockSize is the chunk size that splitters produce (or aim to). + DefaultBlockSize int64 = 1024 * 256 + + // No leaf block should contain more than 1MiB of payload data ( wrapping overhead aside ) + // This effectively mandates the maximum chunk size + // See discussion at https://github.com/ipfs/go-ipfs-chunker/pull/21#discussion_r369124879 for background + ChunkSizeLimit int = 1048576 +) + +var ( + ErrRabinMin = errors.New("rabin min must be greater than 16") + ErrSize = errors.New("chunker size must be greater than 0") + ErrSizeMax = fmt.Errorf("chunker parameters may not exceed the maximum chunk size of %d", ChunkSizeLimit) +) + +// FromString returns a Splitter depending on the given string: +// it supports "default" (""), "size-{size}", "rabin", "rabin-{blocksize}", +// "rabin-{min}-{avg}-{max}" and "buzhash". +func FromString(r io.Reader, chunker string) (Splitter, error) { + switch { + case chunker == "" || chunker == "default": + return DefaultSplitter(r), nil + + case strings.HasPrefix(chunker, "size-"): + sizeStr := strings.Split(chunker, "-")[1] + size, err := strconv.Atoi(sizeStr) + if err != nil { + return nil, err + } else if size <= 0 { + return nil, ErrSize + } else if size > ChunkSizeLimit { + return nil, ErrSizeMax + } + return NewSizeSplitter(r, int64(size)), nil + + case strings.HasPrefix(chunker, "rabin"): + return parseRabinString(r, chunker) + + case chunker == "buzhash": + return NewBuzhash(r), nil + + default: + return nil, fmt.Errorf("unrecognized chunker option: %s", chunker) + } +} + +func parseRabinString(r io.Reader, chunker string) (Splitter, error) { + parts := strings.Split(chunker, "-") + switch len(parts) { + case 1: + return NewRabin(r, uint64(DefaultBlockSize)), nil + case 2: + size, err := strconv.Atoi(parts[1]) + if err != nil { + return nil, err + } else if int(float32(size)*1.5) > ChunkSizeLimit { // FIXME - this will be addressed in a subsequent PR + return nil, ErrSizeMax + } + return NewRabin(r, uint64(size)), nil + case 4: + sub := strings.Split(parts[1], ":") + if len(sub) > 1 && sub[0] != "min" { + return nil, errors.New("first label must be min") + } + min, err := strconv.Atoi(sub[len(sub)-1]) + if err != nil { + return nil, err + } + if min < 16 { + return nil, ErrRabinMin + } + sub = strings.Split(parts[2], ":") + if len(sub) > 1 && sub[0] != "avg" { + log.Error("sub == ", sub) + return nil, errors.New("second label must be avg") + } + avg, err := strconv.Atoi(sub[len(sub)-1]) + if err != nil { + return nil, err + } + + sub = strings.Split(parts[3], ":") + if len(sub) > 1 && sub[0] != "max" { + return nil, errors.New("final label must be max") + } + max, err := strconv.Atoi(sub[len(sub)-1]) + if err != nil { + return nil, err + } + + if min >= avg { + return nil, errors.New("incorrect format: rabin-min must be smaller than rabin-avg") + } else if avg >= max { + return nil, errors.New("incorrect format: rabin-avg must be smaller than rabin-max") + } else if max > ChunkSizeLimit { + return nil, ErrSizeMax + } + + return NewRabinMinMax(r, uint64(min), uint64(avg), uint64(max)), nil + default: + return nil, errors.New("incorrect format (expected 'rabin' 'rabin-[avg]' or 'rabin-[min]-[avg]-[max]'") + } +} diff --git a/parse_test.go b/parse_test.go new file mode 100644 index 0000000000000000000000000000000000000000..237a2b439a6cc74a0a83a3b63da3affa0e9cee87 --- /dev/null +++ b/parse_test.go @@ -0,0 +1,80 @@ +package chunk + +import ( + "bytes" + "fmt" + "testing" +) + +const ( + testTwoThirdsOfChunkLimit = 2 * (float32(ChunkSizeLimit) / float32(3)) +) + +func TestParseRabin(t *testing.T) { + r := bytes.NewReader(randBuf(t, 1000)) + + _, err := FromString(r, "rabin-18-25-32") + if err != nil { + t.Errorf(err.Error()) + } + + _, err = FromString(r, "rabin-15-23-31") + if err != ErrRabinMin { + t.Fatalf("Expected an 'ErrRabinMin' error, got: %#v", err) + } + + _, err = FromString(r, "rabin-20-20-21") + if err == nil || err.Error() != "incorrect format: rabin-min must be smaller than rabin-avg" { + t.Fatalf("Expected an arg-out-of-order error, got: %#v", err) + } + + _, err = FromString(r, "rabin-19-21-21") + if err == nil || err.Error() != "incorrect format: rabin-avg must be smaller than rabin-max" { + t.Fatalf("Expected an arg-out-of-order error, got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("rabin-19-21-%d", ChunkSizeLimit)) + if err != nil { + t.Fatalf("Expected success, got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("rabin-19-21-%d", 1+ChunkSizeLimit)) + if err != ErrSizeMax { + t.Fatalf("Expected 'ErrSizeMax', got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("rabin-%.0f", testTwoThirdsOfChunkLimit)) + if err != nil { + t.Fatalf("Expected success, got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("rabin-%.0f", 1+testTwoThirdsOfChunkLimit)) + if err != ErrSizeMax { + t.Fatalf("Expected 'ErrSizeMax', got: %#v", err) + } + +} + +func TestParseSize(t *testing.T) { + r := bytes.NewReader(randBuf(t, 1000)) + + _, err := FromString(r, "size-0") + if err != ErrSize { + t.Fatalf("Expected an 'ErrSize' error, got: %#v", err) + } + + _, err = FromString(r, "size-32") + if err != nil { + t.Fatalf("Expected success, got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("size-%d", ChunkSizeLimit)) + if err != nil { + t.Fatalf("Expected success, got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("size-%d", 1+ChunkSizeLimit)) + if err != ErrSizeMax { + t.Fatalf("Expected 'ErrSizeMax', got: %#v", err) + } +} diff --git a/rabin.go b/rabin.go new file mode 100644 index 0000000000000000000000000000000000000000..4247057b2f69dd9c44a0792559e40f138313bc79 --- /dev/null +++ b/rabin.go @@ -0,0 +1,54 @@ +package chunk + +import ( + "hash/fnv" + "io" + + "github.com/whyrusleeping/chunker" +) + +// IpfsRabinPoly is the irreducible polynomial of degree 53 used by for Rabin. +var IpfsRabinPoly = chunker.Pol(17437180132763653) + +// Rabin implements the Splitter interface and splits content with Rabin +// fingerprints. +type Rabin struct { + r *chunker.Chunker + reader io.Reader +} + +// NewRabin creates a new Rabin splitter with the given +// average block size. +func NewRabin(r io.Reader, avgBlkSize uint64) *Rabin { + min := avgBlkSize / 3 + max := avgBlkSize + (avgBlkSize / 2) + + return NewRabinMinMax(r, min, avgBlkSize, max) +} + +// NewRabinMinMax returns a new Rabin splitter which uses +// the given min, average and max block sizes. +func NewRabinMinMax(r io.Reader, min, avg, max uint64) *Rabin { + h := fnv.New32a() + ch := chunker.New(r, IpfsRabinPoly, h, avg, min, max) + + return &Rabin{ + r: ch, + reader: r, + } +} + +// NextBytes reads the next bytes from the reader and returns a slice. +func (r *Rabin) NextBytes() ([]byte, error) { + ch, err := r.r.Next() + if err != nil { + return nil, err + } + + return ch.Data, nil +} + +// Reader returns the io.Reader associated to this Splitter. +func (r *Rabin) Reader() io.Reader { + return r.reader +} diff --git a/rabin_test.go b/rabin_test.go new file mode 100644 index 0000000000000000000000000000000000000000..2e19a82c74b25c5b0f38d329f11ce31f5822bc37 --- /dev/null +++ b/rabin_test.go @@ -0,0 +1,108 @@ +package chunk + +import ( + "bytes" + "fmt" + "io" + "testing" + + blocks "github.com/ipfs/go-block-format" + util "github.com/ipfs/go-ipfs-util" +) + +func TestRabinChunking(t *testing.T) { + data := make([]byte, 1024*1024*16) + n, err := util.NewTimeSeededRand().Read(data) + if n < len(data) { + t.Fatalf("expected %d bytes, got %d", len(data), n) + } + if err != nil { + t.Fatal(err) + } + + r := NewRabin(bytes.NewReader(data), 1024*256) + + var chunks [][]byte + + for { + chunk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + t.Fatal(err) + } + + chunks = append(chunks, chunk) + } + + fmt.Printf("average block size: %d\n", len(data)/len(chunks)) + + unchunked := bytes.Join(chunks, nil) + if !bytes.Equal(unchunked, data) { + fmt.Printf("%d %d\n", len(unchunked), len(data)) + t.Fatal("data was chunked incorrectly") + } +} + +func chunkData(t *testing.T, newC newSplitter, data []byte) map[string]blocks.Block { + r := newC(bytes.NewReader(data)) + + blkmap := make(map[string]blocks.Block) + + for { + blk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + t.Fatal(err) + } + + b := blocks.NewBlock(blk) + blkmap[b.Cid().KeyString()] = b + } + + return blkmap +} + +func testReuse(t *testing.T, cr newSplitter) { + data := make([]byte, 1024*1024*16) + n, err := util.NewTimeSeededRand().Read(data) + if n < len(data) { + t.Fatalf("expected %d bytes, got %d", len(data), n) + } + if err != nil { + t.Fatal(err) + } + + ch1 := chunkData(t, cr, data[1000:]) + ch2 := chunkData(t, cr, data) + + var extra int + for k := range ch2 { + _, ok := ch1[k] + if !ok { + extra++ + } + } + + if extra > 2 { + t.Logf("too many spare chunks made: %d", extra) + } +} + +func TestRabinChunkReuse(t *testing.T) { + newRabin := func(r io.Reader) Splitter { + return NewRabin(r, 256*1024) + } + testReuse(t, newRabin) +} + +var Res uint64 + +func BenchmarkRabin(b *testing.B) { + benchmarkChunker(b, func(r io.Reader) Splitter { + return NewRabin(r, 256<<10) + }) +} diff --git a/splitting.go b/splitting.go new file mode 100644 index 0000000000000000000000000000000000000000..a137820ab11cb610ff010b8b6b50c496c0cf5d60 --- /dev/null +++ b/splitting.go @@ -0,0 +1,102 @@ +// Package chunk implements streaming block splitters. +// Splitters read data from a reader and provide byte slices (chunks) +// The size and contents of these slices depend on the splitting method +// used. +package chunk + +import ( + "io" + + logging "github.com/ipfs/go-log" + pool "github.com/libp2p/go-buffer-pool" +) + +var log = logging.Logger("chunk") + +// A Splitter reads bytes from a Reader and creates "chunks" (byte slices) +// that can be used to build DAG nodes. +type Splitter interface { + Reader() io.Reader + NextBytes() ([]byte, error) +} + +// SplitterGen is a splitter generator, given a reader. +type SplitterGen func(r io.Reader) Splitter + +// DefaultSplitter returns a SizeSplitter with the DefaultBlockSize. +func DefaultSplitter(r io.Reader) Splitter { + return NewSizeSplitter(r, DefaultBlockSize) +} + +// SizeSplitterGen returns a SplitterGen function which will create +// a splitter with the given size when called. +func SizeSplitterGen(size int64) SplitterGen { + return func(r io.Reader) Splitter { + return NewSizeSplitter(r, size) + } +} + +// Chan returns a channel that receives each of the chunks produced +// by a splitter, along with another one for errors. +func Chan(s Splitter) (<-chan []byte, <-chan error) { + out := make(chan []byte) + errs := make(chan error, 1) + go func() { + defer close(out) + defer close(errs) + + // all-chunks loop (keep creating chunks) + for { + b, err := s.NextBytes() + if err != nil { + errs <- err + return + } + + out <- b + } + }() + return out, errs +} + +type sizeSplitterv2 struct { + r io.Reader + size uint32 + err error +} + +// NewSizeSplitter returns a new size-based Splitter with the given block size. +func NewSizeSplitter(r io.Reader, size int64) Splitter { + return &sizeSplitterv2{ + r: r, + size: uint32(size), + } +} + +// NextBytes produces a new chunk. +func (ss *sizeSplitterv2) NextBytes() ([]byte, error) { + if ss.err != nil { + return nil, ss.err + } + + full := pool.Get(int(ss.size)) + n, err := io.ReadFull(ss.r, full) + switch err { + case io.ErrUnexpectedEOF: + ss.err = io.EOF + small := make([]byte, n) + copy(small, full) + pool.Put(full) + return small, nil + case nil: + return full, nil + default: + pool.Put(full) + return nil, err + } +} + +// Reader returns the io.Reader associated to this Splitter. +func (ss *sizeSplitterv2) Reader() io.Reader { + return ss.r +} diff --git a/splitting_test.go b/splitting_test.go new file mode 100644 index 0000000000000000000000000000000000000000..d6498fcbedcf7246531775ba2d5b324fb79d5770 --- /dev/null +++ b/splitting_test.go @@ -0,0 +1,126 @@ +package chunk + +import ( + "bytes" + "io" + "testing" + + u "github.com/ipfs/go-ipfs-util" +) + +func randBuf(t *testing.T, size int) []byte { + buf := make([]byte, size) + if _, err := u.NewTimeSeededRand().Read(buf); err != nil { + t.Fatal("failed to read enough randomness") + } + return buf +} + +func copyBuf(buf []byte) []byte { + cpy := make([]byte, len(buf)) + copy(cpy, buf) + return cpy +} + +func TestSizeSplitterOverAllocate(t *testing.T) { + max := 1000 + r := bytes.NewReader(randBuf(t, max)) + chunksize := int64(1024 * 256) + splitter := NewSizeSplitter(r, chunksize) + chunk, err := splitter.NextBytes() + if err != nil { + t.Fatal(err) + } + if cap(chunk) > len(chunk) { + t.Fatal("chunk capacity too large") + } +} + +func TestSizeSplitterIsDeterministic(t *testing.T) { + if testing.Short() { + t.SkipNow() + } + + test := func() { + bufR := randBuf(t, 10000000) // crank this up to satisfy yourself. + bufA := copyBuf(bufR) + bufB := copyBuf(bufR) + + chunksA, _ := Chan(DefaultSplitter(bytes.NewReader(bufA))) + chunksB, _ := Chan(DefaultSplitter(bytes.NewReader(bufB))) + + for n := 0; ; n++ { + a, moreA := <-chunksA + b, moreB := <-chunksB + + if !moreA { + if moreB { + t.Fatal("A ended, B didnt.") + } + return + } + + if !bytes.Equal(a, b) { + t.Fatalf("chunk %d not equal", n) + } + } + } + + for run := 0; run < 1; run++ { // crank this up to satisfy yourself. + test() + } +} + +func TestSizeSplitterFillsChunks(t *testing.T) { + if testing.Short() { + t.SkipNow() + } + + max := 10000000 + b := randBuf(t, max) + r := &clipReader{r: bytes.NewReader(b), size: 4000} + chunksize := int64(1024 * 256) + c, _ := Chan(NewSizeSplitter(r, chunksize)) + + sofar := 0 + whole := make([]byte, max) + for chunk := range c { + + bc := b[sofar : sofar+len(chunk)] + if !bytes.Equal(bc, chunk) { + t.Fatalf("chunk not correct: (sofar: %d) %d != %d, %v != %v", sofar, len(bc), len(chunk), bc[:100], chunk[:100]) + } + + copy(whole[sofar:], chunk) + + sofar += len(chunk) + if sofar != max && len(chunk) < int(chunksize) { + t.Fatal("sizesplitter split at a smaller size") + } + } + + if !bytes.Equal(b, whole) { + t.Fatal("splitter did not split right") + } +} + +type clipReader struct { + size int + r io.Reader +} + +func (s *clipReader) Read(buf []byte) (int, error) { + + // clip the incoming buffer to produce smaller chunks + if len(buf) > s.size { + buf = buf[:s.size] + } + + return s.r.Read(buf) +} + +func BenchmarkDefault(b *testing.B) { + benchmarkChunker(b, func(r io.Reader) Splitter { + return DefaultSplitter(r) + }) +}