diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..4b86d7197f9a6f1a988c503defc8451392cd5e55 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,8 @@ +blank_issues_enabled: false +contact_links: + - name: Getting Help on IPFS + url: https://ipfs.io/help + about: All information about how and where to get help on IPFS. + - name: IPFS Official Forum + url: https://discuss.ipfs.io + about: Please post general questions, support requests, and discussions here. diff --git a/.github/ISSUE_TEMPLATE/open_an_issue.md b/.github/ISSUE_TEMPLATE/open_an_issue.md new file mode 100644 index 0000000000000000000000000000000000000000..4fcbd00aca0d513c2729d8df4afb5a01fdbe7d02 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/open_an_issue.md @@ -0,0 +1,19 @@ +--- +name: Open an issue +about: Only for actionable issues relevant to this repository. +title: '' +labels: need/triage +assignees: '' + +--- + diff --git a/.github/config.yml b/.github/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..ed26646a0f7cdda6cf10ede2b0b98cac89cf67b0 --- /dev/null +++ b/.github/config.yml @@ -0,0 +1,68 @@ +# Configuration for welcome - https://github.com/behaviorbot/welcome + +# Configuration for new-issue-welcome - https://github.com/behaviorbot/new-issue-welcome +# Comment to be posted to on first time issues +newIssueWelcomeComment: > + Thank you for submitting your first issue to this repository! A maintainer + will be here shortly to triage and review. + + In the meantime, please double-check that you have provided all the + necessary information to make this process easy! Any information that can + help save additional round trips is useful! We currently aim to give + initial feedback within **two business days**. If this does not happen, feel + free to leave a comment. + + Please keep an eye on how this issue will be labeled, as labels give an + overview of priorities, assignments and additional actions requested by the + maintainers: + + - "Priority" labels will show how urgent this is for the team. + - "Status" labels will show if this is ready to be worked on, blocked, or in progress. + - "Need" labels will indicate if additional input or analysis is required. + + Finally, remember to use https://discuss.ipfs.io if you just need general + support. + +# Configuration for new-pr-welcome - https://github.com/behaviorbot/new-pr-welcome +# Comment to be posted to on PRs from first time contributors in your repository +newPRWelcomeComment: > + Thank you for submitting this PR! + + A maintainer will be here shortly to review it. + + We are super grateful, but we are also overloaded! Help us by making sure + that: + + * The context for this PR is clear, with relevant discussion, decisions + and stakeholders linked/mentioned. + + * Your contribution itself is clear (code comments, self-review for the + rest) and in its best form. Follow the [code contribution + guidelines](https://github.com/ipfs/community/blob/master/CONTRIBUTING.md#code-contribution-guidelines) + if they apply. + + Getting other community members to do a review would be great help too on + complex PRs (you can ask in the chats/forums). If you are unsure about + something, just leave us a comment. + + Next steps: + + * A maintainer will triage and assign priority to this PR, commenting on + any missing things and potentially assigning a reviewer for high + priority items. + + * The PR gets reviews, discussed and approvals as needed. + + * The PR is merged by maintainers when it has been approved and comments addressed. + + We currently aim to provide initial feedback/triaging within **two business + days**. Please keep an eye on any labelling actions, as these will indicate + priorities and status of your contribution. + + We are very grateful for your contribution! + + +# Configuration for first-pr-merge - https://github.com/behaviorbot/first-pr-merge +# Comment to be posted to on pull requests merged by a first time user +# Currently disabled +#firstPRMergeComment: "" diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml new file mode 100644 index 0000000000000000000000000000000000000000..840618d88f2fdc33972facbed16a84929643bf6f --- /dev/null +++ b/.github/workflows/automerge.yml @@ -0,0 +1,27 @@ +# File managed by web3-bot. DO NOT EDIT. +# See https://github.com/protocol/.github/ for details. + +# Automatically merge pull requests opened by web3-bot, as soon as (and only if) all tests pass. +# This reduces the friction associated with updating with our workflows. + +on: [ pull_request ] + +jobs: + automerge: + if: github.event.pull_request.user.login == 'web3-bot' + runs-on: ubuntu-latest + steps: + - name: Wait on tests + uses: lewagon/wait-on-check-action@bafe56a6863672c681c3cf671f5e10b20abf2eaa # v0.2 + with: + ref: ${{ github.event.pull_request.head.sha }} + repo-token: ${{ secrets.GITHUB_TOKEN }} + wait-interval: 10 + running-workflow-name: 'automerge' # the name of this job + - name: Merge PR + uses: pascalgn/automerge-action@741c311a47881be9625932b0a0de1b0937aab1ae # v0.13.1 + env: + GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + MERGE_LABELS: "" + MERGE_METHOD: "squash" + MERGE_DELETE_BRANCH: true diff --git a/.github/workflows/go-check.yml b/.github/workflows/go-check.yml new file mode 100644 index 0000000000000000000000000000000000000000..862d49fc9f3443177b1d3e78e158f24340a3e980 --- /dev/null +++ b/.github/workflows/go-check.yml @@ -0,0 +1,41 @@ +# File managed by web3-bot. DO NOT EDIT. +# See https://github.com/protocol/.github/ for details. + +on: [push, pull_request] + +jobs: + unit: + runs-on: ubuntu-latest + name: Go checks + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-go@v2 + with: + go-version: "1.16.x" + - name: Install staticcheck + run: go install honnef.co/go/tools/cmd/staticcheck@be534f007836a777104a15f2456cd1fffd3ddee8 # v2020.2.2 + - name: Check that go.mod is tidy + run: | + go mod tidy + if [[ -n $(git ls-files --other --exclude-standard --directory -- go.sum) ]]; then + echo "go.sum was added by go mod tidy" + exit 1 + fi + git diff --exit-code -- go.sum go.mod + - name: gofmt + if: ${{ success() || failure() }} # run this step even if the previous one failed + run: | + out=$(gofmt -s -l .) + if [[ -n "$out" ]]; then + echo $out | awk '{print "::error file=" $0 ",line=0,col=0::File is not gofmt-ed."}' + exit 1 + fi + - name: go vet + if: ${{ success() || failure() }} # run this step even if the previous one failed + run: go vet ./... + - name: staticcheck + if: ${{ success() || failure() }} # run this step even if the previous one failed + run: | + set -o pipefail + staticcheck ./... | sed -e 's@\(.*\)\.go@./\1.go@g' + diff --git a/.github/workflows/go-test.yml b/.github/workflows/go-test.yml new file mode 100644 index 0000000000000000000000000000000000000000..9b384208624b2615489e028c4cec15efd1557ee3 --- /dev/null +++ b/.github/workflows/go-test.yml @@ -0,0 +1,38 @@ +# File managed by web3-bot. DO NOT EDIT. +# See https://github.com/protocol/.github/ for details. + +on: [push, pull_request] + +jobs: + unit: + strategy: + fail-fast: false + matrix: + os: [ "ubuntu", "windows", "macos" ] + go: [ "1.15.x", "1.16.x" ] + runs-on: ${{ matrix.os }}-latest + name: Unit tests (${{ matrix.os}}, Go ${{ matrix.go }}) + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-go@v2 + with: + go-version: ${{ matrix.go }} + - name: Go information + run: | + go version + go env + - name: Run tests + run: go test -v -coverprofile coverage.txt ./... + - name: Run tests (32 bit) + if: ${{ matrix.os != 'macos' }} # can't run 32 bit tests on OSX. + env: + GOARCH: 386 + run: go test -v ./... + - name: Run tests with race detector + if: ${{ matrix.os == 'ubuntu' }} # speed things up. Windows and OSX VMs are slow + run: go test -v -race ./... + - name: Upload coverage to Codecov + uses: codecov/codecov-action@967e2b38a85a62bd61be5529ada27ebc109948c2 # v1.4.1 + with: + file: coverage.txt + env_vars: OS=${{ matrix.os }}, GO=${{ matrix.go }} diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..dfab88961c90fd41f90c5ab42b03a552ca342c2d --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,35 @@ +stages: + - build + - test + +variables: + BUILD_DIR: "/tmp/$CI_CONCURRENT_PROJECT_ID" + +before_script: + - mkdir -p $BUILD_DIR/src + - cd $BUILD_DIR/src + - if [ -d $CI_PROJECT_DIR ] + - then + - echo "soft link $CI_PROJECT_DIR exists" + - else + - echo "creating soft link $CI_PROJECT_DIR" + - ln -s $CI_PROJECT_DIR + - fi + - cd $CI_PROJECT_DIR + +build: + stage: build + tags: + - testing + script: + - echo $CI_JOB_STAGE + - go build + +test: + stage: test + tags: + - testing + script: + - echo $CI_JOB_STAGE + - go test -cover + coverage: '/coverage: \d+.\d+% of statements/' diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e4224df5b7a109818bddc7862137f495f2c2b82e --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 IPFS + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 2bb06ebc56a7c6d4fd41a3d0c682330b5025c7a8..3b4024b802d28143c7a4f7d8a9208ff2bfda3e13 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,50 @@ # go-dms3-chunker +[![](https://img.shields.io/badge/made%20by-Protocol%20Labs-blue.svg?style=flat-square)](http://ipn.io) +[![](https://img.shields.io/badge/project-IPFS-blue.svg?style=flat-square)](http://ipfs.io/) +[![standard-readme compliant](https://img.shields.io/badge/standard--readme-OK-green.svg?style=flat-square)](https://github.com/RichardLitt/standard-readme) +[![GoDoc](https://godoc.org/github.com/ipfs/go-ipfs-chunker?status.svg)](https://godoc.org/github.com/ipfs/go-ipfs-chunker) +[![Build Status](https://travis-ci.org/ipfs/go-ipfs-chunker.svg?branch=master)](https://travis-ci.org/ipfs/go-ipfs-chunker) + +> go-ipfs-chunker implements data Splitters for go-ipfs. + +`go-ipfs-chunker` provides the `Splitter` interface. IPFS splitters read data from a reader an create "chunks". These chunks are used to build the ipfs DAGs (Merkle Tree) and are the base unit to obtain the sums that ipfs uses to address content. + +The package provides a `SizeSplitter` which creates chunks of equal size and it is used by default in most cases, and a `rabin` fingerprint chunker. This chunker will attempt to split data in a way that the resulting blocks are the same when the data has repetitive patterns, thus optimizing the resulting DAGs. + +## Lead Maintainer + +[Steven Allen](https://github.com/Stebalien) + +## Table of Contents + +- [Install](#install) +- [Usage](#usage) +- [Contribute](#contribute) +- [License](#license) + +## Install + +`go-ipfs-chunker` works like a regular Go module: + +``` +> go get github.com/ipfs/go-ipfs-chunker +``` + +## Usage + +``` +import "github.com/ipfs/go-ipfs-chunker" +``` + +Check the [GoDoc documentation](https://godoc.org/github.com/ipfs/go-ipfs-chunker) + +## Contribute + +PRs accepted. + +Small note: If editing the README, please conform to the [standard-readme](https://github.com/RichardLitt/standard-readme) specification. + +## License + +MIT © Protocol Labs, Inc. diff --git a/benchmark_test.go b/benchmark_test.go new file mode 100644 index 0000000000000000000000000000000000000000..5069b06536e15084cb7ae68d2d52792719ed4b30 --- /dev/null +++ b/benchmark_test.go @@ -0,0 +1,59 @@ +package chunk + +import ( + "bytes" + "io" + "math/rand" + "testing" +) + +type newSplitter func(io.Reader) Splitter + +type bencSpec struct { + size int + name string +} + +var bSizes = []bencSpec{ + {1 << 10, "1K"}, + {1 << 20, "1M"}, + {16 << 20, "16M"}, + {100 << 20, "100M"}, +} + +func benchmarkChunker(b *testing.B, ns newSplitter) { + for _, s := range bSizes { + s := s + b.Run(s.name, func(b *testing.B) { + benchmarkChunkerSize(b, ns, s.size) + }) + } +} + +func benchmarkChunkerSize(b *testing.B, ns newSplitter, size int) { + rng := rand.New(rand.NewSource(1)) + data := make([]byte, size) + rng.Read(data) + + b.SetBytes(int64(size)) + b.ReportAllocs() + b.ResetTimer() + + var res uint64 + + for i := 0; i < b.N; i++ { + r := ns(bytes.NewReader(data)) + + for { + chunk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + b.Fatal(err) + } + res = res + uint64(len(chunk)) + } + } + Res = Res + res +} diff --git a/buzhash.go b/buzhash.go new file mode 100644 index 0000000000000000000000000000000000000000..83ab019dd48c257e9eb2199a56514646209f838e --- /dev/null +++ b/buzhash.go @@ -0,0 +1,151 @@ +package chunk + +import ( + "io" + "math/bits" + + pool "github.com/libp2p/go-buffer-pool" +) + +const ( + buzMin = 128 << 10 + buzMax = 512 << 10 + buzMask = 1<<17 - 1 +) + +type Buzhash struct { + r io.Reader + buf []byte + n int + + err error +} + +func NewBuzhash(r io.Reader) *Buzhash { + return &Buzhash{ + r: r, + buf: pool.Get(buzMax), + } +} + +func (b *Buzhash) Reader() io.Reader { + return b.r +} + +func (b *Buzhash) NextBytes() ([]byte, error) { + if b.err != nil { + return nil, b.err + } + + n, err := io.ReadFull(b.r, b.buf[b.n:]) + if err != nil { + if err == io.ErrUnexpectedEOF || err == io.EOF { + buffered := b.n + n + if buffered < buzMin { + b.err = io.EOF + // Read nothing? Don't return an empty block. + if buffered == 0 { + pool.Put(b.buf) + b.buf = nil + return nil, b.err + } + res := make([]byte, buffered) + copy(res, b.buf) + + pool.Put(b.buf) + b.buf = nil + return res, nil + } + } else { + b.err = err + pool.Put(b.buf) + b.buf = nil + return nil, err + } + } + + i := buzMin - 32 + + var state uint32 = 0 + + if buzMin > len(b.buf) { + panic("this is impossible") + } + + for ; i < buzMin; i++ { + state = bits.RotateLeft32(state, 1) + state = state ^ bytehash[b.buf[i]] + } + + { + max := b.n + n - 32 - 1 + + buf := b.buf + bufshf := b.buf[32:] + i = buzMin - 32 + _ = buf[max] + _ = bufshf[max] + + for ; i <= max; i++ { + if state&buzMask == 0 { + break + } + state = bits.RotateLeft32(state, 1) ^ + bytehash[buf[i]] ^ + bytehash[bufshf[i]] + } + i += 32 + } + + res := make([]byte, i) + copy(res, b.buf) + + b.n = copy(b.buf, b.buf[i:b.n+n]) + + return res, nil +} + +var bytehash = [256]uint32{ + 0x6236e7d5, 0x10279b0b, 0x72818182, 0xdc526514, 0x2fd41e3d, 0x777ef8c8, + 0x83ee5285, 0x2c8f3637, 0x2f049c1a, 0x57df9791, 0x9207151f, 0x9b544818, + 0x74eef658, 0x2028ca60, 0x0271d91a, 0x27ae587e, 0xecf9fa5f, 0x236e71cd, + 0xf43a8a2e, 0xbb13380, 0x9e57912c, 0x89a26cdb, 0x9fcf3d71, 0xa86da6f1, + 0x9c49f376, 0x346aecc7, 0xf094a9ee, 0xea99e9cb, 0xb01713c6, 0x88acffb, + 0x2960a0fb, 0x344a626c, 0x7ff22a46, 0x6d7a1aa5, 0x6a714916, 0x41d454ca, + 0x8325b830, 0xb65f563, 0x447fecca, 0xf9d0ea5e, 0xc1d9d3d4, 0xcb5ec574, + 0x55aae902, 0x86edc0e7, 0xd3a9e33, 0xe70dc1e1, 0xe3c5f639, 0x9b43140a, + 0xc6490ac5, 0x5e4030fb, 0x8e976dd5, 0xa87468ea, 0xf830ef6f, 0xcc1ed5a5, + 0x611f4e78, 0xddd11905, 0xf2613904, 0x566c67b9, 0x905a5ccc, 0x7b37b3a4, + 0x4b53898a, 0x6b8fd29d, 0xaad81575, 0x511be414, 0x3cfac1e7, 0x8029a179, + 0xd40efeda, 0x7380e02, 0xdc9beffd, 0x2d049082, 0x99bc7831, 0xff5002a8, + 0x21ce7646, 0x1cd049b, 0xf43994f, 0xc3c6c5a5, 0xbbda5f50, 0xec15ec7, + 0x9adb19b6, 0xc1e80b9, 0xb9b52968, 0xae162419, 0x2542b405, 0x91a42e9d, + 0x6be0f668, 0x6ed7a6b9, 0xbc2777b4, 0xe162ce56, 0x4266aad5, 0x60fdb704, + 0x66f832a5, 0x9595f6ca, 0xfee83ced, 0x55228d99, 0x12bf0e28, 0x66896459, + 0x789afda, 0x282baa8, 0x2367a343, 0x591491b0, 0x2ff1a4b1, 0x410739b6, + 0x9b7055a0, 0x2e0eb229, 0x24fc8252, 0x3327d3df, 0xb0782669, 0x1c62e069, + 0x7f503101, 0xf50593ae, 0xd9eb275d, 0xe00eb678, 0x5917ccde, 0x97b9660a, + 0xdd06202d, 0xed229e22, 0xa9c735bf, 0xd6316fe6, 0x6fc72e4c, 0x206dfa2, + 0xd6b15c5a, 0x69d87b49, 0x9c97745, 0x13445d61, 0x35a975aa, 0x859aa9b9, + 0x65380013, 0xd1fb6391, 0xc29255fd, 0x784a3b91, 0xb9e74c26, 0x63ce4d40, + 0xc07cbe9e, 0xe6e4529e, 0xfb3632f, 0x9438d9c9, 0x682f94a8, 0xf8fd4611, + 0x257ec1ed, 0x475ce3d6, 0x60ee2db1, 0x2afab002, 0x2b9e4878, 0x86b340de, + 0x1482fdca, 0xfe41b3bf, 0xd4a412b0, 0xe09db98c, 0xc1af5d53, 0x7e55e25f, + 0xd3346b38, 0xb7a12cbd, 0x9c6827ba, 0x71f78bee, 0x8c3a0f52, 0x150491b0, + 0xf26de912, 0x233e3a4e, 0xd309ebba, 0xa0a9e0ff, 0xca2b5921, 0xeeb9893c, + 0x33829e88, 0x9870cc2a, 0x23c4b9d0, 0xeba32ea3, 0xbdac4d22, 0x3bc8c44c, + 0x1e8d0397, 0xf9327735, 0x783b009f, 0xeb83742, 0x2621dc71, 0xed017d03, + 0x5c760aa1, 0x5a69814b, 0x96e3047f, 0xa93c9cde, 0x615c86f5, 0xb4322aa5, + 0x4225534d, 0xd2e2de3, 0xccfccc4b, 0xbac2a57, 0xf0a06d04, 0xbc78d737, + 0xf2d1f766, 0xf5a7953c, 0xbcdfda85, 0x5213b7d5, 0xbce8a328, 0xd38f5f18, + 0xdb094244, 0xfe571253, 0x317fa7ee, 0x4a324f43, 0x3ffc39d9, 0x51b3fa8e, + 0x7a4bee9f, 0x78bbc682, 0x9f5c0350, 0x2fe286c, 0x245ab686, 0xed6bf7d7, + 0xac4988a, 0x3fe010fa, 0xc65fe369, 0xa45749cb, 0x2b84e537, 0xde9ff363, + 0x20540f9a, 0xaa8c9b34, 0x5bc476b3, 0x1d574bd7, 0x929100ad, 0x4721de4d, + 0x27df1b05, 0x58b18546, 0xb7e76764, 0xdf904e58, 0x97af57a1, 0xbd4dc433, + 0xa6256dfd, 0xf63998f3, 0xf1e05833, 0xe20acf26, 0xf57fd9d6, 0x90300b4d, + 0x89df4290, 0x68d01cbc, 0xcf893ee3, 0xcc42a046, 0x778e181b, 0x67265c76, + 0xe981a4c4, 0x82991da1, 0x708f7294, 0xe6e2ae62, 0xfc441870, 0x95e1b0b6, + 0x445f825, 0x5a93b47f, 0x5e9cf4be, 0x84da71e7, 0x9d9582b0, 0x9bf835ef, + 0x591f61e2, 0x43325985, 0x5d2de32e, 0x8d8fbf0f, 0x95b30f38, 0x7ad5b6e, + 0x4e934edf, 0x3cd4990e, 0x9053e259, 0x5c41857d} diff --git a/buzhash_norace_test.go b/buzhash_norace_test.go new file mode 100644 index 0000000000000000000000000000000000000000..2565a4c5338fe3303e529df23871ee652e76bca2 --- /dev/null +++ b/buzhash_norace_test.go @@ -0,0 +1,14 @@ +//+build !race + +package chunk + +import ( + "testing" +) + +func TestFuzzBuzhashChunking(t *testing.T) { + buf := make([]byte, 1024*1024*16) + for i := 0; i < 100; i++ { + testBuzhashChunking(t, buf) + } +} diff --git a/buzhash_test.go b/buzhash_test.go new file mode 100644 index 0000000000000000000000000000000000000000..01a6cda3b5c2a536de7f1fb9d38676c5e9453658 --- /dev/null +++ b/buzhash_test.go @@ -0,0 +1,91 @@ +package chunk + +import ( + "bytes" + "io" + "testing" + + util "gitlab.dms3.io/dms3/public/go-dms3-util" +) + +func testBuzhashChunking(t *testing.T, buf []byte) (chunkCount int) { + n, err := util.NewTimeSeededRand().Read(buf) + if n < len(buf) { + t.Fatalf("expected %d bytes, got %d", len(buf), n) + } + if err != nil { + t.Fatal(err) + } + + r := NewBuzhash(bytes.NewReader(buf)) + + var chunks [][]byte + + for { + chunk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + t.Fatal(err) + } + + chunks = append(chunks, chunk) + } + chunkCount += len(chunks) + + for i, chunk := range chunks { + if len(chunk) == 0 { + t.Fatalf("chunk %d/%d is empty", i+1, len(chunks)) + } + } + + for i, chunk := range chunks[:len(chunks)-1] { + if len(chunk) < buzMin { + t.Fatalf("chunk %d/%d is less than the minimum size", i+1, len(chunks)) + } + } + + unchunked := bytes.Join(chunks, nil) + if !bytes.Equal(unchunked, buf) { + t.Fatal("data was chunked incorrectly") + } + + return chunkCount +} + +func TestBuzhashChunking(t *testing.T) { + buf := make([]byte, 1024*1024*16) + count := testBuzhashChunking(t, buf) + t.Logf("average block size: %d\n", len(buf)/count) +} + +func TestBuzhashChunkReuse(t *testing.T) { + newBuzhash := func(r io.Reader) Splitter { + return NewBuzhash(r) + } + testReuse(t, newBuzhash) +} + +func BenchmarkBuzhash2(b *testing.B) { + benchmarkChunker(b, func(r io.Reader) Splitter { + return NewBuzhash(r) + }) +} + +func TestBuzhashBitsHashBias(t *testing.T) { + counts := make([]byte, 32) + for _, h := range bytehash { + for i := 0; i < 32; i++ { + if h&1 == 1 { + counts[i]++ + } + h = h >> 1 + } + } + for i, c := range counts { + if c != 128 { + t.Errorf("Bit balance in position %d broken, %d ones", i, c) + } + } +} diff --git a/gen/main.go b/gen/main.go new file mode 100644 index 0000000000000000000000000000000000000000..9d908544b8795b5983772430aeb92c67c68aa835 --- /dev/null +++ b/gen/main.go @@ -0,0 +1,33 @@ +// This file generates bytehash LUT +package main + +import ( + "fmt" + "math/rand" +) + +const nRounds = 200 + +func main() { + rnd := rand.New(rand.NewSource(0)) + + lut := make([]uint32, 256) + for i := 0; i < 256/2; i++ { + lut[i] = 1<<32 - 1 + } + + for r := 0; r < nRounds; r++ { + for b := uint32(0); b < 32; b++ { + mask := uint32(1) << b + nmask := ^mask + for i, j := range rnd.Perm(256) { + li := lut[i] + lj := lut[j] + lut[i] = li&nmask | (lj & mask) + lut[j] = lj&nmask | (li & mask) + } + } + } + + fmt.Printf("%#v", lut) +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000000000000000000000000000000000000..43f4a9f01d8cb91db9f8fdc5db2ca94018286fe6 --- /dev/null +++ b/go.mod @@ -0,0 +1,11 @@ +module gitlab.dms3.io/dms3/public/go-dms3-chunker + +require ( + gitlab.dms3.io/dms3/public/go-block-format v0.0.1 + gitlab.dms3.io/dms3/public/go-dms3-util v0.0.1 + gitlab.dms3.io/dms3/public/go-log v0.0.1 + github.com/libp2p/go-buffer-pool v0.0.2 + github.com/whyrusleeping/chunker v0.0.0-20181014151217-fe64bd25879f +) + +go 1.15 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000000000000000000000000000000000000..e76c4ad0db6fbbeefe9e56df70058d998984b856 --- /dev/null +++ b/go.sum @@ -0,0 +1,121 @@ +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE= +github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= +github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/gxed/hashland/keccakpg v0.0.1 h1:wrk3uMNaMxbXiHibbPO4S0ymqJMm41WiudyFSs7UnsU= +github.com/gxed/hashland/keccakpg v0.0.1/go.mod h1:kRzw3HkwxFU1mpmPP8v1WyQzwdGfmKFJ6tItnhQ67kU= +github.com/gxed/hashland/murmur3 v0.0.1 h1:SheiaIt0sda5K+8FLz952/1iWS9zrnKsEJaOJu4ZbSc= +github.com/gxed/hashland/murmur3 v0.0.1/go.mod h1:KjXop02n4/ckmZSnY2+HKcLud/tcmvhST0bie/0lS48= +github.com/ipfs/go-block-format v0.0.2 h1:qPDvcP19izTjU8rgo6p7gTXZlkMkF5bz5G3fqIsSCPE= +github.com/ipfs/go-block-format v0.0.2/go.mod h1:AWR46JfpcObNfg3ok2JHDUfdiHRgWhJgCQF+KIgOPJY= +github.com/ipfs/go-cid v0.0.1 h1:GBjWPktLnNyX0JiQCNFpUuUSoMw5KMyqrsejHYlILBE= +github.com/ipfs/go-cid v0.0.1/go.mod h1:GHWU/WuQdMPmIosc4Yn1bcCT7dSeX4lBafM7iqUPQvM= +github.com/ipfs/go-ipfs-util v0.0.1 h1:Wz9bL2wB2YBJqggkA4dD7oSmqB4cAnpNbGrlHJulv50= +github.com/ipfs/go-ipfs-util v0.0.1/go.mod h1:spsl5z8KUnrve+73pOhSVZND1SIxPW5RyBCNzQxlJBc= +github.com/ipfs/go-log v0.0.1 h1:9XTUN/rW64BCG1YhPK9Hoy3q8nr4gOmHHBpgFdfw6Lc= +github.com/ipfs/go-log v0.0.1/go.mod h1:kL1d2/hzSpI0thNYjiKfjanbVNU+IIGA/WnNESY9leM= +github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/libp2p/go-buffer-pool v0.0.2 h1:QNK2iAFa8gjAe1SPz6mHSMuCcjs+X1wlHzeOSqcmlfs= +github.com/libp2p/go-buffer-pool v0.0.2/go.mod h1:MvaB6xw5vOrDl8rYZGLFdKAuk/hRoRZd1Vi32+RXyFM= +github.com/mattn/go-colorable v0.1.1 h1:G1f5SKeVxmagw/IyvzvtZE4Gybcc4Tr1tf7I8z0XgOg= +github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= +github.com/mattn/go-isatty v0.0.5 h1:tHXDdz1cpzGaovsTB+TVB8q90WEokoVmfMqoVcrLUgw= +github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1 h1:lYpkrQH5ajf0OXOcUbGjvZxxijuBwbbmlSxLiuofa+g= +github.com/minio/blake2b-simd v0.0.0-20160723061019-3f5f724cb5b1/go.mod h1:pD8RvIylQ358TN4wwqatJ8rNavkEINozVn9DtGI3dfQ= +github.com/minio/sha256-simd v0.0.0-20190131020904-2d45a736cd16 h1:5W7KhL8HVF3XCFOweFD3BNESdnO8ewyYTFT2R+/b8FQ= +github.com/minio/sha256-simd v0.0.0-20190131020904-2d45a736cd16/go.mod h1:2FMWW+8GMoPweT6+pI63m9YE3Lmw4J71hV56Chs1E/U= +github.com/minio/sha256-simd v0.1.1-0.20190913151208-6de447530771 h1:MHkK1uRtFbVqvAgvWxafZe54+5uBxLluGylDiKgdhwo= +github.com/minio/sha256-simd v0.1.1-0.20190913151208-6de447530771/go.mod h1:B5e1o+1/KgNmWrSQK08Y6Z1Vb5pwIktudl0J58iy0KM= +github.com/mr-tron/base58 v1.1.0 h1:Y51FGVJ91WBqCEabAi5OPUz38eAx8DakuAm5svLcsfQ= +github.com/mr-tron/base58 v1.1.0/go.mod h1:xcD2VGqlgYjBdcBLw+TuYLr8afG+Hj8g2eTVqeSzSU8= +github.com/mr-tron/base58 v1.1.3 h1:v+sk57XuaCKGXpWtVBX8YJzO7hMGx4Aajh4TQbdEFdc= +github.com/mr-tron/base58 v1.1.3/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= +github.com/multiformats/go-base32 v0.0.3 h1:tw5+NhuwaOjJCC5Pp82QuXbrmLzWg7uxlMFp8Nq/kkI= +github.com/multiformats/go-base32 v0.0.3/go.mod h1:pLiuGC8y0QR3Ue4Zug5UzK9LjgbkL8NSQj0zQ5Nz/AA= +github.com/multiformats/go-base36 v0.1.0 h1:JR6TyF7JjGd3m6FbLU2cOxhC0Li8z8dLNGQ89tUg4F4= +github.com/multiformats/go-base36 v0.1.0/go.mod h1:kFGE83c6s80PklsHO9sRn2NCoffoRdUUOENyW/Vv6sM= +github.com/multiformats/go-multibase v0.0.1 h1:PN9/v21eLywrFWdFNsFKaU04kLJzuYzmrJR+ubhT9qA= +github.com/multiformats/go-multibase v0.0.1/go.mod h1:bja2MqRZ3ggyXtZSEDKpl0uO/gviWFaSteVbWT51qgs= +github.com/multiformats/go-multibase v0.0.3 h1:l/B6bJDQjvQ5G52jw4QGSYeOTZoAwIO77RblWplfIqk= +github.com/multiformats/go-multibase v0.0.3/go.mod h1:5+1R4eQrT3PkYZ24C3W2Ue2tPwIdYQD509ZjSb5y9Oc= +github.com/multiformats/go-multihash v0.0.1 h1:HHwN1K12I+XllBCrqKnhX949Orn4oawPkegHMu2vDqQ= +github.com/multiformats/go-multihash v0.0.1/go.mod h1:w/5tugSrLEbWqlcgJabL3oHFKTwfvkofsjW2Qa1ct4U= +github.com/multiformats/go-multihash v0.0.13/go.mod h1:VdAWLKTwram9oKAatUcLxBNUjdtcVwxObEQBtRfuyjc= +github.com/multiformats/go-multihash v0.0.14 h1:QoBceQYQQtNUuf6s7wHxnE2c8bhbMqhfGzNI032se/I= +github.com/multiformats/go-multihash v0.0.14/go.mod h1:VdAWLKTwram9oKAatUcLxBNUjdtcVwxObEQBtRfuyjc= +github.com/multiformats/go-varint v0.0.5/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= +github.com/multiformats/go-varint v0.0.6 h1:gk85QWKxh3TazbLxED/NlDVv8+q+ReFJk7Y2W/KhfNY= +github.com/multiformats/go-varint v0.0.6/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= +github.com/opentracing/opentracing-go v1.0.2 h1:3jA2P6O1F9UOrWVpwrIo17pu01KWvNWg4X946/Y5Zwg= +github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= +github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/whyrusleeping/chunker v0.0.0-20181014151217-fe64bd25879f h1:jQa4QT2UP9WYv2nzyawpKMOCl+Z/jW7djv2/J50lj9E= +github.com/whyrusleeping/chunker v0.0.0-20181014151217-fe64bd25879f/go.mod h1:p9UJB6dDgdPgMJZs7UjUOdulKyRr9fqkS+6JKAInPy8= +github.com/whyrusleeping/go-logging v0.0.0-20170515211332-0457bb6b88fc h1:9lDbC6Rz4bwmou+oE6Dt4Cb2BGMur5eR/GYptkKUVHo= +github.com/whyrusleeping/go-logging v0.0.0-20170515211332-0457bb6b88fc/go.mod h1:bopw91TMyo8J3tvftk8xmU2kPmlrt4nScJQZU2hE5EM= +gitlab.dms3.io/dms3/public/go-block-format v0.0.1 h1:PQ6+E7zY6kUIHET86uJTQHTTj4Z9ZNfP7w281ZdExgk= +gitlab.dms3.io/dms3/public/go-block-format v0.0.1/go.mod h1:xlvtW/OF72rOzLa2RVWXX2Uw18qTAWTQEs/Xp7SCnuY= +gitlab.dms3.io/dms3/public/go-cid v0.0.1 h1:qs4dtkDigcLGY/58dIZaFjKLt+orrTcmTBvtqaM3570= +gitlab.dms3.io/dms3/public/go-cid v0.0.1/go.mod h1:GQw3gc4CSrFY+aX6M+OBQDlg0p5/eQJoRrayaZzkAOQ= +gitlab.dms3.io/dms3/public/go-dms3-util v0.0.1 h1:Gd+kJl1Rc+ZEUb9CIS1ZctQnF9G1oruNFyxUC//QBUQ= +gitlab.dms3.io/dms3/public/go-dms3-util v0.0.1/go.mod h1:ymlwtzTNMq8Ug+gVtPAMxXKCKTXwXJAzXS+SUihfKgo= +gitlab.dms3.io/dms3/public/go-log v0.0.1 h1:jqz2g8pVdPW+Sy8CCo4rYfGEjktGhCBfgIb3oeY6yx8= +gitlab.dms3.io/dms3/public/go-log v0.0.1/go.mod h1:OsyF7lVYe47r03v1ZCbrmz0byeGUWB0Y219jN1DJx3s= +go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= +go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= +go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU= +go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= +go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= +go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA= +go.uber.org/zap v1.16.0 h1:uFRZXykJGK9lLY4HtgSw44DnIcAM+kRBP7x5m+NpAOM= +go.uber.org/zap v1.16.0/go.mod h1:MA8QOfq0BHJwdXa996Y4dYkAqRKB8/1K1QMMZVaNZjQ= +golang.org/x/crypto v0.0.0-20190211182817-74369b46fc67 h1:ng3VDlRp5/DHpSWl02R4rM9I+8M2rhmsuLwAMmkLQWE= +golang.org/x/crypto v0.0.0-20190211182817-74369b46fc67/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8 h1:1wopBVtVdWnn03fZelqdXTqk7U7zPQCb+T4rbU9ZEoU= +golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= +golang.org/x/net v0.0.0-20190227160552-c95aed5357e7 h1:C2F/nMkR/9sfUTpvR3QrjBuTdvMUC/cFajkphs1YLQo= +golang.org/x/net v0.0.0-20190227160552-c95aed5357e7/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190219092855-153ac476189d/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223 h1:DH4skfRX4EBpamg7iV4ZlCpblAHI6s6TDM39bFZumv8= +golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20191029041327-9cc4af7d6b2c/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= diff --git a/parse.go b/parse.go new file mode 100644 index 0000000000000000000000000000000000000000..b0492841e86e98c31289a055fcfd0d238ee09453 --- /dev/null +++ b/parse.go @@ -0,0 +1,114 @@ +package chunk + +import ( + "errors" + "fmt" + "io" + "strconv" + "strings" +) + +const ( + // DefaultBlockSize is the chunk size that splitters produce (or aim to). + DefaultBlockSize int64 = 1024 * 256 + + // No leaf block should contain more than 1MiB of payload data ( wrapping overhead aside ) + // This effectively mandates the maximum chunk size + // See discussion at https://gitlab.dms3.io/dms3/public/go-dms3-chunker/pull/21#discussion_r369124879 for background + ChunkSizeLimit int = 1048576 +) + +var ( + ErrRabinMin = errors.New("rabin min must be greater than 16") + ErrSize = errors.New("chunker size must be greater than 0") + ErrSizeMax = fmt.Errorf("chunker parameters may not exceed the maximum chunk size of %d", ChunkSizeLimit) +) + +// FromString returns a Splitter depending on the given string: +// it supports "default" (""), "size-{size}", "rabin", "rabin-{blocksize}", +// "rabin-{min}-{avg}-{max}" and "buzhash". +func FromString(r io.Reader, chunker string) (Splitter, error) { + switch { + case chunker == "" || chunker == "default": + return DefaultSplitter(r), nil + + case strings.HasPrefix(chunker, "size-"): + sizeStr := strings.Split(chunker, "-")[1] + size, err := strconv.Atoi(sizeStr) + if err != nil { + return nil, err + } else if size <= 0 { + return nil, ErrSize + } else if size > ChunkSizeLimit { + return nil, ErrSizeMax + } + return NewSizeSplitter(r, int64(size)), nil + + case strings.HasPrefix(chunker, "rabin"): + return parseRabinString(r, chunker) + + case chunker == "buzhash": + return NewBuzhash(r), nil + + default: + return nil, fmt.Errorf("unrecognized chunker option: %s", chunker) + } +} + +func parseRabinString(r io.Reader, chunker string) (Splitter, error) { + parts := strings.Split(chunker, "-") + switch len(parts) { + case 1: + return NewRabin(r, uint64(DefaultBlockSize)), nil + case 2: + size, err := strconv.Atoi(parts[1]) + if err != nil { + return nil, err + } else if int(float32(size)*1.5) > ChunkSizeLimit { // FIXME - this will be addressed in a subsequent PR + return nil, ErrSizeMax + } + return NewRabin(r, uint64(size)), nil + case 4: + sub := strings.Split(parts[1], ":") + if len(sub) > 1 && sub[0] != "min" { + return nil, errors.New("first label must be min") + } + min, err := strconv.Atoi(sub[len(sub)-1]) + if err != nil { + return nil, err + } + if min < 16 { + return nil, ErrRabinMin + } + sub = strings.Split(parts[2], ":") + if len(sub) > 1 && sub[0] != "avg" { + log.Error("sub == ", sub) + return nil, errors.New("second label must be avg") + } + avg, err := strconv.Atoi(sub[len(sub)-1]) + if err != nil { + return nil, err + } + + sub = strings.Split(parts[3], ":") + if len(sub) > 1 && sub[0] != "max" { + return nil, errors.New("final label must be max") + } + max, err := strconv.Atoi(sub[len(sub)-1]) + if err != nil { + return nil, err + } + + if min >= avg { + return nil, errors.New("incorrect format: rabin-min must be smaller than rabin-avg") + } else if avg >= max { + return nil, errors.New("incorrect format: rabin-avg must be smaller than rabin-max") + } else if max > ChunkSizeLimit { + return nil, ErrSizeMax + } + + return NewRabinMinMax(r, uint64(min), uint64(avg), uint64(max)), nil + default: + return nil, errors.New("incorrect format (expected 'rabin' 'rabin-[avg]' or 'rabin-[min]-[avg]-[max]'") + } +} diff --git a/parse_test.go b/parse_test.go new file mode 100644 index 0000000000000000000000000000000000000000..237a2b439a6cc74a0a83a3b63da3affa0e9cee87 --- /dev/null +++ b/parse_test.go @@ -0,0 +1,80 @@ +package chunk + +import ( + "bytes" + "fmt" + "testing" +) + +const ( + testTwoThirdsOfChunkLimit = 2 * (float32(ChunkSizeLimit) / float32(3)) +) + +func TestParseRabin(t *testing.T) { + r := bytes.NewReader(randBuf(t, 1000)) + + _, err := FromString(r, "rabin-18-25-32") + if err != nil { + t.Errorf(err.Error()) + } + + _, err = FromString(r, "rabin-15-23-31") + if err != ErrRabinMin { + t.Fatalf("Expected an 'ErrRabinMin' error, got: %#v", err) + } + + _, err = FromString(r, "rabin-20-20-21") + if err == nil || err.Error() != "incorrect format: rabin-min must be smaller than rabin-avg" { + t.Fatalf("Expected an arg-out-of-order error, got: %#v", err) + } + + _, err = FromString(r, "rabin-19-21-21") + if err == nil || err.Error() != "incorrect format: rabin-avg must be smaller than rabin-max" { + t.Fatalf("Expected an arg-out-of-order error, got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("rabin-19-21-%d", ChunkSizeLimit)) + if err != nil { + t.Fatalf("Expected success, got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("rabin-19-21-%d", 1+ChunkSizeLimit)) + if err != ErrSizeMax { + t.Fatalf("Expected 'ErrSizeMax', got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("rabin-%.0f", testTwoThirdsOfChunkLimit)) + if err != nil { + t.Fatalf("Expected success, got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("rabin-%.0f", 1+testTwoThirdsOfChunkLimit)) + if err != ErrSizeMax { + t.Fatalf("Expected 'ErrSizeMax', got: %#v", err) + } + +} + +func TestParseSize(t *testing.T) { + r := bytes.NewReader(randBuf(t, 1000)) + + _, err := FromString(r, "size-0") + if err != ErrSize { + t.Fatalf("Expected an 'ErrSize' error, got: %#v", err) + } + + _, err = FromString(r, "size-32") + if err != nil { + t.Fatalf("Expected success, got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("size-%d", ChunkSizeLimit)) + if err != nil { + t.Fatalf("Expected success, got: %#v", err) + } + + _, err = FromString(r, fmt.Sprintf("size-%d", 1+ChunkSizeLimit)) + if err != ErrSizeMax { + t.Fatalf("Expected 'ErrSizeMax', got: %#v", err) + } +} diff --git a/rabin.go b/rabin.go new file mode 100644 index 0000000000000000000000000000000000000000..467e38819ddfe45389eedced2ca707b195434fc2 --- /dev/null +++ b/rabin.go @@ -0,0 +1,54 @@ +package chunk + +import ( + "hash/fnv" + "io" + + "github.com/whyrusleeping/chunker" +) + +// Dms3RabinPoly is the irreducible polynomial of degree 53 used by for Rabin. +var Dms3RabinPoly = chunker.Pol(17437180132763653) + +// Rabin implements the Splitter interface and splits content with Rabin +// fingerprints. +type Rabin struct { + r *chunker.Chunker + reader io.Reader +} + +// NewRabin creates a new Rabin splitter with the given +// average block size. +func NewRabin(r io.Reader, avgBlkSize uint64) *Rabin { + min := avgBlkSize / 3 + max := avgBlkSize + (avgBlkSize / 2) + + return NewRabinMinMax(r, min, avgBlkSize, max) +} + +// NewRabinMinMax returns a new Rabin splitter which uses +// the given min, average and max block sizes. +func NewRabinMinMax(r io.Reader, min, avg, max uint64) *Rabin { + h := fnv.New32a() + ch := chunker.New(r, Dms3RabinPoly, h, avg, min, max) + + return &Rabin{ + r: ch, + reader: r, + } +} + +// NextBytes reads the next bytes from the reader and returns a slice. +func (r *Rabin) NextBytes() ([]byte, error) { + ch, err := r.r.Next() + if err != nil { + return nil, err + } + + return ch.Data, nil +} + +// Reader returns the io.Reader associated to this Splitter. +func (r *Rabin) Reader() io.Reader { + return r.reader +} diff --git a/rabin_test.go b/rabin_test.go new file mode 100644 index 0000000000000000000000000000000000000000..76ed671850c63de38bbb757f9b367fbc519b133a --- /dev/null +++ b/rabin_test.go @@ -0,0 +1,108 @@ +package chunk + +import ( + "bytes" + "fmt" + "io" + "testing" + + blocks "gitlab.dms3.io/dms3/public/go-block-format" + util "gitlab.dms3.io/dms3/public/go-dms3-util" +) + +func TestRabinChunking(t *testing.T) { + data := make([]byte, 1024*1024*16) + n, err := util.NewTimeSeededRand().Read(data) + if n < len(data) { + t.Fatalf("expected %d bytes, got %d", len(data), n) + } + if err != nil { + t.Fatal(err) + } + + r := NewRabin(bytes.NewReader(data), 1024*256) + + var chunks [][]byte + + for { + chunk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + t.Fatal(err) + } + + chunks = append(chunks, chunk) + } + + fmt.Printf("average block size: %d\n", len(data)/len(chunks)) + + unchunked := bytes.Join(chunks, nil) + if !bytes.Equal(unchunked, data) { + fmt.Printf("%d %d\n", len(unchunked), len(data)) + t.Fatal("data was chunked incorrectly") + } +} + +func chunkData(t *testing.T, newC newSplitter, data []byte) map[string]blocks.Block { + r := newC(bytes.NewReader(data)) + + blkmap := make(map[string]blocks.Block) + + for { + blk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + t.Fatal(err) + } + + b := blocks.NewBlock(blk) + blkmap[b.Cid().KeyString()] = b + } + + return blkmap +} + +func testReuse(t *testing.T, cr newSplitter) { + data := make([]byte, 1024*1024*16) + n, err := util.NewTimeSeededRand().Read(data) + if n < len(data) { + t.Fatalf("expected %d bytes, got %d", len(data), n) + } + if err != nil { + t.Fatal(err) + } + + ch1 := chunkData(t, cr, data[1000:]) + ch2 := chunkData(t, cr, data) + + var extra int + for k := range ch2 { + _, ok := ch1[k] + if !ok { + extra++ + } + } + + if extra > 2 { + t.Logf("too many spare chunks made: %d", extra) + } +} + +func TestRabinChunkReuse(t *testing.T) { + newRabin := func(r io.Reader) Splitter { + return NewRabin(r, 256*1024) + } + testReuse(t, newRabin) +} + +var Res uint64 + +func BenchmarkRabin(b *testing.B) { + benchmarkChunker(b, func(r io.Reader) Splitter { + return NewRabin(r, 256<<10) + }) +} diff --git a/splitting.go b/splitting.go new file mode 100644 index 0000000000000000000000000000000000000000..81f40ccf5c68e955d67afca8befa2d039c662e0d --- /dev/null +++ b/splitting.go @@ -0,0 +1,102 @@ +// Package chunk implements streaming block splitters. +// Splitters read data from a reader and provide byte slices (chunks) +// The size and contents of these slices depend on the splitting method +// used. +package chunk + +import ( + "io" + + pool "github.com/libp2p/go-buffer-pool" + logging "gitlab.dms3.io/dms3/public/go-log" +) + +var log = logging.Logger("chunk") + +// A Splitter reads bytes from a Reader and creates "chunks" (byte slices) +// that can be used to build DAG nodes. +type Splitter interface { + Reader() io.Reader + NextBytes() ([]byte, error) +} + +// SplitterGen is a splitter generator, given a reader. +type SplitterGen func(r io.Reader) Splitter + +// DefaultSplitter returns a SizeSplitter with the DefaultBlockSize. +func DefaultSplitter(r io.Reader) Splitter { + return NewSizeSplitter(r, DefaultBlockSize) +} + +// SizeSplitterGen returns a SplitterGen function which will create +// a splitter with the given size when called. +func SizeSplitterGen(size int64) SplitterGen { + return func(r io.Reader) Splitter { + return NewSizeSplitter(r, size) + } +} + +// Chan returns a channel that receives each of the chunks produced +// by a splitter, along with another one for errors. +func Chan(s Splitter) (<-chan []byte, <-chan error) { + out := make(chan []byte) + errs := make(chan error, 1) + go func() { + defer close(out) + defer close(errs) + + // all-chunks loop (keep creating chunks) + for { + b, err := s.NextBytes() + if err != nil { + errs <- err + return + } + + out <- b + } + }() + return out, errs +} + +type sizeSplitterv2 struct { + r io.Reader + size uint32 + err error +} + +// NewSizeSplitter returns a new size-based Splitter with the given block size. +func NewSizeSplitter(r io.Reader, size int64) Splitter { + return &sizeSplitterv2{ + r: r, + size: uint32(size), + } +} + +// NextBytes produces a new chunk. +func (ss *sizeSplitterv2) NextBytes() ([]byte, error) { + if ss.err != nil { + return nil, ss.err + } + + full := pool.Get(int(ss.size)) + n, err := io.ReadFull(ss.r, full) + switch err { + case io.ErrUnexpectedEOF: + ss.err = io.EOF + small := make([]byte, n) + copy(small, full) + pool.Put(full) + return small, nil + case nil: + return full, nil + default: + pool.Put(full) + return nil, err + } +} + +// Reader returns the io.Reader associated to this Splitter. +func (ss *sizeSplitterv2) Reader() io.Reader { + return ss.r +} diff --git a/splitting_test.go b/splitting_test.go new file mode 100644 index 0000000000000000000000000000000000000000..1988303726807f55e58f5442c68cf0db843ed3d2 --- /dev/null +++ b/splitting_test.go @@ -0,0 +1,126 @@ +package chunk + +import ( + "bytes" + "io" + "testing" + + u "gitlab.dms3.io/dms3/public/go-dms3-util" +) + +func randBuf(t *testing.T, size int) []byte { + buf := make([]byte, size) + if _, err := u.NewTimeSeededRand().Read(buf); err != nil { + t.Fatal("failed to read enough randomness") + } + return buf +} + +func copyBuf(buf []byte) []byte { + cpy := make([]byte, len(buf)) + copy(cpy, buf) + return cpy +} + +func TestSizeSplitterOverAllocate(t *testing.T) { + max := 1000 + r := bytes.NewReader(randBuf(t, max)) + chunksize := int64(1024 * 256) + splitter := NewSizeSplitter(r, chunksize) + chunk, err := splitter.NextBytes() + if err != nil { + t.Fatal(err) + } + if cap(chunk) > len(chunk) { + t.Fatal("chunk capacity too large") + } +} + +func TestSizeSplitterIsDeterministic(t *testing.T) { + if testing.Short() { + t.SkipNow() + } + + test := func() { + bufR := randBuf(t, 10000000) // crank this up to satisfy yourself. + bufA := copyBuf(bufR) + bufB := copyBuf(bufR) + + chunksA, _ := Chan(DefaultSplitter(bytes.NewReader(bufA))) + chunksB, _ := Chan(DefaultSplitter(bytes.NewReader(bufB))) + + for n := 0; ; n++ { + a, moreA := <-chunksA + b, moreB := <-chunksB + + if !moreA { + if moreB { + t.Fatal("A ended, B didnt.") + } + return + } + + if !bytes.Equal(a, b) { + t.Fatalf("chunk %d not equal", n) + } + } + } + + for run := 0; run < 1; run++ { // crank this up to satisfy yourself. + test() + } +} + +func TestSizeSplitterFillsChunks(t *testing.T) { + if testing.Short() { + t.SkipNow() + } + + max := 10000000 + b := randBuf(t, max) + r := &clipReader{r: bytes.NewReader(b), size: 4000} + chunksize := int64(1024 * 256) + c, _ := Chan(NewSizeSplitter(r, chunksize)) + + sofar := 0 + whole := make([]byte, max) + for chunk := range c { + + bc := b[sofar : sofar+len(chunk)] + if !bytes.Equal(bc, chunk) { + t.Fatalf("chunk not correct: (sofar: %d) %d != %d, %v != %v", sofar, len(bc), len(chunk), bc[:100], chunk[:100]) + } + + copy(whole[sofar:], chunk) + + sofar += len(chunk) + if sofar != max && len(chunk) < int(chunksize) { + t.Fatal("sizesplitter split at a smaller size") + } + } + + if !bytes.Equal(b, whole) { + t.Fatal("splitter did not split right") + } +} + +type clipReader struct { + size int + r io.Reader +} + +func (s *clipReader) Read(buf []byte) (int, error) { + + // clip the incoming buffer to produce smaller chunks + if len(buf) > s.size { + buf = buf[:s.size] + } + + return s.r.Read(buf) +} + +func BenchmarkDefault(b *testing.B) { + benchmarkChunker(b, func(r io.Reader) Splitter { + return DefaultSplitter(r) + }) +}