package scratch

import (
	"io"
)

// Reader provides zero-copy read methods and other helpful utilities commonly desired in parsers
// around either an io.Reader or a plain byte slice.
//
// Read methods with 'n' in the name take a size parameter for how much to read.
// Read methods with a number in the name read that fixed number of bytes.
// Read methods with 'b' in the name accept a byte slice parameter which will be used for output, allowing you to control memory reuse.
// Read methods with 'z' in the name will attempt to return zero-copy access to buffers controlled by the Reader --
// be careful when using these 'z' methods; it is not recommended to expose the zero-copy slices these methods yield,
// because the reader itself may also reuse them, and so the likelihood of spooky-action-at-a-distance bugs is high.
//
// While this Reader does some buffering, it's not much (and primarily oriented around reuse of scratch buffers rather than intentionally large batch readaheads);
// it may still be advisable to use a buffered reader to avoid small reads if reading streamingly from external IO like disk or network.
type Reader struct {
	stream     io.Reader                 // source to keep pumping, or may be nil if we're wrapping a single already-in-memory slice and we know it.
	buf        []byte                    // alternative to `stream`, if we have a single already-in-memory slice and we know it.
	cursor     int                       // position of start of next read, if we're using `buf`.
	scratch    [scratchByteArrayLen]byte // temp byte array re-used internally for efficiency during read. 'readz' methods return views into this.
	numRead    int64                     // aggregate number of bytes read (since last reset of numRead, anyway).
	tracked    []byte                    // bytes that have been read while we've been in tracking state. a subslice of `buf` where possible, but may be its own alloc if we're in streaming mode.
	unread     byte                      // a byte tracked for the potential need for unreading. only used if using `stream`; if using `buf`, we just adjust `cursor`.
	isTracking bool                      // whether we're in the tracking state.
	canUnread  bool                      // whether `unread` is currently valid.
	haveUnread bool                      // whether we need to replay an unread byte. only checked when in `stream` mode, because `buf` mode just adjusts `cursor`.
}

// You'll find many implementation methods have a large switch around `z.stream == nil`.
// This is effectively a toggle for whether we're operating in streaming mode or on already-in-memory byte slices.
// This would've been cleaner code with an interface and two implementations -- no doubt!
// However, it ends up less inliner-friendly if an interface is involved.
//
// Stylistically: I've allowed rightward drift in 'if' cases for stream vs buf mode,
// rather than using the usual golang rule of thumb about early returns. I find this easier to read, given the semiotics.
//
// FUTURE: it may be worth reviewing the utility of this when go1.16 is out -- some of its features for optimization
// through interfaces when concrete types can be inferred might change the consequences of this design quite a bit.

const (
	scratchByteArrayLen = 32
)

var (
	zeroByteSlice = []byte{}[:0:0]
)
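// The helper below is a minimal usage sketch, not part of the package's API surface:
// it shows one way a caller might select between the two operating modes described above.
// (`exampleNewReader` and its argument conventions are hypothetical.)
func exampleNewReader(src []byte, stream io.Reader) *Reader {
	z := &Reader{}
	if src != nil {
		z.InitSlice(src) // `buf` mode: reads become cursor arithmetic and subslicing.
	} else {
		z.InitReader(stream) // `stream` mode: reads pump the io.Reader, buffering through scratch.
	}
	return z
}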
// Init makes this Reader ready to consume the given io.Reader.
// If this Reader has been used before, all state is zeroed out cleanly.
//
// As a convenience, if the io.Reader looks like it can return all the bytes at once
// (e.g., it has a `Bytes() []byte` method -- as bytes.Buffer does, for example),
// then Init will access that and use InitSlice, which should lead to better performance.
func (z *Reader) Init(r io.Reader) {
	type BytesAccessor interface {
		Bytes() []byte
	}
	if ba, ok := r.(BytesAccessor); ok {
		z.InitSlice(ba.Bytes())
	} else {
		z.InitReader(r)
	}
}

// InitSlice makes this Reader ready to consume the given byte slice.
// If this Reader has been used before, all state is zeroed out cleanly.
//
// InitSlice is functionally equivalent to wrapping the byte slice in a reader and using Init,
// but will result in a Reader that generally operates somewhat faster and is able to deliver more zero-copy behaviors.
// (When we know we're working with a byte slice that's already entirely in memory,
// we never have to worry about read alignment, etc.)
func (z *Reader) InitSlice(bs []byte) {
	*z = Reader{}
	z.buf = bs
}

// InitReader makes this Reader ready to consume the given io.Reader.
// If this Reader has been used before, all state is zeroed out cleanly.
//
// Unlike Init, this initializer will not attempt to autodetect any interface
// which may provide direct access to underlying byte slices; it will always work in stream mode.
func (z *Reader) InitReader(r io.Reader) {
	*z = Reader{} // FUTURE: this could try to recycle any capacity in z.tracked.
	z.stream = r
}

// Readnzc reads up to n bytes into a byte slice which may be shared and must not be reused after any additional calls to this reader.
// Readnzc will use the Reader's internal scratch buffer if possible (i.e. n < scratchByteArrayLen),
// or may return a view of the []byte being decoded from if the read is larger.
// If there are fewer than n bytes to be read, a shorter slice will be returned, and err will be ErrUnexpectedEOF.
// Requesting a zero-length read will return `zeroByteSlice`, a len-zero cap-zero slice.
// If you know your read may be longer than scratchByteArrayLen and
// you already have an existing slice of sufficient size to reuse, prefer `Readb`.
func (z *Reader) Readnzc(n int) (bs []byte, err error) {
	if n == 0 {
		return zeroByteSlice, nil
	}
	z.canUnread = false
	if z.stream == nil { // in `buf` mode, we can just return subslices.
		remaining := len(z.buf) - z.cursor
		if n > remaining { // partial read from end of buf
			n = remaining             // mostly the same, just shorter
			err = io.ErrUnexpectedEOF // and give notice of the short read
		}
		bs = z.buf[z.cursor : z.cursor+n]
		z.cursor += n
		z.numRead += int64(n)
		if z.isTracking {
			z.tracked = z.tracked[:len(z.tracked)+n] // See TestTechniqueSliceExtension if this bewilders you.
		}
		return
	} else { // in `stream` mode, we'll set up buffers, then use readStream to do most of the work.
		if n < len(z.scratch) { // read from stream and fits in scratch
			bs = z.scratch[:n]
		} else { // read from stream and needs a new allocation
			bs = make([]byte, n) // this is a sadpath; you should've used Readb.
		}
		n, err = z.readStream(bs)
		return bs[:n], err
	}
}
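// A minimal usage sketch for Readnzc (the `exampleReadToken` helper is hypothetical):
// read an n-byte token and copy it out immediately, since the zero-copy view may be
// invalidated by the Reader's next read. Partial reads are simply discarded here.
func exampleReadToken(z *Reader, n int) ([]byte, error) {
	view, err := z.Readnzc(n)
	if err != nil {
		return nil, err // a short read surfaces as io.ErrUnexpectedEOF with a truncated view.
	}
	tok := make([]byte, len(view))
	copy(tok, view) // copy before any further read call can reuse the underlying buffer.
	return tok, nil
}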
// Readb reads up to `len(bs)` bytes into the given slice, starting at its beginning,
// overwriting all values, and disregarding any extra capacity.
// If there are fewer than `len(bs)` bytes to be read, a partial read will be returned:
// some of the slice will be modified, n will be less than the slice length, and err will be ErrUnexpectedEOF.
// (If you're intentionally providing a larger slice than may be necessary in order to get a batch read,
// you will want to check for and discard ErrUnexpectedEOF!)
// If no error is returned, n will always be the length of the slice.
//
// Readb will never return a zero-copy subslice of an existing buffer;
// use one of the 'Read*z*' methods for that.
func (z *Reader) Readb(bs []byte) (n int, err error) {
	if len(bs) == 0 {
		return 0, nil
	}
	z.canUnread = false
	if z.stream == nil { // in `buf` mode, we can just copy out of subslices.
		n = len(bs)
		remaining := len(z.buf) - z.cursor
		if n > remaining { // partial read from end of buf
			n = remaining             // mostly the same, just shorter
			err = io.ErrUnexpectedEOF // and give notice of the short read
		}
		copy(bs, z.buf[z.cursor:z.cursor+n])
		z.cursor += n
		z.numRead += int64(n)
		if z.isTracking {
			z.tracked = z.tracked[:len(z.tracked)+n] // See TestTechniqueSliceExtension if this bewilders you.
		}
		return
	} else {
		return z.readStream(bs)
	}
}

func (z *Reader) readStream(bs []byte) (n int, err error) {
	// fun note: a corresponding readBuf method turned out not useful to create,
	// because the different return conventions of the exported methods actually matter to what shortcuts we can take when wrangling raw slices
	// (whereas the impact of those return conventions turns out not to carry as far when we already have to handle extra slices, as we do in `stream` mode).
	// In `stream` mode, we first handle replaying unreads if necessary; then, use io.ReadAtLeast to load as much data as requested.
	if z.haveUnread {
		bs[0] = z.unread
		z.haveUnread = false
		n, err = io.ReadAtLeast(z.stream, bs[1:], len(bs)-1)
		n++
	} else {
		n, err = io.ReadAtLeast(z.stream, bs, len(bs))
	}
	z.numRead += int64(n)
	if z.isTracking {
		z.tracked = append(z.tracked, bs[:n]...)
	}
	return
}

// Readn reads up to n bytes into a new byte slice.
// If there are fewer than n bytes to be read, a shorter slice will be returned, and err will be ErrUnexpectedEOF.
// If zero-copy views into existing buffers are acceptable (e.g. you know you
// won't later mutate, reference or expose this memory again), prefer `Readnzc`.
// If you already have an existing slice of sufficient size to reuse, prefer `Readb`.
// Requesting a zero-length read will return `zeroByteSlice`, a len-zero cap-zero slice.
//
// Readn will never return a zero-copy subslice of an existing buffer;
// use one of the 'Read*z*' methods for that.
// (Readn is purely a convenience method; you can always use Readb to equivalent effect.)
func (z *Reader) Readn(n int) (bs []byte, err error) {
	if n == 0 {
		return zeroByteSlice, nil
	}
	// This really is just a convenience method. It's the same regardless of which mode we're in.
	bs = make([]byte, n)
	n, err = z.Readb(bs)
	return bs[:n], err
}

// Readn1 reads a single byte.
func (z *Reader) Readn1() (byte, error) {
	// Just use Readnzc, which handles any tracking shifts, and also any unread replays, transparently.
	// Hopefully the compiler is clever enough to make the assembly shorter than the source.
	// REVIEW: may want to look especially at the benchmark and the assembly on this; it might be improvable by hand-rolling more of this specialization,
	// and it's probably important to do so, considering how much of parsing for textual formats like json involves single-byte scanning.
	bs, err := z.Readnzc(1)
	if err != nil {
		return 0, err
	}
	z.canUnread = true
	z.unread = bs[0]
	return bs[0], nil
}
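// A sketch of the single-byte peek pattern that Readn1 and Unreadn1 (below) enable
// (`examplePeek` is a hypothetical helper): look at the next byte without consuming it.
func examplePeek(z *Reader) (byte, error) {
	b, err := z.Readn1()
	if err != nil {
		return 0, err
	}
	z.Unreadn1() // legal here: the immediately preceding call was Readn1.
	return b, nil
}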
// Unreadn1 "unreads" a single byte which was previously read by Readn1.
// The result is that subsequent reads will include that byte,
// and applying the Track method will also cause the track result to include that byte.
//
// Unreadn1 can only be used when the previous call was Readn1, and may panic otherwise.
func (z *Reader) Unreadn1() {
	if !z.canUnread {
		panic("Unreadn1 can only be used following Readn1")
	}
	z.canUnread = false
	z.numRead--
	if z.isTracking {
		z.tracked = z.tracked[0 : len(z.tracked)-1]
	}
	if z.stream == nil {
		z.cursor--
	} else {
		z.haveUnread = true
	}
}

// NumRead returns the aggregate number of bytes read so far (since the last ResetNumRead, if any).
func (z *Reader) NumRead() int64 {
	return z.numRead
}

// ResetNumRead resets the counter reported by NumRead to zero.
func (z *Reader) ResetNumRead() {
	z.numRead = 0
}

// Track causes the Reader to place a marker and accumulate all bytes into a single contiguous slice
// up until StopTrack is called; StopTrack will return a reference to this slice.
// Thus, StopTrack will yield bytes which have already also been seen via other read method calls.
//
// This can be useful when parsing logic requires scanning ahead to look for the end of an unknown-length segment of data, for example.
//
// Calling Track twice without an intervening StopTrack will result in panic.
func (z *Reader) Track() {
	if z.isTracking {
		panic("Track cannot be called again when already tracking")
	}
	z.isTracking = true
	if z.stream == nil {
		// Save the start position. We'll just extend the length of this slice (within the cap of buf) as we go forward.
		z.tracked = z.buf[z.cursor:z.cursor]
	} else {
		// Nothing to do for stream mode; it'll just accumulate naturally through appends.
	}
}

// StopTrack returns the byte slice accumulated since Track was called, and drops the marker.
//
// Calling StopTrack when Track is not in effect will result in panic.
//
// The slice returned by StopTrack may be reused if Track is called again in the future;
// the caller should copy the contents to a new byte slice before the next call to Track
// if they intend to either keep this data available for a long time or to mutate it.
func (z *Reader) StopTrack() []byte {
	if !z.isTracking {
		panic("StopTrack cannot be called when not tracking")
	}
	z.isTracking = false
	answer := z.tracked
	z.tracked = z.tracked[0:0]
	return answer
}
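// A sketch of the scan-ahead pattern that Track and StopTrack support
// (`exampleReadUntil` is a hypothetical helper): consume bytes up to and including a
// delimiter, and get the whole run back as one contiguous slice. Per the StopTrack
// contract, the caller must copy the result before tracking again if it needs to keep it.
func exampleReadUntil(z *Reader, delim byte) ([]byte, error) {
	z.Track()
	for {
		b, err := z.Readn1()
		if err != nil {
			z.StopTrack() // drop the marker; the partial capture is discarded in this sketch.
			return nil, err
		}
		if b == delim {
			return z.StopTrack(), nil
		}
	}
}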