From 59eaef2ae3599db7276debc55348b8f400b9e5c6 Mon Sep 17 00:00:00 2001 From: xuu Date: Sat, 28 Oct 2023 19:40:29 -0600 Subject: [PATCH] chore(lsm): add initial range search --- lsm/marshal_test.go | 31 ++++++++------- lsm/sst.go | 69 ++++++++++++++++++++++++--------- lsm/sst_test.go | 94 ++++++++++++++++++++++++++------------------- 3 files changed, 123 insertions(+), 71 deletions(-) diff --git a/lsm/marshal_test.go b/lsm/marshal_test.go index e67f0a0..8bdcffa 100644 --- a/lsm/marshal_test.go +++ b/lsm/marshal_test.go @@ -44,7 +44,14 @@ func TestReverse(t *testing.T) { func TestFile(t *testing.T) { is := is.New(t) - f := basicFile(t) + entries := entries { + {"key-1", 1}, + {"key-2", 2}, + {"key-3", 3}, + {"longerkey-4", 65535}, + } + + f := basicFile(t, entries, entries, entries) sf, err := ReadFile(f) is.NoErr(err) @@ -52,20 +59,18 @@ func TestFile(t *testing.T) { is.Equal(len(sf.segments), 3) } -func basicFile(t *testing.T) fs.File { +func basicFile(t *testing.T, lis ...entries) fs.File { t.Helper() - data := segment{entries: entries{ - {"key-1", 1}, - {"key-2", 2}, - {"key-3", 3}, - {"longerkey-4", 65535}, - }} - - b, err := data.MarshalBinary() - if err != nil { - t.Error(err) + segments := make([][]byte, len(lis)) + var err error + for i, entries := range lis { + data := segment{entries: entries} + segments[i], err = data.MarshalBinary() + if err != nil { + t.Error(err) + } } - return NewFile(b, b, b) + return NewFile(segments...) } diff --git a/lsm/sst.go b/lsm/sst.go index 5f6c833..199ad50 100644 --- a/lsm/sst.go +++ b/lsm/sst.go @@ -1,5 +1,12 @@ // SPDX-FileCopyrightText: 2023 Jon Lundy // SPDX-License-Identifier: BSD-3-Clause + +// lsm -- Log Structured Merge-Tree +// +// This is a basic LSM tree using a SSTable optimized for append only writing. On disk data is organized into time ordered +// files of segments, containing reverse sorted keys. Each segment ends with a magic value `Souris\x01`, a 4byte hash, count of +// segment entries, and data length. + package lsm import ( @@ -23,11 +30,11 @@ var ( ) type header struct { - sig []byte - entries uint64 - datalen uint64 - headlen uint64 - end int64 + sig []byte // 4Byte signature + entries uint64 // count of entries in segment + datalen uint64 // length of data + headlen uint64 // length of header + end int64 // location of end of data/start of header (start of data is `end - datalen`) } // ReadHead parse header from a segment. reads from the end of slice of length segmentFooterLength @@ -173,8 +180,21 @@ func (s *segmentReader) FirstEntry() (*entryBytes, error) { return e, err } +func (s *segmentReader) VerifyHash() (bool, error) { + h := hash() + data := make([]byte, s.head.datalen) + _, err := s.rd.ReadAt(data, s.head.end-int64(s.head.datalen)) + if err != nil { + return false, err + } + _, err = h.Write(data) + ok := bytes.Equal(h.Sum(nil), s.head.sig) + + return ok, err +} + // Find locates needle within a segment. if it cant find it will return the nearest key before needle. -func (s *segmentReader) Find(needle []byte) (*entryBytes, bool, error) { +func (s *segmentReader) Find(needle []byte, first bool) (*entryBytes, bool, error) { if s == nil { return nil, false, nil } @@ -184,23 +204,27 @@ func (s *segmentReader) Find(needle []byte) (*entryBytes, bool, error) { } last := e + found := false for pos > 0 { key, _ := e.KeyValue() switch bytes.Compare(key, needle) { + case 1: // key=ccc, needle=bbb + return last, found, nil case 0: // equal - return e, true, nil + if first { + return e, true, nil + } + found = true + fallthrough case -1: // key=aaa, needle=bbb last = e e, pos, err = s.readEntryAt(pos) if err != nil { - return nil, false, err + return nil, found, err } - - case 1: // key=ccc, needle=bbb - return last, false, nil } } - return last, false, nil + return last, found, nil } func (s *segmentReader) readEntryAt(pos int64) (*entryBytes, int64, error) { if pos < 0 { @@ -217,7 +241,10 @@ func (s *segmentReader) readEntryAt(pos int64) (*entryBytes, int64, error) { } type logFile struct { - rd interface{io.ReaderAt; io.WriterTo} + rd interface { + io.ReaderAt + io.WriterTo + } segments []segmentReader fs.File @@ -232,7 +259,10 @@ func ReadFile(fd fs.File) (*logFile, error) { } eof := stat.Size() - if rd, ok := fd.(interface{io.ReaderAt; io.WriterTo}); ok { + if rd, ok := fd.(interface { + io.ReaderAt + io.WriterTo + }); ok { l.rd = rd } else { @@ -243,8 +273,8 @@ func ReadFile(fd fs.File) (*logFile, error) { l.rd = bytes.NewReader(rd) } + head := make([]byte, segmentFooterLength) for eof > 0 { - head := make([]byte, segmentFooterLength) _, err = l.rd.ReadAt(head, eof-int64(segmentFooterLength)) if err != nil { return nil, err @@ -285,7 +315,7 @@ func (l *logFile) LoadSegment(pos int64) (*segmentBytes, error) { return &segmentBytes{b, -1}, nil } -func (l *logFile) Find(needle []byte) (*entryBytes, bool, error) { +func (l *logFile) Find(needle []byte, first bool) (*entryBytes, bool, error) { var last segmentReader for _, s := range l.segments { @@ -294,13 +324,16 @@ func (l *logFile) Find(needle []byte) (*entryBytes, bool, error) { return nil, false, err } k, _ := e.KeyValue() - if bytes.Compare(k, needle) > 0 { + if first && bytes.Compare(k, needle) >= 0 { + break + } + if !first && bytes.Compare(k, needle) > 0 { break } last = s } - return last.Find(needle) + return last.Find(needle, first) } func (l *logFile) WriteTo(w io.Writer) (int64, error) { return l.rd.WriteTo(w) diff --git a/lsm/sst_test.go b/lsm/sst_test.go index 839a924..319a09a 100644 --- a/lsm/sst_test.go +++ b/lsm/sst_test.go @@ -1,5 +1,6 @@ // SPDX-FileCopyrightText: 2023 Jon Lundy // SPDX-License-Identifier: BSD-3-Clause + package lsm import ( @@ -39,19 +40,19 @@ func TestLargeFile(t *testing.T) { } t.Log(f.Stat()) - tt, ok, err := sf.Find(needle) + tt, ok, err := sf.Find(needle, false) is.NoErr(err) is.True(ok) key, val := tt.KeyValue() t.Log(string(key), val) - tt, ok, err = sf.Find([]byte("needle")) + tt, ok, err = sf.Find([]byte("needle"), false) is.NoErr(err) is.True(!ok) key, val = tt.KeyValue() t.Log(string(key), val) - tt, ok, err = sf.Find([]byte{'\xff'}) + tt, ok, err = sf.Find([]byte{'\xff'}, false) is.NoErr(err) is.True(!ok) key, val = tt.KeyValue() @@ -85,23 +86,28 @@ func TestLargeFileDisk(t *testing.T) { is.NoErr(err) k, v := e.KeyValue() needle = k - t.Logf("Segment-%d: %s = %d", i, k, v) + + ok, err := s.VerifyHash() + is.NoErr(err) + + t.Logf("Segment-%d: %s = %d %t", i, k, v, ok) + is.True(ok) } t.Log(f.Stat()) - tt, ok, err := sf.Find(needle) + tt, ok, err := sf.Find(needle, false) is.NoErr(err) is.True(ok) key, val := tt.KeyValue() t.Log(string(key), val) - tt, ok, err = sf.Find([]byte("needle")) + tt, ok, err = sf.Find([]byte("needle"), false) is.NoErr(err) is.True(!ok) key, val = tt.KeyValue() t.Log(string(key), val) - tt, ok, err = sf.Find([]byte{'\xff'}) + tt, ok, err = sf.Find([]byte{'\xff'}, false) is.NoErr(err) is.True(!ok) key, val = tt.KeyValue() @@ -133,7 +139,7 @@ func BenchmarkLargeFile(b *testing.B) { if each > 0 && n%each == 0 { b.Log(n) } - _, ok, err := sf.Find(keys[n]) + _, ok, err := sf.Find(keys[n], false) if err != nil { b.Error(err) } @@ -144,40 +150,48 @@ func BenchmarkLargeFile(b *testing.B) { b.Log("okays=", b.N, okays) } -func BenchmarkLargeFileB(b *testing.B) { - segCount := 4098 / 16 - f := randFile(b, 2_000_000, segCount) +// TestFindRange is an initial range find for start and stop of a range of needles. +// TODO: start the second query from where the first left off. Use an iterator? +func TestFindRange(t *testing.T) { + is := is.New(t) + f := basicFile(t, + entries{ + {"AD", 5}, + {"AC", 5}, + {"AB", 4}, + {"AB", 3}, + }, + entries{ + {"AB", 2}, + {"AA", 1}, + }, + ) sf, err := ReadFile(f) - if err != nil { - b.Error(err) - } - key := make([]byte, 5) - keys := make([][]byte, b.N) - for i := range keys { - _, err = crand.Read(key) - if err != nil { - b.Error(err) - } - keys[i] = []byte(base64.RawURLEncoding.EncodeToString(key)) - } - b.Log("ready", b.N) - b.ResetTimer() - okays := 0 - each := b.N / 10 - for n := 0; n < b.N; n++ { - if each > 0 && n%each == 0 { - b.Log(n) - } - _, ok, err := sf.Find(keys[n]) - if err != nil { - b.Error(err) - } - if ok { - okays++ - } - } - b.Log("okays=", b.N, okays) + is.NoErr(err) + + var ok bool + var first, last *entryBytes + + first, ok, err = sf.Find([]byte("AB"), true) + is.NoErr(err) + + key, val := first.KeyValue() + t.Log(string(key), val) + + is.True(ok) + is.Equal(key, []byte("AB")) + is.Equal(val, uint64(2)) + + last, ok, err = sf.Find([]byte("AC"), false) + is.NoErr(err) + + key, val = last.KeyValue() + t.Log(string(key), val) + + is.True(ok) + is.Equal(key, []byte("AC")) + is.Equal(val, uint64(5)) } func randFile(t interface {