chore(lsm): add initial range search

xuu 2023-10-28 19:40:29 -06:00
parent ddd21b39a6
commit 59eaef2ae3
Signed by: xuu
GPG Key ID: 8B3B0604F164E04F
3 changed files with 123 additions and 71 deletions
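The change that matters here is the new first bool flag threaded through segmentReader.Find and logFile.Find below: calling Find once with first=true and once with first=false brackets a key range, which is what the new TestFindRange exercises. A rough, non-authoritative sketch of that usage follows; it reuses the package's own test helpers (basicFile, entries, ReadFile), so it would sit in the package's _test files, and the expected values mirror the TestFindRange fixture.

// rangeLookupSketch is illustrative only (not part of this commit): it shows how the
// new first flag is meant to bracket a range of keys. Identifiers (basicFile, entries,
// ReadFile, Find, KeyValue) are the package's own; expected values mirror TestFindRange.
func rangeLookupSketch(t *testing.T) {
	t.Helper()

	f := basicFile(t,
		entries{{"AD", 5}, {"AC", 5}, {"AB", 4}, {"AB", 3}},
		entries{{"AB", 2}, {"AA", 1}},
	)
	sf, err := ReadFile(f)
	if err != nil {
		t.Fatal(err)
	}

	// Lower bound: first=true; the fixture's start-of-range entry is AB = 2.
	start, ok, err := sf.Find([]byte("AB"), true)
	if err != nil || !ok {
		t.Fatal("expected an exact match for the lower bound")
	}

	// Upper bound: first=false; the last entry at or before the needle, here AC = 5.
	stop, ok, err := sf.Find([]byte("AC"), false)
	if err != nil || !ok {
		t.Fatal("expected an exact match for the upper bound")
	}

	k1, v1 := start.KeyValue()
	k2, v2 := stop.KeyValue()
	t.Logf("range spans %s=%d .. %s=%d", k1, v1, k2, v2)
}

Iterating the entries between the two bounds is still open; the TODO on the new test suggests an iterator that resumes where the first lookup left off.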

View File

@@ -44,7 +44,14 @@ func TestReverse(t *testing.T) {
 func TestFile(t *testing.T) {
 	is := is.New(t)
-	f := basicFile(t)
+	entries := entries{
+		{"key-1", 1},
+		{"key-2", 2},
+		{"key-3", 3},
+		{"longerkey-4", 65535},
+	}
+	f := basicFile(t, entries, entries, entries)
 
 	sf, err := ReadFile(f)
 	is.NoErr(err)
@@ -52,20 +59,18 @@ func TestFile(t *testing.T) {
 	is.Equal(len(sf.segments), 3)
 }
 
-func basicFile(t *testing.T) fs.File {
+func basicFile(t *testing.T, lis ...entries) fs.File {
 	t.Helper()
-	data := segment{entries: entries{
-		{"key-1", 1},
-		{"key-2", 2},
-		{"key-3", 3},
-		{"longerkey-4", 65535},
-	}}
-
-	b, err := data.MarshalBinary()
-	if err != nil {
-		t.Error(err)
+	segments := make([][]byte, len(lis))
+	var err error
+	for i, entries := range lis {
+		data := segment{entries: entries}
+		segments[i], err = data.MarshalBinary()
+		if err != nil {
+			t.Error(err)
+		}
 	}
-	return NewFile(b, b, b)
+	return NewFile(segments...)
 }

View File

@@ -1,5 +1,12 @@
 // SPDX-FileCopyrightText: 2023 Jon Lundy <jon@xuu.cc>
 // SPDX-License-Identifier: BSD-3-Clause
+
+// lsm -- Log Structured Merge-Tree
+//
+// This is a basic LSM tree using a SSTable optimized for append only writing. On disk data is organized into time ordered
+// files of segments, containing reverse sorted keys. Each segment ends with a magic value `Souris\x01`, a 4byte hash, count of
+// segment entries, and data length.
 package lsm
 
 import (
@@ -23,11 +30,11 @@
 )
 
 type header struct {
-	sig     []byte
-	entries uint64
-	datalen uint64
-	headlen uint64
-	end     int64
+	sig     []byte // 4Byte signature
+	entries uint64 // count of entries in segment
+	datalen uint64 // length of data
+	headlen uint64 // length of header
+	end     int64  // location of end of data/start of header (start of data is `end - datalen`)
 }
 
 // ReadHead parse header from a segment. reads from the end of slice of length segmentFooterLength
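The new package comment and the annotated header above describe how each segment is framed on disk: the entry data comes first, and a fixed-length footer (magic value, hash, entry count, data length) closes the segment. The sketch below, not part of this commit, illustrates that round trip using the package's unexported segment and entries types and the NewFile/ReadFile helpers exercised by the tests, so it belongs in the package's _test files.

// buildAndReopen is a sketch of the layout described above: each MarshalBinary call
// emits one self-describing segment ending in its footer, NewFile concatenates the
// segments, and ReadFile rediscovers them by walking the footers backward from the
// end of the file.
func buildAndReopen(t *testing.T) {
	t.Helper()

	// Keys inside a segment are kept reverse sorted, as in the tests.
	segA := segment{entries: entries{{"key-2", 2}, {"key-1", 1}}}
	segB := segment{entries: entries{{"key-9", 9}, {"key-3", 3}}}

	a, err := segA.MarshalBinary()
	if err != nil {
		t.Fatal(err)
	}
	b, err := segB.MarshalBinary()
	if err != nil {
		t.Fatal(err)
	}

	sf, err := ReadFile(NewFile(a, b))
	if err != nil {
		t.Fatal(err)
	}
	t.Log("segments found:", len(sf.segments)) // expect 2
}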
@@ -173,8 +180,21 @@ func (s *segmentReader) FirstEntry() (*entryBytes, error) {
 	return e, err
 }
 
+func (s *segmentReader) VerifyHash() (bool, error) {
+	h := hash()
+	data := make([]byte, s.head.datalen)
+	_, err := s.rd.ReadAt(data, s.head.end-int64(s.head.datalen))
+	if err != nil {
+		return false, err
+	}
+	_, err = h.Write(data)
+	ok := bytes.Equal(h.Sum(nil), s.head.sig)
+
+	return ok, err
+}
+
 // Find locates needle within a segment. if it cant find it will return the nearest key before needle.
-func (s *segmentReader) Find(needle []byte) (*entryBytes, bool, error) {
+func (s *segmentReader) Find(needle []byte, first bool) (*entryBytes, bool, error) {
 	if s == nil {
 		return nil, false, nil
 	}
@@ -184,23 +204,27 @@ func (s *segmentReader) Find(needle []byte) (*entryBytes, bool, error) {
 	}
 	last := e
-
+	found := false
 	for pos > 0 {
 		key, _ := e.KeyValue()
 		switch bytes.Compare(key, needle) {
+		case 1: // key=ccc, needle=bbb
+			return last, found, nil
 		case 0: // equal
-			return e, true, nil
+			if first {
+				return e, true, nil
+			}
+			found = true
+			fallthrough
 		case -1: // key=aaa, needle=bbb
 			last = e
 			e, pos, err = s.readEntryAt(pos)
 			if err != nil {
-				return nil, false, err
+				return nil, found, err
 			}
-		case 1: // key=ccc, needle=bbb
-			return last, false, nil
 		}
 	}
-	return last, false, nil
+	return last, found, nil
 }
 
 func (s *segmentReader) readEntryAt(pos int64) (*entryBytes, int64, error) {
 	if pos < 0 {
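Two additions close out the segment reader: VerifyHash re-reads the segment's data region (the datalen bytes ending at head.end) and compares its hash against the signature stored in the footer, and Find now carries the first flag used above. The on-disk test further down calls VerifyHash once per segment; pulled out on its own, that check looks roughly like the sketch below, which, like the tests, relies on the package's unexported segments field.

// verifyAllSegments is a sketch of the per-segment hash check exercised by
// TestLargeFileDisk below; it assumes it sits in this package's _test files.
func verifyAllSegments(t *testing.T, sf *logFile) {
	t.Helper()
	for i := range sf.segments {
		ok, err := sf.segments[i].VerifyHash()
		if err != nil {
			t.Fatalf("segment %d: %v", i, err)
		}
		if !ok {
			t.Errorf("segment %d: hash does not match footer signature", i)
		}
	}
}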
@@ -217,7 +241,10 @@ func (s *segmentReader) readEntryAt(pos int64) (*entryBytes, int64, error) {
 }
 
 type logFile struct {
-	rd       interface{io.ReaderAt; io.WriterTo}
+	rd interface {
+		io.ReaderAt
+		io.WriterTo
+	}
 	segments []segmentReader
 
 	fs.File
@@ -232,7 +259,10 @@ func ReadFile(fd fs.File) (*logFile, error) {
 	}
 	eof := stat.Size()
 
-	if rd, ok := fd.(interface{io.ReaderAt; io.WriterTo}); ok {
+	if rd, ok := fd.(interface {
+		io.ReaderAt
+		io.WriterTo
+	}); ok {
 		l.rd = rd
 	} else {
@@ -243,8 +273,8 @@
 		l.rd = bytes.NewReader(rd)
 	}
 
-	head := make([]byte, segmentFooterLength)
 	for eof > 0 {
+		head := make([]byte, segmentFooterLength)
 		_, err = l.rd.ReadAt(head, eof-int64(segmentFooterLength))
 		if err != nil {
 			return nil, err
@@ -285,7 +315,7 @@ func (l *logFile) LoadSegment(pos int64) (*segmentBytes, error) {
 	return &segmentBytes{b, -1}, nil
 }
 
-func (l *logFile) Find(needle []byte) (*entryBytes, bool, error) {
+func (l *logFile) Find(needle []byte, first bool) (*entryBytes, bool, error) {
 	var last segmentReader
 
 	for _, s := range l.segments {
@@ -294,13 +324,16 @@ func (l *logFile) Find(needle []byte) (*entryBytes, bool, error) {
 			return nil, false, err
 		}
 		k, _ := e.KeyValue()
-		if bytes.Compare(k, needle) > 0 {
+		if first && bytes.Compare(k, needle) >= 0 {
+			break
+		}
+		if !first && bytes.Compare(k, needle) > 0 {
 			break
 		}
 		last = s
 	}
-	return last.Find(needle)
+	return last.Find(needle, first)
 }
 
 func (l *logFile) WriteTo(w io.Writer) (int64, error) {
 	return l.rd.WriteTo(w)

View File

@@ -1,5 +1,6 @@
 // SPDX-FileCopyrightText: 2023 Jon Lundy <jon@xuu.cc>
 // SPDX-License-Identifier: BSD-3-Clause
+
 package lsm
 
 import (
@@ -39,19 +40,19 @@ func TestLargeFile(t *testing.T) {
 	}
 	t.Log(f.Stat())
 
-	tt, ok, err := sf.Find(needle)
+	tt, ok, err := sf.Find(needle, false)
 	is.NoErr(err)
 	is.True(ok)
 	key, val := tt.KeyValue()
 	t.Log(string(key), val)
 
-	tt, ok, err = sf.Find([]byte("needle"))
+	tt, ok, err = sf.Find([]byte("needle"), false)
 	is.NoErr(err)
 	is.True(!ok)
 	key, val = tt.KeyValue()
 	t.Log(string(key), val)
 
-	tt, ok, err = sf.Find([]byte{'\xff'})
+	tt, ok, err = sf.Find([]byte{'\xff'}, false)
 	is.NoErr(err)
 	is.True(!ok)
 	key, val = tt.KeyValue()
@@ -85,23 +86,28 @@ func TestLargeFileDisk(t *testing.T) {
 		is.NoErr(err)
 		k, v := e.KeyValue()
 		needle = k
-		t.Logf("Segment-%d: %s = %d", i, k, v)
+
+		ok, err := s.VerifyHash()
+		is.NoErr(err)
+		t.Logf("Segment-%d: %s = %d %t", i, k, v, ok)
+		is.True(ok)
 	}
 	t.Log(f.Stat())
 
-	tt, ok, err := sf.Find(needle)
+	tt, ok, err := sf.Find(needle, false)
 	is.NoErr(err)
 	is.True(ok)
 	key, val := tt.KeyValue()
 	t.Log(string(key), val)
 
-	tt, ok, err = sf.Find([]byte("needle"))
+	tt, ok, err = sf.Find([]byte("needle"), false)
 	is.NoErr(err)
 	is.True(!ok)
 	key, val = tt.KeyValue()
 	t.Log(string(key), val)
 
-	tt, ok, err = sf.Find([]byte{'\xff'})
+	tt, ok, err = sf.Find([]byte{'\xff'}, false)
 	is.NoErr(err)
 	is.True(!ok)
 	key, val = tt.KeyValue()
@@ -133,7 +139,7 @@ func BenchmarkLargeFile(b *testing.B) {
 		if each > 0 && n%each == 0 {
 			b.Log(n)
 		}
-		_, ok, err := sf.Find(keys[n])
+		_, ok, err := sf.Find(keys[n], false)
 		if err != nil {
 			b.Error(err)
 		}
@@ -144,40 +150,48 @@
 	b.Log("okays=", b.N, okays)
 }
 
-func BenchmarkLargeFileB(b *testing.B) {
-	segCount := 4098 / 16
-	f := randFile(b, 2_000_000, segCount)
+// TestFindRange is an initial range find for start and stop of a range of needles.
+// TODO: start the second query from where the first left off. Use an iterator?
+func TestFindRange(t *testing.T) {
+	is := is.New(t)
+	f := basicFile(t,
+		entries{
+			{"AD", 5},
+			{"AC", 5},
+			{"AB", 4},
+			{"AB", 3},
+		},
+		entries{
+			{"AB", 2},
+			{"AA", 1},
+		},
+	)
 
 	sf, err := ReadFile(f)
-	if err != nil {
-		b.Error(err)
-	}
-	key := make([]byte, 5)
-	keys := make([][]byte, b.N)
-	for i := range keys {
-		_, err = crand.Read(key)
-		if err != nil {
-			b.Error(err)
-		}
-		keys[i] = []byte(base64.RawURLEncoding.EncodeToString(key))
-	}
-	b.Log("ready", b.N)
-	b.ResetTimer()
-	okays := 0
-	each := b.N / 10
-	for n := 0; n < b.N; n++ {
-		if each > 0 && n%each == 0 {
-			b.Log(n)
-		}
-		_, ok, err := sf.Find(keys[n])
-		if err != nil {
-			b.Error(err)
-		}
-		if ok {
-			okays++
-		}
-	}
-	b.Log("okays=", b.N, okays)
+	is.NoErr(err)
+
+	var ok bool
+	var first, last *entryBytes
+
+	first, ok, err = sf.Find([]byte("AB"), true)
+	is.NoErr(err)
+
+	key, val := first.KeyValue()
+	t.Log(string(key), val)
+
+	is.True(ok)
+	is.Equal(key, []byte("AB"))
+	is.Equal(val, uint64(2))
+
+	last, ok, err = sf.Find([]byte("AC"), false)
+	is.NoErr(err)
+
+	key, val = last.KeyValue()
+	t.Log(string(key), val)
+
+	is.True(ok)
+	is.Equal(key, []byte("AC"))
+	is.Equal(val, uint64(5))
 }
 
 func randFile(t interface {