chore(lsm): add initial range search

This commit is contained in:
xuu 2023-10-28 19:40:29 -06:00
parent ddd21b39a6
commit 59eaef2ae3
Signed by: xuu
GPG Key ID: 8B3B0604F164E04F
3 changed files with 123 additions and 71 deletions

View File

@ -44,7 +44,14 @@ func TestReverse(t *testing.T) {
func TestFile(t *testing.T) {
is := is.New(t)
f := basicFile(t)
entries := entries {
{"key-1", 1},
{"key-2", 2},
{"key-3", 3},
{"longerkey-4", 65535},
}
f := basicFile(t, entries, entries, entries)
sf, err := ReadFile(f)
is.NoErr(err)
@ -52,20 +59,18 @@ func TestFile(t *testing.T) {
is.Equal(len(sf.segments), 3)
}
func basicFile(t *testing.T) fs.File {
func basicFile(t *testing.T, lis ...entries) fs.File {
t.Helper()
data := segment{entries: entries{
{"key-1", 1},
{"key-2", 2},
{"key-3", 3},
{"longerkey-4", 65535},
}}
b, err := data.MarshalBinary()
if err != nil {
t.Error(err)
segments := make([][]byte, len(lis))
var err error
for i, entries := range lis {
data := segment{entries: entries}
segments[i], err = data.MarshalBinary()
if err != nil {
t.Error(err)
}
}
return NewFile(b, b, b)
return NewFile(segments...)
}

View File

@ -1,5 +1,12 @@
// SPDX-FileCopyrightText: 2023 Jon Lundy <jon@xuu.cc>
// SPDX-License-Identifier: BSD-3-Clause
// lsm -- Log Structured Merge-Tree
//
// This is a basic LSM tree using an SSTable optimized for append-only writing. On disk, data is organized into time-ordered
// files of segments, containing reverse-sorted keys. Each segment ends with a magic value `Souris\x01`, a 4-byte hash, a count of
// segment entries, and the data length.
package lsm
import (
@ -23,11 +30,11 @@ var (
)
type header struct {
sig []byte
entries uint64
datalen uint64
headlen uint64
end int64
sig []byte // 4Byte signature
entries uint64 // count of entries in segment
datalen uint64 // length of data
headlen uint64 // length of header
end int64 // location of end of data/start of header (start of data is `end - datalen`)
}
// ReadHead parses the header from a segment. It reads from the end of the slice, spanning segmentFooterLength bytes.
@ -173,8 +180,21 @@ func (s *segmentReader) FirstEntry() (*entryBytes, error) {
return e, err
}
func (s *segmentReader) VerifyHash() (bool, error) {
h := hash()
data := make([]byte, s.head.datalen)
_, err := s.rd.ReadAt(data, s.head.end-int64(s.head.datalen))
if err != nil {
return false, err
}
_, err = h.Write(data)
ok := bytes.Equal(h.Sum(nil), s.head.sig)
return ok, err
}
// Find locates needle within a segment. If the needle cannot be found, it returns the nearest key before needle.
func (s *segmentReader) Find(needle []byte) (*entryBytes, bool, error) {
func (s *segmentReader) Find(needle []byte, first bool) (*entryBytes, bool, error) {
if s == nil {
return nil, false, nil
}
@ -184,23 +204,27 @@ func (s *segmentReader) Find(needle []byte) (*entryBytes, bool, error) {
}
last := e
found := false
for pos > 0 {
key, _ := e.KeyValue()
switch bytes.Compare(key, needle) {
case 1: // key=ccc, needle=bbb
return last, found, nil
case 0: // equal
return e, true, nil
if first {
return e, true, nil
}
found = true
fallthrough
case -1: // key=aaa, needle=bbb
last = e
e, pos, err = s.readEntryAt(pos)
if err != nil {
return nil, false, err
return nil, found, err
}
case 1: // key=ccc, needle=bbb
return last, false, nil
}
}
return last, false, nil
return last, found, nil
}
func (s *segmentReader) readEntryAt(pos int64) (*entryBytes, int64, error) {
if pos < 0 {
@ -217,7 +241,10 @@ func (s *segmentReader) readEntryAt(pos int64) (*entryBytes, int64, error) {
}
type logFile struct {
rd interface{io.ReaderAt; io.WriterTo}
rd interface {
io.ReaderAt
io.WriterTo
}
segments []segmentReader
fs.File
@ -232,7 +259,10 @@ func ReadFile(fd fs.File) (*logFile, error) {
}
eof := stat.Size()
if rd, ok := fd.(interface{io.ReaderAt; io.WriterTo}); ok {
if rd, ok := fd.(interface {
io.ReaderAt
io.WriterTo
}); ok {
l.rd = rd
} else {
@ -243,8 +273,8 @@ func ReadFile(fd fs.File) (*logFile, error) {
l.rd = bytes.NewReader(rd)
}
head := make([]byte, segmentFooterLength)
for eof > 0 {
head := make([]byte, segmentFooterLength)
_, err = l.rd.ReadAt(head, eof-int64(segmentFooterLength))
if err != nil {
return nil, err
@ -285,7 +315,7 @@ func (l *logFile) LoadSegment(pos int64) (*segmentBytes, error) {
return &segmentBytes{b, -1}, nil
}
func (l *logFile) Find(needle []byte) (*entryBytes, bool, error) {
func (l *logFile) Find(needle []byte, first bool) (*entryBytes, bool, error) {
var last segmentReader
for _, s := range l.segments {
@ -294,13 +324,16 @@ func (l *logFile) Find(needle []byte) (*entryBytes, bool, error) {
return nil, false, err
}
k, _ := e.KeyValue()
if bytes.Compare(k, needle) > 0 {
if first && bytes.Compare(k, needle) >= 0 {
break
}
if !first && bytes.Compare(k, needle) > 0 {
break
}
last = s
}
return last.Find(needle)
return last.Find(needle, first)
}
func (l *logFile) WriteTo(w io.Writer) (int64, error) {
return l.rd.WriteTo(w)

View File

@ -1,5 +1,6 @@
// SPDX-FileCopyrightText: 2023 Jon Lundy <jon@xuu.cc>
// SPDX-License-Identifier: BSD-3-Clause
package lsm
import (
@ -39,19 +40,19 @@ func TestLargeFile(t *testing.T) {
}
t.Log(f.Stat())
tt, ok, err := sf.Find(needle)
tt, ok, err := sf.Find(needle, false)
is.NoErr(err)
is.True(ok)
key, val := tt.KeyValue()
t.Log(string(key), val)
tt, ok, err = sf.Find([]byte("needle"))
tt, ok, err = sf.Find([]byte("needle"), false)
is.NoErr(err)
is.True(!ok)
key, val = tt.KeyValue()
t.Log(string(key), val)
tt, ok, err = sf.Find([]byte{'\xff'})
tt, ok, err = sf.Find([]byte{'\xff'}, false)
is.NoErr(err)
is.True(!ok)
key, val = tt.KeyValue()
@ -85,23 +86,28 @@ func TestLargeFileDisk(t *testing.T) {
is.NoErr(err)
k, v := e.KeyValue()
needle = k
t.Logf("Segment-%d: %s = %d", i, k, v)
ok, err := s.VerifyHash()
is.NoErr(err)
t.Logf("Segment-%d: %s = %d %t", i, k, v, ok)
is.True(ok)
}
t.Log(f.Stat())
tt, ok, err := sf.Find(needle)
tt, ok, err := sf.Find(needle, false)
is.NoErr(err)
is.True(ok)
key, val := tt.KeyValue()
t.Log(string(key), val)
tt, ok, err = sf.Find([]byte("needle"))
tt, ok, err = sf.Find([]byte("needle"), false)
is.NoErr(err)
is.True(!ok)
key, val = tt.KeyValue()
t.Log(string(key), val)
tt, ok, err = sf.Find([]byte{'\xff'})
tt, ok, err = sf.Find([]byte{'\xff'}, false)
is.NoErr(err)
is.True(!ok)
key, val = tt.KeyValue()
@ -133,7 +139,7 @@ func BenchmarkLargeFile(b *testing.B) {
if each > 0 && n%each == 0 {
b.Log(n)
}
_, ok, err := sf.Find(keys[n])
_, ok, err := sf.Find(keys[n], false)
if err != nil {
b.Error(err)
}
@ -144,40 +150,48 @@ func BenchmarkLargeFile(b *testing.B) {
b.Log("okays=", b.N, okays)
}
func BenchmarkLargeFileB(b *testing.B) {
segCount := 4098 / 16
f := randFile(b, 2_000_000, segCount)
// TestFindRange is an initial range find for start and stop of a range of needles.
// TODO: start the second query from where the first left off. Use an iterator?
func TestFindRange(t *testing.T) {
is := is.New(t)
f := basicFile(t,
entries{
{"AD", 5},
{"AC", 5},
{"AB", 4},
{"AB", 3},
},
entries{
{"AB", 2},
{"AA", 1},
},
)
sf, err := ReadFile(f)
if err != nil {
b.Error(err)
}
key := make([]byte, 5)
keys := make([][]byte, b.N)
for i := range keys {
_, err = crand.Read(key)
if err != nil {
b.Error(err)
}
keys[i] = []byte(base64.RawURLEncoding.EncodeToString(key))
}
b.Log("ready", b.N)
b.ResetTimer()
okays := 0
each := b.N / 10
for n := 0; n < b.N; n++ {
if each > 0 && n%each == 0 {
b.Log(n)
}
_, ok, err := sf.Find(keys[n])
if err != nil {
b.Error(err)
}
if ok {
okays++
}
}
b.Log("okays=", b.N, okays)
is.NoErr(err)
var ok bool
var first, last *entryBytes
first, ok, err = sf.Find([]byte("AB"), true)
is.NoErr(err)
key, val := first.KeyValue()
t.Log(string(key), val)
is.True(ok)
is.Equal(key, []byte("AB"))
is.Equal(val, uint64(2))
last, ok, err = sf.Find([]byte("AC"), false)
is.NoErr(err)
key, val = last.KeyValue()
t.Log(string(key), val)
is.True(ok)
is.Equal(key, []byte("AC"))
is.Equal(val, uint64(5))
}
func randFile(t interface {