// SPDX-FileCopyrightText: 2023 Jon Lundy
// SPDX-License-Identifier: BSD-3-Clause

package lsm

import (
	"bytes"
	crand "crypto/rand"
	"encoding/base64"
	"io"
	"io/fs"
	"math/rand"
	"os"
	"sort"
	"sync"
	"testing"
	"time"

	"github.com/matryer/is"
)

// TestLargeFile reads a generated in-memory file and looks up present and absent keys.
func TestLargeFile(t *testing.T) {
	is := is.New(t)

	segCount := 4098
	f := randFile(t, 2_000_000, segCount)

	sf, err := ReadFile(f)
	is.NoErr(err)
	is.True(len(sf.segments) <= segCount)

	// Log the first entry of each segment; needle ends up holding a known-present key.
	var needle []byte
	for i, s := range sf.segments {
		e, err := s.FirstEntry()
		is.NoErr(err)
		k, v := e.KeyValue()
		needle = k
		t.Logf("Segment-%d: %s = %d", i, k, v)
	}
	t.Log(f.Stat())

	// A key taken from the file must be found.
	tt, ok, err := sf.Find(needle, false)
	is.NoErr(err)
	is.True(ok)
	key, val := tt.KeyValue()
	t.Log(string(key), val)

	// A key that was never inserted must not be found.
	tt, ok, err = sf.Find([]byte("needle"), false)
	is.NoErr(err)
	is.True(!ok)
	key, val = tt.KeyValue()
	t.Log(string(key), val)

	// A key outside the generated key alphabet must not be found.
	tt, ok, err = sf.Find([]byte{'\xff'}, false)
	is.NoErr(err)
	is.True(!ok)
	key, val = tt.KeyValue()
	t.Log(string(key), val)
}

// TestLargeFileDisk is the same scenario as TestLargeFile, but round-trips the
// generated file through a temp file on disk and verifies each segment hash.
func TestLargeFileDisk(t *testing.T) {
	is := is.New(t)

	segCount := 4098

	t.Log("generate large file")
	f := randFile(t, 2_000_000, segCount)

	fd, err := os.CreateTemp("", "sst*")
	is.NoErr(err)
	defer func() { t.Log("cleanup:", fd.Name()); fd.Close(); os.Remove(fd.Name()) }()

	t.Log("write file:", fd.Name())
	_, err = io.Copy(fd, f)
	is.NoErr(err)
	fd.Seek(0, 0)

	sf, err := ReadFile(fd)
	is.NoErr(err)
	is.True(len(sf.segments) <= segCount)

	var needle []byte
	for i, s := range sf.segments {
		e, err := s.FirstEntry()
		is.NoErr(err)
		k, v := e.KeyValue()
		needle = k

		ok, err := s.VerifyHash()
		is.NoErr(err)
		t.Logf("Segment-%d: %s = %d %t", i, k, v, ok)
		is.True(ok)
	}
	t.Log(f.Stat())

	tt, ok, err := sf.Find(needle, false)
	is.NoErr(err)
	is.True(ok)
	key, val := tt.KeyValue()
	t.Log(string(key), val)

	tt, ok, err = sf.Find([]byte("needle"), false)
	is.NoErr(err)
	is.True(!ok)
	key, val = tt.KeyValue()
	t.Log(string(key), val)

	tt, ok, err = sf.Find([]byte{'\xff'}, false)
	is.NoErr(err)
	is.True(!ok)
	key, val = tt.KeyValue()
	t.Log(string(key), val)
}

// BenchmarkLargeFile measures Find over random (mostly absent) keys.
func BenchmarkLargeFile(b *testing.B) {
	segCount := 4098 / 4
	f := randFile(b, 2_000_000, segCount)

	sf, err := ReadFile(f)
	if err != nil {
		b.Error(err)
	}

	// Pre-generate the lookup keys so key generation is excluded from the timing.
	key := make([]byte, 5)
	keys := make([][]byte, b.N)
	for i := range keys {
		_, err = crand.Read(key)
		if err != nil {
			b.Error(err)
		}
		keys[i] = []byte(base64.RawURLEncoding.EncodeToString(key))
	}
	b.Log("ready", b.N)
	b.ResetTimer()

	okays := 0
	each := b.N / 10
	for n := 0; n < b.N; n++ {
		if each > 0 && n%each == 0 {
			b.Log(n)
		}
		_, ok, err := sf.Find(keys[n], false)
		if err != nil {
			b.Error(err)
		}
		if ok {
			okays++
		}
	}
	b.Log("okays=", b.N, okays)
}

// TestFindRange is an initial range find for the start and stop of a range of needles.
// TODO: start the second query from where the first left off. Use an iterator?
func TestFindRange(t *testing.T) {
	is := is.New(t)

	f := basicFile(t,
		entries{
			{"AD", 5},
			{"AC", 5},
			{"AB", 4},
			{"AB", 3},
		},
		entries{
			{"AB", 2},
			{"AA", 1},
		},
	)

	sf, err := ReadFile(f)
	is.NoErr(err)

	var ok bool
	var first, last *entryBytes

	// Start of the range: the entry returned for key "AB".
	first, ok, err = sf.Find([]byte("AB"), true)
	is.NoErr(err)
	key, val := first.KeyValue()
	t.Log(string(key), val)
	is.True(ok)
	is.Equal(key, []byte("AB"))
	is.Equal(val, uint64(2))

	// Stop of the range: the entry returned for key "AC".
	last, ok, err = sf.Find([]byte("AC"), false)
	is.NoErr(err)
	key, val = last.KeyValue()
	t.Log(string(key), val)
	is.True(ok)
	is.Equal(key, []byte("AC"))
	is.Equal(val, uint64(5))
}

// randFile builds an in-memory file of size random key/value entries, sorted with
// sort.Reverse and split across at most the given number of segments.
func randFile(t interface {
	Helper()
	Error(...any)
}, size int, segments int,
) fs.File {
	t.Helper()

	lis := make(listEntries, size)
	for i := range lis {
		key := make([]byte, 5)
		_, err := crand.Read(key)
		if err != nil {
			t.Error(err)
		}
		key = []byte(base64.RawURLEncoding.EncodeToString(key))
		// key := []byte(fmt.Sprintf("key-%05d", i))
		lis[i] = NewKeyValue(key, rand.Uint64()%16_777_216)
	}
	sort.Sort(sort.Reverse(&lis))

	// Split the sorted entries into roughly equal chunks, one per segment.
	each := size / segments
	if size%segments != 0 {
		each++
	}
	split := make([]listEntries, segments)
	for i := range split {
		if (i+1)*each > len(lis) {
			split[i] = lis[i*each:]
			split = split[:i+1]
			break
		}
		split[i] = lis[i*each : (i+1)*each]
	}

	var b bytes.Buffer
	for _, s := range split {
		s.WriteTo(&b)
	}

	return NewFile(b.Bytes())
}

// fakeStat is a minimal fs.FileInfo that only reports a size.
type fakeStat struct {
	size int64
}

// IsDir implements fs.FileInfo.
func (*fakeStat) IsDir() bool { panic("unimplemented") }

// ModTime implements fs.FileInfo.
func (*fakeStat) ModTime() time.Time { panic("unimplemented") }

// Mode implements fs.FileInfo.
func (*fakeStat) Mode() fs.FileMode { panic("unimplemented") }

// Name implements fs.FileInfo.
func (*fakeStat) Name() string { panic("unimplemented") }

// Size implements fs.FileInfo.
func (s *fakeStat) Size() int64 { return s.size }

// Sys implements fs.FileInfo.
func (*fakeStat) Sys() any { panic("unimplemented") }

var _ fs.FileInfo = (*fakeStat)(nil)

// rd combines the reader interfaces needed by fakeFile.
type rd interface {
	io.ReaderAt
	io.Reader
}

// fakeFile adapts an in-memory reader into an fs.File for tests.
type fakeFile struct {
	stat func() fs.FileInfo

	rd
}

func (fakeFile) Close() error { return nil }

func (f fakeFile) Stat() (fs.FileInfo, error) { return f.stat(), nil }

// NewFile joins the given byte slices into a single in-memory fs.File.
func NewFile(b ...[]byte) fs.File {
	in := bytes.Join(b, nil)
	rd := bytes.NewReader(in)
	size := int64(len(in))

	return &fakeFile{
		stat: func() fs.FileInfo { return &fakeStat{size: size} },
		rd:   rd,
	}
}

// NewFileFromReader wraps an existing bytes.Reader as an fs.File.
func NewFileFromReader(rd *bytes.Reader) fs.File {
	return &fakeFile{
		stat: func() fs.FileInfo { return &fakeStat{size: int64(rd.Len())} },
		rd:   rd,
	}
}

// fakeFS is an in-memory fs.FS keyed by file name.
type fakeFS struct {
	files map[string]*fakeFile
	mu    sync.RWMutex
}

// Open implements fs.FS.
func (f *fakeFS) Open(name string) (fs.File, error) {
	f.mu.RLock()
	defer f.mu.RUnlock()

	if file, ok := f.files[name]; ok {
		return file, nil
	}

	return nil, fs.ErrNotExist
}

var _ fs.FS = (*fakeFS)(nil)
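
// TestFakeFSOpen is a minimal usage sketch of the fakeFS, fakeFile, and NewFile
// helpers above: it registers an in-memory file under a name, opens it through the
// fs.FS interface, and checks the reported size. The test name and the file names
// used here are illustrative only; it exercises no functionality beyond the helpers
// already defined in this file.
func TestFakeFSOpen(t *testing.T) {
	is := is.New(t)

	// Build an in-memory file and register it in a fakeFS under an arbitrary name.
	file, ok := NewFile([]byte("hello")).(*fakeFile)
	is.True(ok)
	fsys := &fakeFS{files: map[string]*fakeFile{"greeting.sst": file}}

	// Opening a registered name returns the file; Stat reports the in-memory size.
	f, err := fsys.Open("greeting.sst")
	is.NoErr(err)
	st, err := f.Stat()
	is.NoErr(err)
	is.Equal(st.Size(), int64(len("hello")))

	// Opening an unregistered name reports fs.ErrNotExist.
	_, err = fsys.Open("missing.sst")
	is.Equal(err, fs.ErrNotExist)
}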