golang
diff --git a/gopls/internal/bloom/filter.go
Lines changed: 105 additions & 0 deletions b/gopls/internal/bloom/filter.go
Lines changed: 105 additions & 0 deletions
diff --git a/gopls/internal/bloom/filter_test.go
Lines changed: 93 additions & 0 deletions b/gopls/internal/bloom/filter_test.go
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,105 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bloom
+
+import (
+	"hash/maphash"
+	"math"
+)
+
+// block is the element type of the filter bitfield. It is an alias for
+// uint8, so each block holds blockBits bits.
+type block = uint8
+
+// blockBits is the number of bits in one block.
+const blockBits = 8
+
+// Filter is a bloom filter for a set of strings.
+//
+// Membership is tested with MayContain, which hashes the string once per
+// seed and checks the corresponding bit in blocks.
+type Filter struct {
+	seeds  []maphash.Seed // one seed per hash function
+	blocks []block        // bitfield; bits are set by NewFilter
+}
+
+// NewFilter constructs a new Filter with the given elements.
+func NewFilter(elems []string) *Filter {
+	// Tolerate a 5% false positive rate.
+	nblocks, nseeds := calibrate(0.05, len(elems))
+	f := &Filter{
+		blocks: make([]block, nblocks),
+		seeds:  make([]maphash.Seed, nseeds),
+	}
+	for i := range nseeds {
+		f.seeds[i] = maphash.MakeSeed()
+	}
+	for _, elem := range elems {
+		for _, seed := range f.seeds {
+			index, bit := f.locate(seed, elem)
+			f.blocks[index] |= bit
+		}
+	}
+	return f
+}
+
+// locate hashes s with the given seed and maps the hash to a position in the
+// bitfield: the low bits of the hash (hash % blockBits) select the bit within
+// a block, and the remaining bits select the block, modulo the number of
+// blocks. It returns the block index and a mask with only that bit set.
+func (f *Filter) locate(seed maphash.Seed, s string) (index int, bit block) {
+	sum := uint(maphash.String(seed, s))
+	nblocks := uint(len(f.blocks))
+
+	index = int((sum / blockBits) % nblocks)
+	bit = block(1) << (sum % blockBits)
+	return index, bit
+}
+
+// assert panics with msg unless cond holds.
+func assert(cond bool, msg string) {
+	if cond {
+		return
+	}
+	panic(msg)
+}
+
+// calibrate approximates the number of blocks and seeds to use for a bloom
+// filter with desired false positive rate fpRate, given n elements.
+//
+// It panics if fpRate is not strictly between 0 and 1, or if n is negative.
+func calibrate(fpRate float64, n int) (blocks, seeds int) {
+	// We follow the terms of https://en.wikipedia.org/wiki/Bloom_filter:
+	// - k is the number of hash functions;
+	// - m is the size of the bit field;
+	// - n is the number of elements in the set.
+
+	assert(0 < fpRate && fpRate < 1, "invalid false positive rate")
+	assert(n >= 0, "invalid set size")
+
+	// Degenerate case: with no elements, the simplest possible filter will do.
+	if n == 0 {
+		return 1, 1
+	}
+
+	lnRate := math.Log(fpRate) // shared by the formulas for m and k
+
+	// The optimal bit count is m = -n*ln(fpRate) / (ln 2)^2. Round the block
+	// count up, since more bits lead to fewer false positives.
+	m := -(float64(n) * lnRate) / (math.Ln2 * math.Ln2)
+	blocks = int(math.Ceil(m / blockBits))
+
+	// The optimal number of hash functions (=seeds) is k = -ln(fpRate) / ln 2.
+	// This is only an estimate, not least because the formula assumes the
+	// number of bits per element is not rounded.
+	//
+	// Round to the nearest integer rather than always up, since more hash
+	// functions do not always lead to better results; but use at least one.
+	k := -lnRate / math.Ln2
+	seeds = int(math.Round(k))
+	if seeds < 1 {
+		seeds = 1
+	}
+
+	return blocks, seeds
+}
+
+// MayContain reports whether the filter may contain s.
+//
+// It returns false as soon as any seed maps s to a bit that is not set, and
+// true only if every seed's bit is set.
+func (f *Filter) MayContain(s string) bool {
+	for i := range f.seeds {
+		idx, mask := f.locate(f.seeds[i], s)
+		if f.blocks[idx]&mask == 0 {
+			return false
+		}
+	}
+	return true
+}
@@ -0,0 +1,93 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bloom
+
+import (
+	"math"
+	"math/rand/v2"
+	"testing"
+)
+
+// TestFilter checks that a filter reports every element it was built from,
+// and logs the expected and measured false positive rates.
+func TestFilter(t *testing.T) {
+	elems := []string{
+		"a", "apple", "b", "banana", "an arbitrarily long string", "", "世界",
+	}
+	f := NewFilter(elems)
+
+	// Sanity check that the filter contains all the given elements, and
+	// remember them so that true positives can be excluded below.
+	members := make(map[string]bool, len(elems))
+	for _, elem := range elems {
+		members[elem] = true
+		if got := f.MayContain(elem); !got {
+			t.Errorf("MayContain(%q) = %t, want true", elem, got)
+		}
+	}
+
+	// Measure the false positive rate.
+	//
+	// The results are probabilistic, so nothing is asserted on them; they are
+	// logged for interactive use.
+	nblocks, nseeds := len(f.blocks), len(f.seeds)
+	expected := falsePositiveRate(nblocks, nseeds, len(elems))
+	t.Logf("%d blocks, %d seeds, %.2g%% expected false positives", nblocks, nseeds, 100*expected)
+
+	const (
+		samples = 1000
+		verbose = false // set to true to log every sample
+	)
+	gen := newStringGenerator()
+	falsePositives := 0
+	for i := 0; i < samples; i++ {
+		s := gen.next()
+		got := f.MayContain(s)
+		if verbose {
+			t.Logf("MayContain(%q) = %t", s, got)
+		}
+		// In practice every positive here is false, but be precise anyway.
+		if !got || members[s] {
+			continue
+		}
+		falsePositives++
+	}
+	t.Logf("false positives: %.1f%% (%d/%d)", 100*float64(falsePositives)/float64(samples), falsePositives, samples)
+}
+
+// falsePositiveRate estimates the expected false positive rate for a filter
+// with the given number of blocks, seeds, and elements.
+func falsePositiveRate(block, seeds, elems int) float64 {
+	k, m, n := float64(seeds), float64(block*blockBits), float64(elems)
+	return math.Pow(1-math.Exp(-k*n/m), k)
+}
+
+// stringGenerator produces pseudo-random strings for the tests in this file.
+type stringGenerator struct {
+	r *rand.Rand // random source, used by next to pick the string length
+}
+
+// newStringGenerator returns a stringGenerator backed by a PCG source with
+// the fixed seed (1, 2).
+func newStringGenerator() *stringGenerator {
+	src := rand.NewPCG(1, 2)
+	return &stringGenerator{r: rand.New(src)}
+}
+
+func (g *stringGenerator) next() string {
+	l := g.r.IntN(50) // length
+	var runes []rune
+	for range l {
+		runes = append(runes, rune(' '+rand.IntN('~'-' ')))
+	}
+	return string(runes)
+}
+
+// TestDegenerateFilter checks that the degenerate filter with no elements
+// results in no false positives.
+func TestDegenerateFilter(t *testing.T) {
+	f := NewFilter(nil)
+	g := newStringGenerator()
+	for range 100 {
+		s := g.next()
+		if f.MayContain(s) {
+			t.Errorf("MayContain(%q) = true, want false", s)
+		}
+	}
+}