Initial commit: Go 1.23 release state
This commit is contained in:
22
src/index/suffixarray/example_test.go
Normal file
22
src/index/suffixarray/example_test.go
Normal file
@@ -0,0 +1,22 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package suffixarray_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"index/suffixarray"
|
||||
)
|
||||
|
||||
func ExampleIndex_Lookup() {
|
||||
index := suffixarray.New([]byte("banana"))
|
||||
offsets := index.Lookup([]byte("ana"), -1)
|
||||
for _, off := range offsets {
|
||||
fmt.Println(off)
|
||||
}
|
||||
|
||||
// Unordered output:
|
||||
// 1
|
||||
// 3
|
||||
}
|
||||
92
src/index/suffixarray/gen.go
Normal file
92
src/index/suffixarray/gen.go
Normal file
@@ -0,0 +1,92 @@
|
||||
// Copyright 2019 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build ignore
|
||||
|
||||
// Gen generates sais2.go by duplicating functions in sais.go
|
||||
// using different input types.
|
||||
// See the comment at the top of sais.go for details.
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
log.SetPrefix("gen: ")
|
||||
log.SetFlags(0)
|
||||
|
||||
data, err := os.ReadFile("sais.go")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
x := bytes.Index(data, []byte("\n\n"))
|
||||
if x < 0 {
|
||||
log.Fatal("cannot find blank line after copyright comment")
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
buf.Write(data[:x])
|
||||
buf.WriteString("\n\n// Code generated by go generate; DO NOT EDIT.\n\npackage suffixarray\n")
|
||||
|
||||
for {
|
||||
x := bytes.Index(data, []byte("\nfunc "))
|
||||
if x < 0 {
|
||||
break
|
||||
}
|
||||
data = data[x:]
|
||||
p := bytes.IndexByte(data, '(')
|
||||
if p < 0 {
|
||||
p = len(data)
|
||||
}
|
||||
name := string(data[len("\nfunc "):p])
|
||||
|
||||
x = bytes.Index(data, []byte("\n}\n"))
|
||||
if x < 0 {
|
||||
log.Fatalf("cannot find end of func %s", name)
|
||||
}
|
||||
fn := string(data[:x+len("\n}\n")])
|
||||
data = data[x+len("\n}"):]
|
||||
|
||||
if strings.HasSuffix(name, "_32") {
|
||||
buf.WriteString(fix32.Replace(fn))
|
||||
}
|
||||
if strings.HasSuffix(name, "_8_32") {
|
||||
// x_8_32 -> x_8_64 done above
|
||||
fn = fix8_32.Replace(stripByteOnly(fn))
|
||||
buf.WriteString(fn)
|
||||
buf.WriteString(fix32.Replace(fn))
|
||||
}
|
||||
}
|
||||
|
||||
if err := os.WriteFile("sais2.go", buf.Bytes(), 0666); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
var fix32 = strings.NewReplacer(
|
||||
"32", "64",
|
||||
"int32", "int64",
|
||||
)
|
||||
|
||||
var fix8_32 = strings.NewReplacer(
|
||||
"_8_32", "_32",
|
||||
"byte", "int32",
|
||||
)
|
||||
|
||||
func stripByteOnly(s string) string {
|
||||
lines := strings.SplitAfter(s, "\n")
|
||||
w := 0
|
||||
for _, line := range lines {
|
||||
if !strings.Contains(line, "256") && !strings.Contains(line, "byte-only") {
|
||||
lines[w] = line
|
||||
w++
|
||||
}
|
||||
}
|
||||
return strings.Join(lines[:w], "")
|
||||
}
|
||||
895
src/index/suffixarray/sais.go
Normal file
895
src/index/suffixarray/sais.go
Normal file
@@ -0,0 +1,895 @@
|
||||
// Copyright 2019 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Suffix array construction by induced sorting (SAIS).
|
||||
// See Ge Nong, Sen Zhang, and Wai Hong Chen,
|
||||
// "Two Efficient Algorithms for Linear Time Suffix Array Construction",
|
||||
// especially section 3 (https://ieeexplore.ieee.org/document/5582081).
|
||||
// See also http://zork.net/~st/jottings/sais.html.
|
||||
//
|
||||
// With optimizations inspired by Yuta Mori's sais-lite
|
||||
// (https://sites.google.com/site/yuta256/sais).
|
||||
//
|
||||
// And with other new optimizations.
|
||||
|
||||
// Many of these functions are parameterized by the sizes of
|
||||
// the types they operate on. The generator gen.go makes
|
||||
// copies of these functions for use with other sizes.
|
||||
// Specifically:
|
||||
//
|
||||
// - A function with a name ending in _8_32 takes []byte and []int32 arguments
|
||||
// and is duplicated into _32_32, _8_64, and _64_64 forms.
|
||||
// The _32_32 and _64_64_ suffixes are shortened to plain _32 and _64.
|
||||
// Any lines in the function body that contain the text "byte-only" or "256"
|
||||
// are stripped when creating _32_32 and _64_64 forms.
|
||||
// (Those lines are typically 8-bit-specific optimizations.)
|
||||
//
|
||||
// - A function with a name ending only in _32 operates on []int32
|
||||
// and is duplicated into a _64 form. (Note that it may still take a []byte,
|
||||
// but there is no need for a version of the function in which the []byte
|
||||
// is widened to a full integer array.)
|
||||
|
||||
// The overall runtime of this code is linear in the input size:
|
||||
// it runs a sequence of linear passes to reduce the problem to
|
||||
// a subproblem at most half as big, invokes itself recursively,
|
||||
// and then runs a sequence of linear passes to turn the answer
|
||||
// for the subproblem into the answer for the original problem.
|
||||
// This gives T(N) = O(N) + T(N/2) = O(N) + O(N/2) + O(N/4) + ... = O(N).
|
||||
//
|
||||
// The outline of the code, with the forward and backward scans
|
||||
// through O(N)-sized arrays called out, is:
|
||||
//
|
||||
// sais_I_N
|
||||
// placeLMS_I_B
|
||||
// bucketMax_I_B
|
||||
// freq_I_B
|
||||
// <scan +text> (1)
|
||||
// <scan +freq> (2)
|
||||
// <scan -text, random bucket> (3)
|
||||
// induceSubL_I_B
|
||||
// bucketMin_I_B
|
||||
// freq_I_B
|
||||
// <scan +text, often optimized away> (4)
|
||||
// <scan +freq> (5)
|
||||
// <scan +sa, random text, random bucket> (6)
|
||||
// induceSubS_I_B
|
||||
// bucketMax_I_B
|
||||
// freq_I_B
|
||||
// <scan +text, often optimized away> (7)
|
||||
// <scan +freq> (8)
|
||||
// <scan -sa, random text, random bucket> (9)
|
||||
// assignID_I_B
|
||||
// <scan +sa, random text substrings> (10)
|
||||
// map_B
|
||||
// <scan -sa> (11)
|
||||
// recurse_B
|
||||
// (recursive call to sais_B_B for a subproblem of size at most 1/2 input, often much smaller)
|
||||
// unmap_I_B
|
||||
// <scan -text> (12)
|
||||
// <scan +sa> (13)
|
||||
// expand_I_B
|
||||
// bucketMax_I_B
|
||||
// freq_I_B
|
||||
// <scan +text, often optimized away> (14)
|
||||
// <scan +freq> (15)
|
||||
// <scan -sa, random text, random bucket> (16)
|
||||
// induceL_I_B
|
||||
// bucketMin_I_B
|
||||
// freq_I_B
|
||||
// <scan +text, often optimized away> (17)
|
||||
// <scan +freq> (18)
|
||||
// <scan +sa, random text, random bucket> (19)
|
||||
// induceS_I_B
|
||||
// bucketMax_I_B
|
||||
// freq_I_B
|
||||
// <scan +text, often optimized away> (20)
|
||||
// <scan +freq> (21)
|
||||
// <scan -sa, random text, random bucket> (22)
|
||||
//
|
||||
// Here, _B indicates the suffix array size (_32 or _64) and _I the input size (_8 or _B).
|
||||
//
|
||||
// The outline shows there are in general 22 scans through
|
||||
// O(N)-sized arrays for a given level of the recursion.
|
||||
// In the top level, operating on 8-bit input text,
|
||||
// the six freq scans are fixed size (256) instead of potentially
|
||||
// input-sized. Also, the frequency is counted once and cached
|
||||
// whenever there is room to do so (there is nearly always room in general,
|
||||
// and always room at the top level), which eliminates all but
|
||||
// the first freq_I_B text scans (that is, 5 of the 6).
|
||||
// So the top level of the recursion only does 22 - 6 - 5 = 11
|
||||
// input-sized scans and a typical level does 16 scans.
|
||||
//
|
||||
// The linear scans do not cost anywhere near as much as
|
||||
// the random accesses to the text made during a few of
|
||||
// the scans (specifically #6, #9, #16, #19, #22 marked above).
|
||||
// In real texts, there is not much but some locality to
|
||||
// the accesses, due to the repetitive structure of the text
|
||||
// (the same reason Burrows-Wheeler compression is so effective).
|
||||
// For random inputs, there is no locality, which makes those
|
||||
// accesses even more expensive, especially once the text
|
||||
// no longer fits in cache.
|
||||
// For example, running on 50 MB of Go source code, induceSubL_8_32
|
||||
// (which runs only once, at the top level of the recursion)
|
||||
// takes 0.44s, while on 50 MB of random input, it takes 2.55s.
|
||||
// Nearly all the relative slowdown is explained by the text access:
|
||||
//
|
||||
// c0, c1 := text[k-1], text[k]
|
||||
//
|
||||
// That line runs for 0.23s on the Go text and 2.02s on random text.
|
||||
|
||||
//go:generate go run gen.go
|
||||
|
||||
package suffixarray
|
||||
|
||||
// text_32 returns the suffix array for the input text.
|
||||
// It requires that len(text) fit in an int32
|
||||
// and that the caller zero sa.
|
||||
func text_32(text []byte, sa []int32) {
|
||||
if int(int32(len(text))) != len(text) || len(text) != len(sa) {
|
||||
panic("suffixarray: misuse of text_32")
|
||||
}
|
||||
sais_8_32(text, 256, sa, make([]int32, 2*256))
|
||||
}
|
||||
|
||||
// sais_8_32 computes the suffix array of text.
|
||||
// The text must contain only values in [0, textMax).
|
||||
// The suffix array is stored in sa, which the caller
|
||||
// must ensure is already zeroed.
|
||||
// The caller must also provide temporary space tmp
|
||||
// with len(tmp) ≥ textMax. If len(tmp) ≥ 2*textMax
|
||||
// then the algorithm runs a little faster.
|
||||
// If sais_8_32 modifies tmp, it sets tmp[0] = -1 on return.
|
||||
func sais_8_32(text []byte, textMax int, sa, tmp []int32) {
|
||||
if len(sa) != len(text) || len(tmp) < textMax {
|
||||
panic("suffixarray: misuse of sais_8_32")
|
||||
}
|
||||
|
||||
// Trivial base cases. Sorting 0 or 1 things is easy.
|
||||
if len(text) == 0 {
|
||||
return
|
||||
}
|
||||
if len(text) == 1 {
|
||||
sa[0] = 0
|
||||
return
|
||||
}
|
||||
|
||||
// Establish slices indexed by text character
|
||||
// holding character frequency and bucket-sort offsets.
|
||||
// If there's only enough tmp for one slice,
|
||||
// we make it the bucket offsets and recompute
|
||||
// the character frequency each time we need it.
|
||||
var freq, bucket []int32
|
||||
if len(tmp) >= 2*textMax {
|
||||
freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
|
||||
freq[0] = -1 // mark as uninitialized
|
||||
} else {
|
||||
freq, bucket = nil, tmp[:textMax]
|
||||
}
|
||||
|
||||
// The SAIS algorithm.
|
||||
// Each of these calls makes one scan through sa.
|
||||
// See the individual functions for documentation
|
||||
// about each's role in the algorithm.
|
||||
numLMS := placeLMS_8_32(text, sa, freq, bucket)
|
||||
if numLMS <= 1 {
|
||||
// 0 or 1 items are already sorted. Do nothing.
|
||||
} else {
|
||||
induceSubL_8_32(text, sa, freq, bucket)
|
||||
induceSubS_8_32(text, sa, freq, bucket)
|
||||
length_8_32(text, sa, numLMS)
|
||||
maxID := assignID_8_32(text, sa, numLMS)
|
||||
if maxID < numLMS {
|
||||
map_32(sa, numLMS)
|
||||
recurse_32(sa, tmp, numLMS, maxID)
|
||||
unmap_8_32(text, sa, numLMS)
|
||||
} else {
|
||||
// If maxID == numLMS, then each LMS-substring
|
||||
// is unique, so the relative ordering of two LMS-suffixes
|
||||
// is determined by just the leading LMS-substring.
|
||||
// That is, the LMS-suffix sort order matches the
|
||||
// (simpler) LMS-substring sort order.
|
||||
// Copy the original LMS-substring order into the
|
||||
// suffix array destination.
|
||||
copy(sa, sa[len(sa)-numLMS:])
|
||||
}
|
||||
expand_8_32(text, freq, bucket, sa, numLMS)
|
||||
}
|
||||
induceL_8_32(text, sa, freq, bucket)
|
||||
induceS_8_32(text, sa, freq, bucket)
|
||||
|
||||
// Mark for caller that we overwrote tmp.
|
||||
tmp[0] = -1
|
||||
}
|
||||
|
||||
// freq_8_32 returns the character frequencies
|
||||
// for text, as a slice indexed by character value.
|
||||
// If freq is nil, freq_8_32 uses and returns bucket.
|
||||
// If freq is non-nil, freq_8_32 assumes that freq[0] >= 0
|
||||
// means the frequencies are already computed.
|
||||
// If the frequency data is overwritten or uninitialized,
|
||||
// the caller must set freq[0] = -1 to force recomputation
|
||||
// the next time it is needed.
|
||||
func freq_8_32(text []byte, freq, bucket []int32) []int32 {
|
||||
if freq != nil && freq[0] >= 0 {
|
||||
return freq // already computed
|
||||
}
|
||||
if freq == nil {
|
||||
freq = bucket
|
||||
}
|
||||
|
||||
freq = freq[:256] // eliminate bounds check for freq[c] below
|
||||
clear(freq)
|
||||
for _, c := range text {
|
||||
freq[c]++
|
||||
}
|
||||
return freq
|
||||
}
|
||||
|
||||
// bucketMin_8_32 stores into bucket[c] the minimum index
|
||||
// in the bucket for character c in a bucket-sort of text.
|
||||
func bucketMin_8_32(text []byte, freq, bucket []int32) {
|
||||
freq = freq_8_32(text, freq, bucket)
|
||||
freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below
|
||||
bucket = bucket[:256] // eliminate bounds check for bucket[i] below
|
||||
total := int32(0)
|
||||
for i, n := range freq {
|
||||
bucket[i] = total
|
||||
total += n
|
||||
}
|
||||
}
|
||||
|
||||
// bucketMax_8_32 stores into bucket[c] the maximum index
|
||||
// in the bucket for character c in a bucket-sort of text.
|
||||
// The bucket indexes for c are [min, max).
|
||||
// That is, max is one past the final index in that bucket.
|
||||
func bucketMax_8_32(text []byte, freq, bucket []int32) {
|
||||
freq = freq_8_32(text, freq, bucket)
|
||||
freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below
|
||||
bucket = bucket[:256] // eliminate bounds check for bucket[i] below
|
||||
total := int32(0)
|
||||
for i, n := range freq {
|
||||
total += n
|
||||
bucket[i] = total
|
||||
}
|
||||
}
|
||||
|
||||
// The SAIS algorithm proceeds in a sequence of scans through sa.
|
||||
// Each of the following functions implements one scan,
|
||||
// and the functions appear here in the order they execute in the algorithm.
|
||||
|
||||
// placeLMS_8_32 places into sa the indexes of the
|
||||
// final characters of the LMS substrings of text,
|
||||
// sorted into the rightmost ends of their correct buckets
|
||||
// in the suffix array.
|
||||
//
|
||||
// The imaginary sentinel character at the end of the text
|
||||
// is the final character of the final LMS substring, but there
|
||||
// is no bucket for the imaginary sentinel character,
|
||||
// which has a smaller value than any real character.
|
||||
// The caller must therefore pretend that sa[-1] == len(text).
|
||||
//
|
||||
// The text indexes of LMS-substring characters are always ≥ 1
|
||||
// (the first LMS-substring must be preceded by one or more L-type
|
||||
// characters that are not part of any LMS-substring),
|
||||
// so using 0 as a “not present” suffix array entry is safe,
|
||||
// both in this function and in most later functions
|
||||
// (until induceL_8_32 below).
|
||||
func placeLMS_8_32(text []byte, sa, freq, bucket []int32) int {
|
||||
bucketMax_8_32(text, freq, bucket)
|
||||
|
||||
numLMS := 0
|
||||
lastB := int32(-1)
|
||||
bucket = bucket[:256] // eliminate bounds check for bucket[c1] below
|
||||
|
||||
// The next stanza of code (until the blank line) loop backward
|
||||
// over text, stopping to execute a code body at each position i
|
||||
// such that text[i] is an L-character and text[i+1] is an S-character.
|
||||
// That is, i+1 is the position of the start of an LMS-substring.
|
||||
// These could be hoisted out into a function with a callback,
|
||||
// but at a significant speed cost. Instead, we just write these
|
||||
// seven lines a few times in this source file. The copies below
|
||||
// refer back to the pattern established by this original as the
|
||||
// "LMS-substring iterator".
|
||||
//
|
||||
// In every scan through the text, c0, c1 are successive characters of text.
|
||||
// In this backward scan, c0 == text[i] and c1 == text[i+1].
|
||||
// By scanning backward, we can keep track of whether the current
|
||||
// position is type-S or type-L according to the usual definition:
|
||||
//
|
||||
// - position len(text) is type S with text[len(text)] == -1 (the sentinel)
|
||||
// - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
|
||||
// - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
|
||||
//
|
||||
// The backward scan lets us maintain the current type,
|
||||
// update it when we see c0 != c1, and otherwise leave it alone.
|
||||
// We want to identify all S positions with a preceding L.
|
||||
// Position len(text) is one such position by definition, but we have
|
||||
// nowhere to write it down, so we eliminate it by untruthfully
|
||||
// setting isTypeS = false at the start of the loop.
|
||||
c0, c1, isTypeS := byte(0), byte(0), false
|
||||
for i := len(text) - 1; i >= 0; i-- {
|
||||
c0, c1 = text[i], c0
|
||||
if c0 < c1 {
|
||||
isTypeS = true
|
||||
} else if c0 > c1 && isTypeS {
|
||||
isTypeS = false
|
||||
|
||||
// Bucket the index i+1 for the start of an LMS-substring.
|
||||
b := bucket[c1] - 1
|
||||
bucket[c1] = b
|
||||
sa[b] = int32(i + 1)
|
||||
lastB = b
|
||||
numLMS++
|
||||
}
|
||||
}
|
||||
|
||||
// We recorded the LMS-substring starts but really want the ends.
|
||||
// Luckily, with two differences, the start indexes and the end indexes are the same.
|
||||
// The first difference is that the rightmost LMS-substring's end index is len(text),
|
||||
// so the caller must pretend that sa[-1] == len(text), as noted above.
|
||||
// The second difference is that the first leftmost LMS-substring start index
|
||||
// does not end an earlier LMS-substring, so as an optimization we can omit
|
||||
// that leftmost LMS-substring start index (the last one we wrote).
|
||||
//
|
||||
// Exception: if numLMS <= 1, the caller is not going to bother with
|
||||
// the recursion at all and will treat the result as containing LMS-substring starts.
|
||||
// In that case, we don't remove the final entry.
|
||||
if numLMS > 1 {
|
||||
sa[lastB] = 0
|
||||
}
|
||||
return numLMS
|
||||
}
|
||||
|
||||
// induceSubL_8_32 inserts the L-type text indexes of LMS-substrings
|
||||
// into sa, assuming that the final characters of the LMS-substrings
|
||||
// are already inserted into sa, sorted by final character, and at the
|
||||
// right (not left) end of the corresponding character bucket.
|
||||
// Each LMS-substring has the form (as a regexp) /S+L+S/:
|
||||
// one or more S-type, one or more L-type, final S-type.
|
||||
// induceSubL_8_32 leaves behind only the leftmost L-type text
|
||||
// index for each LMS-substring. That is, it removes the final S-type
|
||||
// indexes that are present on entry, and it inserts but then removes
|
||||
// the interior L-type indexes too.
|
||||
// (Only the leftmost L-type index is needed by induceSubS_8_32.)
|
||||
func induceSubL_8_32(text []byte, sa, freq, bucket []int32) {
|
||||
// Initialize positions for left side of character buckets.
|
||||
bucketMin_8_32(text, freq, bucket)
|
||||
bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
|
||||
|
||||
// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
|
||||
// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
|
||||
// Because j-1 is type L, inserting it into sa now will sort it correctly.
|
||||
// But we want to distinguish a j-1 with j-2 of type L from type S.
|
||||
// We can process the former but want to leave the latter for the caller.
|
||||
// We record the difference by negating j-1 if it is preceded by type S.
|
||||
// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
|
||||
// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
|
||||
// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
|
||||
// and so on, in sorted but not necessarily adjacent order, until it finds
|
||||
// one preceded by an index of type S, at which point it must stop.
|
||||
//
|
||||
// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
|
||||
// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
|
||||
// only the indexes of the leftmost L-type indexes for each LMS-substring.
|
||||
//
|
||||
// The suffix array sa therefore serves simultaneously as input, output,
|
||||
// and a miraculously well-tailored work queue.
|
||||
|
||||
// placeLMS_8_32 left out the implicit entry sa[-1] == len(text),
|
||||
// corresponding to the identified type-L index len(text)-1.
|
||||
// Process it before the left-to-right scan of sa proper.
|
||||
// See body in loop for commentary.
|
||||
k := len(text) - 1
|
||||
c0, c1 := text[k-1], text[k]
|
||||
if c0 < c1 {
|
||||
k = -k
|
||||
}
|
||||
|
||||
// Cache recently used bucket index:
|
||||
// we're processing suffixes in sorted order
|
||||
// and accessing buckets indexed by the
|
||||
// byte before the sorted order, which still
|
||||
// has very good locality.
|
||||
// Invariant: b is cached, possibly dirty copy of bucket[cB].
|
||||
cB := c1
|
||||
b := bucket[cB]
|
||||
sa[b] = int32(k)
|
||||
b++
|
||||
|
||||
for i := 0; i < len(sa); i++ {
|
||||
j := int(sa[i])
|
||||
if j == 0 {
|
||||
// Skip empty entry.
|
||||
continue
|
||||
}
|
||||
if j < 0 {
|
||||
// Leave discovered type-S index for caller.
|
||||
sa[i] = int32(-j)
|
||||
continue
|
||||
}
|
||||
sa[i] = 0
|
||||
|
||||
// Index j was on work queue, meaning k := j-1 is L-type,
|
||||
// so we can now place k correctly into sa.
|
||||
// If k-1 is L-type, queue k for processing later in this loop.
|
||||
// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
|
||||
k := j - 1
|
||||
c0, c1 := text[k-1], text[k]
|
||||
if c0 < c1 {
|
||||
k = -k
|
||||
}
|
||||
|
||||
if cB != c1 {
|
||||
bucket[cB] = b
|
||||
cB = c1
|
||||
b = bucket[cB]
|
||||
}
|
||||
sa[b] = int32(k)
|
||||
b++
|
||||
}
|
||||
}
|
||||
|
||||
// induceSubS_8_32 inserts the S-type text indexes of LMS-substrings
|
||||
// into sa, assuming that the leftmost L-type text indexes are already
|
||||
// inserted into sa, sorted by LMS-substring suffix, and at the
|
||||
// left end of the corresponding character bucket.
|
||||
// Each LMS-substring has the form (as a regexp) /S+L+S/:
|
||||
// one or more S-type, one or more L-type, final S-type.
|
||||
// induceSubS_8_32 leaves behind only the leftmost S-type text
|
||||
// index for each LMS-substring, in sorted order, at the right end of sa.
|
||||
// That is, it removes the L-type indexes that are present on entry,
|
||||
// and it inserts but then removes the interior S-type indexes too,
|
||||
// leaving the LMS-substring start indexes packed into sa[len(sa)-numLMS:].
|
||||
// (Only the LMS-substring start indexes are processed by the recursion.)
|
||||
func induceSubS_8_32(text []byte, sa, freq, bucket []int32) {
|
||||
// Initialize positions for right side of character buckets.
|
||||
bucketMax_8_32(text, freq, bucket)
|
||||
bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
|
||||
|
||||
// Analogous to induceSubL_8_32 above,
|
||||
// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
|
||||
// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
|
||||
// Because j-1 is type S, inserting it into sa now will sort it correctly.
|
||||
// But we want to distinguish a j-1 with j-2 of type S from type L.
|
||||
// We can process the former but want to leave the latter for the caller.
|
||||
// We record the difference by negating j-1 if it is preceded by type L.
|
||||
// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
|
||||
// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
|
||||
// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
|
||||
// and so on, in sorted but not necessarily adjacent order, until it finds
|
||||
// one preceded by an index of type L, at which point it must stop.
|
||||
// That index (preceded by one of type L) is an LMS-substring start.
|
||||
//
|
||||
// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
|
||||
// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
|
||||
// so that the loop finishes with the top of sa containing exactly
|
||||
// the LMS-substring start indexes, sorted by LMS-substring.
|
||||
|
||||
// Cache recently used bucket index:
|
||||
cB := byte(0)
|
||||
b := bucket[cB]
|
||||
|
||||
top := len(sa)
|
||||
for i := len(sa) - 1; i >= 0; i-- {
|
||||
j := int(sa[i])
|
||||
if j == 0 {
|
||||
// Skip empty entry.
|
||||
continue
|
||||
}
|
||||
sa[i] = 0
|
||||
if j < 0 {
|
||||
// Leave discovered LMS-substring start index for caller.
|
||||
top--
|
||||
sa[top] = int32(-j)
|
||||
continue
|
||||
}
|
||||
|
||||
// Index j was on work queue, meaning k := j-1 is S-type,
|
||||
// so we can now place k correctly into sa.
|
||||
// If k-1 is S-type, queue k for processing later in this loop.
|
||||
// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
|
||||
k := j - 1
|
||||
c1 := text[k]
|
||||
c0 := text[k-1]
|
||||
if c0 > c1 {
|
||||
k = -k
|
||||
}
|
||||
|
||||
if cB != c1 {
|
||||
bucket[cB] = b
|
||||
cB = c1
|
||||
b = bucket[cB]
|
||||
}
|
||||
b--
|
||||
sa[b] = int32(k)
|
||||
}
|
||||
}
|
||||
|
||||
// length_8_32 computes and records the length of each LMS-substring in text.
|
||||
// The length of the LMS-substring at index j is stored at sa[j/2],
|
||||
// avoiding the LMS-substring indexes already stored in the top half of sa.
|
||||
// (If index j is an LMS-substring start, then index j-1 is type L and cannot be.)
|
||||
// There are two exceptions, made for optimizations in name_8_32 below.
|
||||
//
|
||||
// First, the final LMS-substring is recorded as having length 0, which is otherwise
|
||||
// impossible, instead of giving it a length that includes the implicit sentinel.
|
||||
// This ensures the final LMS-substring has length unequal to all others
|
||||
// and therefore can be detected as different without text comparison
|
||||
// (it is unequal because it is the only one that ends in the implicit sentinel,
|
||||
// and the text comparison would be problematic since the implicit sentinel
|
||||
// is not actually present at text[len(text)]).
|
||||
//
|
||||
// Second, to avoid text comparison entirely, if an LMS-substring is very short,
|
||||
// sa[j/2] records its actual text instead of its length, so that if two such
|
||||
// substrings have matching “length,” the text need not be read at all.
|
||||
// The definition of “very short” is that the text bytes must pack into a uint32,
|
||||
// and the unsigned encoding e must be ≥ len(text), so that it can be
|
||||
// distinguished from a valid length.
|
||||
func length_8_32(text []byte, sa []int32, numLMS int) {
|
||||
end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
|
||||
|
||||
// The encoding of N text bytes into a “length” word
|
||||
// adds 1 to each byte, packs them into the bottom
|
||||
// N*8 bits of a word, and then bitwise inverts the result.
|
||||
// That is, the text sequence A B C (hex 41 42 43)
|
||||
// encodes as ^uint32(0x42_43_44).
|
||||
// LMS-substrings can never start or end with 0xFF.
|
||||
// Adding 1 ensures the encoded byte sequence never
|
||||
// starts or ends with 0x00, so that present bytes can be
|
||||
// distinguished from zero-padding in the top bits,
|
||||
// so the length need not be separately encoded.
|
||||
// Inverting the bytes increases the chance that a
|
||||
// 4-byte encoding will still be ≥ len(text).
|
||||
// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
|
||||
// then the high bit of the inversion will be set,
|
||||
// making it clearly not a valid length (it would be a negative one).
|
||||
//
|
||||
// cx holds the pre-inverted encoding (the packed incremented bytes).
|
||||
cx := uint32(0) // byte-only
|
||||
|
||||
// This stanza (until the blank line) is the "LMS-substring iterator",
|
||||
// described in placeLMS_8_32 above, with one line added to maintain cx.
|
||||
c0, c1, isTypeS := byte(0), byte(0), false
|
||||
for i := len(text) - 1; i >= 0; i-- {
|
||||
c0, c1 = text[i], c0
|
||||
cx = cx<<8 | uint32(c1+1) // byte-only
|
||||
if c0 < c1 {
|
||||
isTypeS = true
|
||||
} else if c0 > c1 && isTypeS {
|
||||
isTypeS = false
|
||||
|
||||
// Index j = i+1 is the start of an LMS-substring.
|
||||
// Compute length or encoded text to store in sa[j/2].
|
||||
j := i + 1
|
||||
var code int32
|
||||
if end == 0 {
|
||||
code = 0
|
||||
} else {
|
||||
code = int32(end - j)
|
||||
if code <= 32/8 && ^cx >= uint32(len(text)) { // byte-only
|
||||
code = int32(^cx) // byte-only
|
||||
} // byte-only
|
||||
}
|
||||
sa[j>>1] = code
|
||||
end = j + 1
|
||||
cx = uint32(c1 + 1) // byte-only
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// assignID_8_32 assigns a dense ID numbering to the
|
||||
// set of LMS-substrings respecting string ordering and equality,
|
||||
// returning the maximum assigned ID.
|
||||
// For example given the input "ababab", the LMS-substrings
|
||||
// are "aba", "aba", and "ab", renumbered as 2 2 1.
|
||||
// sa[len(sa)-numLMS:] holds the LMS-substring indexes
|
||||
// sorted in string order, so to assign numbers we can
|
||||
// consider each in turn, removing adjacent duplicates.
|
||||
// The new ID for the LMS-substring at index j is written to sa[j/2],
|
||||
// overwriting the length previously stored there (by length_8_32 above).
|
||||
func assignID_8_32(text []byte, sa []int32, numLMS int) int {
|
||||
id := 0
|
||||
lastLen := int32(-1) // impossible
|
||||
lastPos := int32(0)
|
||||
for _, j := range sa[len(sa)-numLMS:] {
|
||||
// Is the LMS-substring at index j new, or is it the same as the last one we saw?
|
||||
n := sa[j/2]
|
||||
if n != lastLen {
|
||||
goto New
|
||||
}
|
||||
if uint32(n) >= uint32(len(text)) {
|
||||
// “Length” is really encoded full text, and they match.
|
||||
goto Same
|
||||
}
|
||||
{
|
||||
// Compare actual texts.
|
||||
n := int(n)
|
||||
this := text[j:][:n]
|
||||
last := text[lastPos:][:n]
|
||||
for i := 0; i < n; i++ {
|
||||
if this[i] != last[i] {
|
||||
goto New
|
||||
}
|
||||
}
|
||||
goto Same
|
||||
}
|
||||
New:
|
||||
id++
|
||||
lastPos = j
|
||||
lastLen = n
|
||||
Same:
|
||||
sa[j/2] = int32(id)
|
||||
}
|
||||
return id
|
||||
}
|
||||
|
||||
// map_32 maps the LMS-substrings in text to their new IDs,
|
||||
// producing the subproblem for the recursion.
|
||||
// The mapping itself was mostly applied by assignID_8_32:
|
||||
// sa[i] is either 0, the ID for the LMS-substring at index 2*i,
|
||||
// or the ID for the LMS-substring at index 2*i+1.
|
||||
// To produce the subproblem we need only remove the zeros
|
||||
// and change ID into ID-1 (our IDs start at 1, but text chars start at 0).
|
||||
//
|
||||
// map_32 packs the result, which is the input to the recursion,
|
||||
// into the top of sa, so that the recursion result can be stored
|
||||
// in the bottom of sa, which sets up for expand_8_32 well.
|
||||
func map_32(sa []int32, numLMS int) {
|
||||
w := len(sa)
|
||||
for i := len(sa) / 2; i >= 0; i-- {
|
||||
j := sa[i]
|
||||
if j > 0 {
|
||||
w--
|
||||
sa[w] = j - 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// recurse_32 calls sais_32 recursively to solve the subproblem we've built.
|
||||
// The subproblem is at the right end of sa, the suffix array result will be
|
||||
// written at the left end of sa, and the middle of sa is available for use as
|
||||
// temporary frequency and bucket storage.
|
||||
func recurse_32(sa, oldTmp []int32, numLMS, maxID int) {
|
||||
dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
|
||||
|
||||
// Set up temporary space for recursive call.
|
||||
// We must pass sais_32 a tmp buffer with at least maxID entries.
|
||||
//
|
||||
// The subproblem is guaranteed to have length at most len(sa)/2,
|
||||
// so that sa can hold both the subproblem and its suffix array.
|
||||
// Nearly all the time, however, the subproblem has length < len(sa)/3,
|
||||
// in which case there is a subproblem-sized middle of sa that
|
||||
// we can reuse for temporary space (saTmp).
|
||||
// When recurse_32 is called from sais_8_32, oldTmp is length 512
|
||||
// (from text_32), and saTmp will typically be much larger, so we'll use saTmp.
|
||||
// When deeper recursions come back to recurse_32, now oldTmp is
|
||||
// the saTmp from the top-most recursion, it is typically larger than
|
||||
// the current saTmp (because the current sa gets smaller and smaller
|
||||
// as the recursion gets deeper), and we keep reusing that top-most
|
||||
// large saTmp instead of the offered smaller ones.
|
||||
//
|
||||
// Why is the subproblem length so often just under len(sa)/3?
|
||||
// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
|
||||
// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
|
||||
// in the input, perfect alternation of larger and smaller input bytes.
|
||||
// Real text doesn't do that. If each L-type index is randomly followed
|
||||
// by either an L-type or S-type index, then half the substrings will
|
||||
// be of the form SLS, but the other half will be longer. Of that half,
|
||||
// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
|
||||
// Not counting the final S in each (which overlaps the first S in the next),
|
||||
// This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
|
||||
// The space we need is further reduced by the fact that many of the
|
||||
// short patterns like SLS will often be the same character sequences
|
||||
// repeated throughout the text, reducing maxID relative to numLMS.
|
||||
//
|
||||
// For short inputs, the averages may not run in our favor, but then we
|
||||
// can often fall back to using the length-512 tmp available in the
|
||||
// top-most call. (Also a short allocation would not be a big deal.)
|
||||
//
|
||||
// For pathological inputs, we fall back to allocating a new tmp of length
|
||||
// max(maxID, numLMS/2). This level of the recursion needs maxID,
|
||||
// and all deeper levels of the recursion will need no more than numLMS/2,
|
||||
// so this one allocation is guaranteed to suffice for the entire stack
|
||||
// of recursive calls.
|
||||
tmp := oldTmp
|
||||
if len(tmp) < len(saTmp) {
|
||||
tmp = saTmp
|
||||
}
|
||||
if len(tmp) < numLMS {
|
||||
// TestSAIS/forcealloc reaches this code.
|
||||
n := maxID
|
||||
if n < numLMS/2 {
|
||||
n = numLMS / 2
|
||||
}
|
||||
tmp = make([]int32, n)
|
||||
}
|
||||
|
||||
// sais_32 requires that the caller arrange to clear dst,
|
||||
// because in general the caller may know dst is
|
||||
// freshly-allocated and already cleared. But this one is not.
|
||||
clear(dst)
|
||||
sais_32(text, maxID, dst, tmp)
|
||||
}
|
||||
|
||||
// unmap_8_32 unmaps the subproblem back to the original.
|
||||
// sa[:numLMS] is the LMS-substring numbers, which don't matter much anymore.
|
||||
// sa[len(sa)-numLMS:] is the sorted list of those LMS-substring numbers.
|
||||
// The key part is that if the list says K that means the K'th substring.
|
||||
// We can replace sa[:numLMS] with the indexes of the LMS-substrings.
|
||||
// Then if the list says K it really means sa[K].
|
||||
// Having mapped the list back to LMS-substring indexes,
|
||||
// we can place those into the right buckets.
|
||||
func unmap_8_32(text []byte, sa []int32, numLMS int) {
|
||||
unmap := sa[len(sa)-numLMS:]
|
||||
j := len(unmap)
|
||||
|
||||
// "LMS-substring iterator" (see placeLMS_8_32 above).
|
||||
c0, c1, isTypeS := byte(0), byte(0), false
|
||||
for i := len(text) - 1; i >= 0; i-- {
|
||||
c0, c1 = text[i], c0
|
||||
if c0 < c1 {
|
||||
isTypeS = true
|
||||
} else if c0 > c1 && isTypeS {
|
||||
isTypeS = false
|
||||
|
||||
// Populate inverse map.
|
||||
j--
|
||||
unmap[j] = int32(i + 1)
|
||||
}
|
||||
}
|
||||
|
||||
// Apply inverse map to subproblem suffix array.
|
||||
sa = sa[:numLMS]
|
||||
for i := 0; i < len(sa); i++ {
|
||||
sa[i] = unmap[sa[i]]
|
||||
}
|
||||
}
|
||||
|
||||
// expand_8_32 distributes the compacted, sorted LMS-suffix indexes
|
||||
// from sa[:numLMS] into the tops of the appropriate buckets in sa,
|
||||
// preserving the sorted order and making room for the L-type indexes
|
||||
// to be slotted into the sorted sequence by induceL_8_32.
|
||||
func expand_8_32(text []byte, freq, bucket, sa []int32, numLMS int) {
|
||||
bucketMax_8_32(text, freq, bucket)
|
||||
bucket = bucket[:256] // eliminate bound check for bucket[c] below
|
||||
|
||||
// Loop backward through sa, always tracking
|
||||
// the next index to populate from sa[:numLMS].
|
||||
// When we get to one, populate it.
|
||||
// Zero the rest of the slots; they have dead values in them.
|
||||
x := numLMS - 1
|
||||
saX := sa[x]
|
||||
c := text[saX]
|
||||
b := bucket[c] - 1
|
||||
bucket[c] = b
|
||||
|
||||
for i := len(sa) - 1; i >= 0; i-- {
|
||||
if i != int(b) {
|
||||
sa[i] = 0
|
||||
continue
|
||||
}
|
||||
sa[i] = saX
|
||||
|
||||
// Load next entry to put down (if any).
|
||||
if x > 0 {
|
||||
x--
|
||||
saX = sa[x] // TODO bounds check
|
||||
c = text[saX]
|
||||
b = bucket[c] - 1
|
||||
bucket[c] = b
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// induceL_8_32 inserts L-type text indexes into sa,
|
||||
// assuming that the leftmost S-type indexes are inserted
|
||||
// into sa, in sorted order, in the right bucket halves.
|
||||
// It leaves all the L-type indexes in sa, but the
|
||||
// leftmost L-type indexes are negated, to mark them
|
||||
// for processing by induceS_8_32.
|
||||
func induceL_8_32(text []byte, sa, freq, bucket []int32) {
|
||||
// Initialize positions for left side of character buckets.
|
||||
bucketMin_8_32(text, freq, bucket)
|
||||
bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
|
||||
|
||||
// This scan is similar to the one in induceSubL_8_32 above.
|
||||
// That one arranges to clear all but the leftmost L-type indexes.
|
||||
// This scan leaves all the L-type indexes and the original S-type
|
||||
// indexes, but it negates the positive leftmost L-type indexes
|
||||
// (the ones that induceS_8_32 needs to process).
|
||||
|
||||
// expand_8_32 left out the implicit entry sa[-1] == len(text),
|
||||
// corresponding to the identified type-L index len(text)-1.
|
||||
// Process it before the left-to-right scan of sa proper.
|
||||
// See body in loop for commentary.
|
||||
k := len(text) - 1
|
||||
c0, c1 := text[k-1], text[k]
|
||||
if c0 < c1 {
|
||||
k = -k
|
||||
}
|
||||
|
||||
// Cache recently used bucket index.
|
||||
cB := c1
|
||||
b := bucket[cB]
|
||||
sa[b] = int32(k)
|
||||
b++
|
||||
|
||||
for i := 0; i < len(sa); i++ {
|
||||
j := int(sa[i])
|
||||
if j <= 0 {
|
||||
// Skip empty or negated entry (including negated zero).
|
||||
continue
|
||||
}
|
||||
|
||||
// Index j was on work queue, meaning k := j-1 is L-type,
|
||||
// so we can now place k correctly into sa.
|
||||
// If k-1 is L-type, queue k for processing later in this loop.
|
||||
// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
|
||||
// If k is zero, k-1 doesn't exist, so we only need to leave it
|
||||
// for the caller. The caller can't tell the difference between
|
||||
// an empty slot and a non-empty zero, but there's no need
|
||||
// to distinguish them anyway: the final suffix array will end up
|
||||
// with one zero somewhere, and that will be a real zero.
|
||||
k := j - 1
|
||||
c1 := text[k]
|
||||
if k > 0 {
|
||||
if c0 := text[k-1]; c0 < c1 {
|
||||
k = -k
|
||||
}
|
||||
}
|
||||
|
||||
if cB != c1 {
|
||||
bucket[cB] = b
|
||||
cB = c1
|
||||
b = bucket[cB]
|
||||
}
|
||||
sa[b] = int32(k)
|
||||
b++
|
||||
}
|
||||
}
|
||||
|
||||
func induceS_8_32(text []byte, sa, freq, bucket []int32) {
|
||||
// Initialize positions for right side of character buckets.
|
||||
bucketMax_8_32(text, freq, bucket)
|
||||
bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
|
||||
|
||||
cB := byte(0)
|
||||
b := bucket[cB]
|
||||
|
||||
for i := len(sa) - 1; i >= 0; i-- {
|
||||
j := int(sa[i])
|
||||
if j >= 0 {
|
||||
// Skip non-flagged entry.
|
||||
// (This loop can't see an empty entry; 0 means the real zero index.)
|
||||
continue
|
||||
}
|
||||
|
||||
// Negative j is a work queue entry; rewrite to positive j for final suffix array.
|
||||
j = -j
|
||||
sa[i] = int32(j)
|
||||
|
||||
// Index j was on work queue (encoded as -j but now decoded),
|
||||
// meaning k := j-1 is L-type,
|
||||
// so we can now place k correctly into sa.
|
||||
// If k-1 is S-type, queue -k for processing later in this loop.
|
||||
// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
|
||||
// If k is zero, k-1 doesn't exist, so we only need to leave it
|
||||
// for the caller.
|
||||
k := j - 1
|
||||
c1 := text[k]
|
||||
if k > 0 {
|
||||
if c0 := text[k-1]; c0 <= c1 {
|
||||
k = -k
|
||||
}
|
||||
}
|
||||
|
||||
if cB != c1 {
|
||||
bucket[cB] = b
|
||||
cB = c1
|
||||
b = bucket[cB]
|
||||
}
|
||||
b--
|
||||
sa[b] = int32(k)
|
||||
}
|
||||
}
|
||||
1733
src/index/suffixarray/sais2.go
Normal file
1733
src/index/suffixarray/sais2.go
Normal file
File diff suppressed because it is too large
Load Diff
382
src/index/suffixarray/suffixarray.go
Normal file
382
src/index/suffixarray/suffixarray.go
Normal file
@@ -0,0 +1,382 @@
|
||||
// Copyright 2010 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package suffixarray implements substring search in logarithmic time using
|
||||
// an in-memory suffix array.
|
||||
//
|
||||
// Example use:
|
||||
//
|
||||
// // create index for some data
|
||||
// index := suffixarray.New(data)
|
||||
//
|
||||
// // lookup byte slice s
|
||||
// offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
|
||||
// offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data
|
||||
package suffixarray
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"io"
|
||||
"math"
|
||||
"regexp"
|
||||
"slices"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// Can change for testing
|
||||
var maxData32 int = realMaxData32
|
||||
|
||||
const realMaxData32 = math.MaxInt32
|
||||
|
||||
// Index implements a suffix array for fast substring search.
|
||||
type Index struct {
|
||||
data []byte
|
||||
sa ints // suffix array for data; sa.len() == len(data)
|
||||
}
|
||||
|
||||
// An ints is either an []int32 or an []int64.
|
||||
// That is, one of them is empty, and one is the real data.
|
||||
// The int64 form is used when len(data) > maxData32
|
||||
type ints struct {
|
||||
int32 []int32
|
||||
int64 []int64
|
||||
}
|
||||
|
||||
func (a *ints) len() int {
|
||||
return len(a.int32) + len(a.int64)
|
||||
}
|
||||
|
||||
func (a *ints) get(i int) int64 {
|
||||
if a.int32 != nil {
|
||||
return int64(a.int32[i])
|
||||
}
|
||||
return a.int64[i]
|
||||
}
|
||||
|
||||
func (a *ints) set(i int, v int64) {
|
||||
if a.int32 != nil {
|
||||
a.int32[i] = int32(v)
|
||||
} else {
|
||||
a.int64[i] = v
|
||||
}
|
||||
}
|
||||
|
||||
func (a *ints) slice(i, j int) ints {
|
||||
if a.int32 != nil {
|
||||
return ints{a.int32[i:j], nil}
|
||||
}
|
||||
return ints{nil, a.int64[i:j]}
|
||||
}
|
||||
|
||||
// New creates a new [Index] for data.
|
||||
// [Index] creation time is O(N) for N = len(data).
|
||||
func New(data []byte) *Index {
|
||||
ix := &Index{data: data}
|
||||
if len(data) <= maxData32 {
|
||||
ix.sa.int32 = make([]int32, len(data))
|
||||
text_32(data, ix.sa.int32)
|
||||
} else {
|
||||
ix.sa.int64 = make([]int64, len(data))
|
||||
text_64(data, ix.sa.int64)
|
||||
}
|
||||
return ix
|
||||
}
|
||||
|
||||
// writeInt writes an int x to w using buf to buffer the write.
|
||||
func writeInt(w io.Writer, buf []byte, x int) error {
|
||||
binary.PutVarint(buf, int64(x))
|
||||
_, err := w.Write(buf[0:binary.MaxVarintLen64])
|
||||
return err
|
||||
}
|
||||
|
||||
// readInt reads an int x from r using buf to buffer the read and returns x.
|
||||
func readInt(r io.Reader, buf []byte) (int64, error) {
|
||||
_, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
|
||||
x, _ := binary.Varint(buf)
|
||||
return x, err
|
||||
}
|
||||
|
||||
// writeSlice writes data[:n] to w and returns n.
|
||||
// It uses buf to buffer the write.
|
||||
func writeSlice(w io.Writer, buf []byte, data ints) (n int, err error) {
|
||||
// encode as many elements as fit into buf
|
||||
p := binary.MaxVarintLen64
|
||||
m := data.len()
|
||||
for ; n < m && p+binary.MaxVarintLen64 <= len(buf); n++ {
|
||||
p += binary.PutUvarint(buf[p:], uint64(data.get(n)))
|
||||
}
|
||||
|
||||
// update buffer size
|
||||
binary.PutVarint(buf, int64(p))
|
||||
|
||||
// write buffer
|
||||
_, err = w.Write(buf[0:p])
|
||||
return
|
||||
}
|
||||
|
||||
var errTooBig = errors.New("suffixarray: data too large")
|
||||
|
||||
// readSlice reads data[:n] from r and returns n.
|
||||
// It uses buf to buffer the read.
|
||||
func readSlice(r io.Reader, buf []byte, data ints) (n int, err error) {
|
||||
// read buffer size
|
||||
var size64 int64
|
||||
size64, err = readInt(r, buf)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if int64(int(size64)) != size64 || int(size64) < 0 {
|
||||
// We never write chunks this big anyway.
|
||||
return 0, errTooBig
|
||||
}
|
||||
size := int(size64)
|
||||
|
||||
// read buffer w/o the size
|
||||
if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// decode as many elements as present in buf
|
||||
for p := binary.MaxVarintLen64; p < size; n++ {
|
||||
x, w := binary.Uvarint(buf[p:])
|
||||
data.set(n, int64(x))
|
||||
p += w
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore
|
||||
|
||||
// Read reads the index from r into x; x must not be nil.
|
||||
func (x *Index) Read(r io.Reader) error {
|
||||
// buffer for all reads
|
||||
buf := make([]byte, bufSize)
|
||||
|
||||
// read length
|
||||
n64, err := readInt(r, buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if int64(int(n64)) != n64 || int(n64) < 0 {
|
||||
return errTooBig
|
||||
}
|
||||
n := int(n64)
|
||||
|
||||
// allocate space
|
||||
if 2*n < cap(x.data) || cap(x.data) < n || x.sa.int32 != nil && n > maxData32 || x.sa.int64 != nil && n <= maxData32 {
|
||||
// new data is significantly smaller or larger than
|
||||
// existing buffers - allocate new ones
|
||||
x.data = make([]byte, n)
|
||||
x.sa.int32 = nil
|
||||
x.sa.int64 = nil
|
||||
if n <= maxData32 {
|
||||
x.sa.int32 = make([]int32, n)
|
||||
} else {
|
||||
x.sa.int64 = make([]int64, n)
|
||||
}
|
||||
} else {
|
||||
// re-use existing buffers
|
||||
x.data = x.data[0:n]
|
||||
x.sa = x.sa.slice(0, n)
|
||||
}
|
||||
|
||||
// read data
|
||||
if _, err := io.ReadFull(r, x.data); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// read index
|
||||
sa := x.sa
|
||||
for sa.len() > 0 {
|
||||
n, err := readSlice(r, buf, sa)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
sa = sa.slice(n, sa.len())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Write writes the index x to w.
|
||||
func (x *Index) Write(w io.Writer) error {
|
||||
// buffer for all writes
|
||||
buf := make([]byte, bufSize)
|
||||
|
||||
// write length
|
||||
if err := writeInt(w, buf, len(x.data)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// write data
|
||||
if _, err := w.Write(x.data); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// write index
|
||||
sa := x.sa
|
||||
for sa.len() > 0 {
|
||||
n, err := writeSlice(w, buf, sa)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
sa = sa.slice(n, sa.len())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Bytes returns the data over which the index was created.
|
||||
// It must not be modified.
|
||||
func (x *Index) Bytes() []byte {
|
||||
return x.data
|
||||
}
|
||||
|
||||
func (x *Index) at(i int) []byte {
|
||||
return x.data[x.sa.get(i):]
|
||||
}
|
||||
|
||||
// lookupAll returns a slice into the matching region of the index.
|
||||
// The runtime is O(log(N)*len(s)).
|
||||
func (x *Index) lookupAll(s []byte) ints {
|
||||
// find matching suffix index range [i:j]
|
||||
// find the first index where s would be the prefix
|
||||
i := sort.Search(x.sa.len(), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
|
||||
// starting at i, find the first index at which s is not a prefix
|
||||
j := i + sort.Search(x.sa.len()-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
|
||||
return x.sa.slice(i, j)
|
||||
}
|
||||
|
||||
// Lookup returns an unsorted list of at most n indices where the byte string s
|
||||
// occurs in the indexed data. If n < 0, all occurrences are returned.
|
||||
// The result is nil if s is empty, s is not found, or n == 0.
|
||||
// Lookup time is O(log(N)*len(s) + len(result)) where N is the
|
||||
// size of the indexed data.
|
||||
func (x *Index) Lookup(s []byte, n int) (result []int) {
|
||||
if len(s) > 0 && n != 0 {
|
||||
matches := x.lookupAll(s)
|
||||
count := matches.len()
|
||||
if n < 0 || count < n {
|
||||
n = count
|
||||
}
|
||||
// 0 <= n <= count
|
||||
if n > 0 {
|
||||
result = make([]int, n)
|
||||
if matches.int32 != nil {
|
||||
for i := range result {
|
||||
result[i] = int(matches.int32[i])
|
||||
}
|
||||
} else {
|
||||
for i := range result {
|
||||
result[i] = int(matches.int64[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// FindAllIndex returns a sorted list of non-overlapping matches of the
|
||||
// regular expression r, where a match is a pair of indices specifying
|
||||
// the matched slice of x.Bytes(). If n < 0, all matches are returned
|
||||
// in successive order. Otherwise, at most n matches are returned and
|
||||
// they may not be successive. The result is nil if there are no matches,
|
||||
// or if n == 0.
|
||||
func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
|
||||
// a non-empty literal prefix is used to determine possible
|
||||
// match start indices with Lookup
|
||||
prefix, complete := r.LiteralPrefix()
|
||||
lit := []byte(prefix)
|
||||
|
||||
// worst-case scenario: no literal prefix
|
||||
if prefix == "" {
|
||||
return r.FindAllIndex(x.data, n)
|
||||
}
|
||||
|
||||
// if regexp is a literal just use Lookup and convert its
|
||||
// result into match pairs
|
||||
if complete {
|
||||
// Lookup returns indices that may belong to overlapping matches.
|
||||
// After eliminating them, we may end up with fewer than n matches.
|
||||
// If we don't have enough at the end, redo the search with an
|
||||
// increased value n1, but only if Lookup returned all the requested
|
||||
// indices in the first place (if it returned fewer than that then
|
||||
// there cannot be more).
|
||||
for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
|
||||
indices := x.Lookup(lit, n1)
|
||||
if len(indices) == 0 {
|
||||
return
|
||||
}
|
||||
slices.Sort(indices)
|
||||
pairs := make([]int, 2*len(indices))
|
||||
result = make([][]int, len(indices))
|
||||
count := 0
|
||||
prev := 0
|
||||
for _, i := range indices {
|
||||
if count == n {
|
||||
break
|
||||
}
|
||||
// ignore indices leading to overlapping matches
|
||||
if prev <= i {
|
||||
j := 2 * count
|
||||
pairs[j+0] = i
|
||||
pairs[j+1] = i + len(lit)
|
||||
result[count] = pairs[j : j+2]
|
||||
count++
|
||||
prev = i + len(lit)
|
||||
}
|
||||
}
|
||||
result = result[0:count]
|
||||
if len(result) >= n || len(indices) != n1 {
|
||||
// found all matches or there's no chance to find more
|
||||
// (n and n1 can be negative)
|
||||
break
|
||||
}
|
||||
}
|
||||
if len(result) == 0 {
|
||||
result = nil
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// regexp has a non-empty literal prefix; Lookup(lit) computes
|
||||
// the indices of possible complete matches; use these as starting
|
||||
// points for anchored searches
|
||||
// (regexp "^" matches beginning of input, not beginning of line)
|
||||
r = regexp.MustCompile("^" + r.String()) // compiles because r compiled
|
||||
|
||||
// same comment about Lookup applies here as in the loop above
|
||||
for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
|
||||
indices := x.Lookup(lit, n1)
|
||||
if len(indices) == 0 {
|
||||
return
|
||||
}
|
||||
slices.Sort(indices)
|
||||
result = result[0:0]
|
||||
prev := 0
|
||||
for _, i := range indices {
|
||||
if len(result) == n {
|
||||
break
|
||||
}
|
||||
m := r.FindIndex(x.data[i:]) // anchored search - will not run off
|
||||
// ignore indices leading to overlapping matches
|
||||
if m != nil && prev <= i {
|
||||
m[0] = i // correct m
|
||||
m[1] += i
|
||||
result = append(result, m)
|
||||
prev = m[1]
|
||||
}
|
||||
}
|
||||
if len(result) >= n || len(indices) != n1 {
|
||||
// found all matches or there's no chance to find more
|
||||
// (n and n1 can be negative)
|
||||
break
|
||||
}
|
||||
}
|
||||
if len(result) == 0 {
|
||||
result = nil
|
||||
}
|
||||
return
|
||||
}
|
||||
616
src/index/suffixarray/suffixarray_test.go
Normal file
616
src/index/suffixarray/suffixarray_test.go
Normal file
@@ -0,0 +1,616 @@
|
||||
// Copyright 2010 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package suffixarray
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"math/rand"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"slices"
|
||||
"sort"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
type testCase struct {
|
||||
name string // name of test case
|
||||
source string // source to index
|
||||
patterns []string // patterns to lookup
|
||||
}
|
||||
|
||||
var testCases = []testCase{
|
||||
{
|
||||
"empty string",
|
||||
"",
|
||||
[]string{
|
||||
"",
|
||||
"foo",
|
||||
"(foo)",
|
||||
".*",
|
||||
"a*",
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
"all a's",
|
||||
"aaaaaaaaaa", // 10 a's
|
||||
[]string{
|
||||
"",
|
||||
"a",
|
||||
"aa",
|
||||
"aaa",
|
||||
"aaaa",
|
||||
"aaaaa",
|
||||
"aaaaaa",
|
||||
"aaaaaaa",
|
||||
"aaaaaaaa",
|
||||
"aaaaaaaaa",
|
||||
"aaaaaaaaaa",
|
||||
"aaaaaaaaaaa", // 11 a's
|
||||
".",
|
||||
".*",
|
||||
"a+",
|
||||
"aa+",
|
||||
"aaaa[b]?",
|
||||
"aaa*",
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
"abc",
|
||||
"abc",
|
||||
[]string{
|
||||
"a",
|
||||
"b",
|
||||
"c",
|
||||
"ab",
|
||||
"bc",
|
||||
"abc",
|
||||
"a.c",
|
||||
"a(b|c)",
|
||||
"abc?",
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
"barbara*3",
|
||||
"barbarabarbarabarbara",
|
||||
[]string{
|
||||
"a",
|
||||
"bar",
|
||||
"rab",
|
||||
"arab",
|
||||
"barbar",
|
||||
"bara?bar",
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
"typing drill",
|
||||
"Now is the time for all good men to come to the aid of their country.",
|
||||
[]string{
|
||||
"Now",
|
||||
"the time",
|
||||
"to come the aid",
|
||||
"is the time for all good men to come to the aid of their",
|
||||
"to (come|the)?",
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
"godoc simulation",
|
||||
"package main\n\nimport(\n \"rand\"\n ",
|
||||
[]string{},
|
||||
},
|
||||
}
|
||||
|
||||
// find all occurrences of s in source; report at most n occurrences
|
||||
func find(src, s string, n int) []int {
|
||||
var res []int
|
||||
if s != "" && n != 0 {
|
||||
// find at most n occurrences of s in src
|
||||
for i := -1; n < 0 || len(res) < n; {
|
||||
j := strings.Index(src[i+1:], s)
|
||||
if j < 0 {
|
||||
break
|
||||
}
|
||||
i += j + 1
|
||||
res = append(res, i)
|
||||
}
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) {
|
||||
res := x.Lookup([]byte(s), n)
|
||||
exp := find(tc.source, s, n)
|
||||
|
||||
// check that the lengths match
|
||||
if len(res) != len(exp) {
|
||||
t.Errorf("test %q, lookup %q (n = %d): expected %d results; got %d", tc.name, s, n, len(exp), len(res))
|
||||
}
|
||||
|
||||
// if n >= 0 the number of results is limited --- unless n >= all results,
|
||||
// we may obtain different positions from the Index and from find (because
|
||||
// Index may not find the results in the same order as find) => in general
|
||||
// we cannot simply check that the res and exp lists are equal
|
||||
|
||||
// check that each result is in fact a correct match and there are no duplicates
|
||||
slices.Sort(res)
|
||||
for i, r := range res {
|
||||
if r < 0 || len(tc.source) <= r {
|
||||
t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source))
|
||||
} else if !strings.HasPrefix(tc.source[r:], s) {
|
||||
t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r)
|
||||
}
|
||||
if i > 0 && res[i-1] == r {
|
||||
t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r)
|
||||
}
|
||||
}
|
||||
|
||||
if n < 0 {
|
||||
// all results computed - sorted res and exp must be equal
|
||||
for i, r := range res {
|
||||
e := exp[i]
|
||||
if r != e {
|
||||
t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) {
|
||||
res := x.FindAllIndex(rx, n)
|
||||
exp := rx.FindAllStringIndex(tc.source, n)
|
||||
|
||||
// check that the lengths match
|
||||
if len(res) != len(exp) {
|
||||
t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res))
|
||||
}
|
||||
|
||||
// if n >= 0 the number of results is limited --- unless n >= all results,
|
||||
// we may obtain different positions from the Index and from regexp (because
|
||||
// Index may not find the results in the same order as regexp) => in general
|
||||
// we cannot simply check that the res and exp lists are equal
|
||||
|
||||
// check that each result is in fact a correct match and the result is sorted
|
||||
for i, r := range res {
|
||||
if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] {
|
||||
t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1])
|
||||
} else if !rx.MatchString(tc.source[r[0]:r[1]]) {
|
||||
t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1])
|
||||
}
|
||||
}
|
||||
|
||||
if n < 0 {
|
||||
// all results computed - sorted res and exp must be equal
|
||||
for i, r := range res {
|
||||
e := exp[i]
|
||||
if r[0] != e[0] || r[1] != e[1] {
|
||||
t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]",
|
||||
tc.name, rx, i, e[0], e[1], r[0], r[1])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func testLookups(t *testing.T, tc *testCase, x *Index, n int) {
|
||||
for _, pat := range tc.patterns {
|
||||
testLookup(t, tc, x, pat, n)
|
||||
if rx, err := regexp.Compile(pat); err == nil {
|
||||
testFindAllIndex(t, tc, x, rx, n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// index is used to hide the sort.Interface
|
||||
type index Index
|
||||
|
||||
func (x *index) Len() int { return x.sa.len() }
|
||||
func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
|
||||
func (x *index) Swap(i, j int) {
|
||||
if x.sa.int32 != nil {
|
||||
x.sa.int32[i], x.sa.int32[j] = x.sa.int32[j], x.sa.int32[i]
|
||||
} else {
|
||||
x.sa.int64[i], x.sa.int64[j] = x.sa.int64[j], x.sa.int64[i]
|
||||
}
|
||||
}
|
||||
|
||||
func (x *index) at(i int) []byte {
|
||||
return x.data[x.sa.get(i):]
|
||||
}
|
||||
|
||||
func testConstruction(t *testing.T, tc *testCase, x *Index) {
|
||||
if !sort.IsSorted((*index)(x)) {
|
||||
t.Errorf("failed testConstruction %s", tc.name)
|
||||
}
|
||||
}
|
||||
|
||||
func equal(x, y *Index) bool {
|
||||
if !bytes.Equal(x.data, y.data) {
|
||||
return false
|
||||
}
|
||||
if x.sa.len() != y.sa.len() {
|
||||
return false
|
||||
}
|
||||
n := x.sa.len()
|
||||
for i := 0; i < n; i++ {
|
||||
if x.sa.get(i) != y.sa.get(i) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// returns the serialized index size
|
||||
func testSaveRestore(t *testing.T, tc *testCase, x *Index) int {
|
||||
var buf bytes.Buffer
|
||||
if err := x.Write(&buf); err != nil {
|
||||
t.Errorf("failed writing index %s (%s)", tc.name, err)
|
||||
}
|
||||
size := buf.Len()
|
||||
var y Index
|
||||
if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
|
||||
t.Errorf("failed reading index %s (%s)", tc.name, err)
|
||||
}
|
||||
if !equal(x, &y) {
|
||||
t.Errorf("restored index doesn't match saved index %s", tc.name)
|
||||
}
|
||||
|
||||
old := maxData32
|
||||
defer func() {
|
||||
maxData32 = old
|
||||
}()
|
||||
// Reread as forced 32.
|
||||
y = Index{}
|
||||
maxData32 = realMaxData32
|
||||
if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
|
||||
t.Errorf("failed reading index %s (%s)", tc.name, err)
|
||||
}
|
||||
if !equal(x, &y) {
|
||||
t.Errorf("restored index doesn't match saved index %s", tc.name)
|
||||
}
|
||||
|
||||
// Reread as forced 64.
|
||||
y = Index{}
|
||||
maxData32 = -1
|
||||
if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
|
||||
t.Errorf("failed reading index %s (%s)", tc.name, err)
|
||||
}
|
||||
if !equal(x, &y) {
|
||||
t.Errorf("restored index doesn't match saved index %s", tc.name)
|
||||
}
|
||||
|
||||
return size
|
||||
}
|
||||
|
||||
func testIndex(t *testing.T) {
|
||||
for _, tc := range testCases {
|
||||
x := New([]byte(tc.source))
|
||||
testConstruction(t, &tc, x)
|
||||
testSaveRestore(t, &tc, x)
|
||||
testLookups(t, &tc, x, 0)
|
||||
testLookups(t, &tc, x, 1)
|
||||
testLookups(t, &tc, x, 10)
|
||||
testLookups(t, &tc, x, 2e9)
|
||||
testLookups(t, &tc, x, -1)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIndex32(t *testing.T) {
|
||||
testIndex(t)
|
||||
}
|
||||
|
||||
func TestIndex64(t *testing.T) {
|
||||
maxData32 = -1
|
||||
defer func() {
|
||||
maxData32 = realMaxData32
|
||||
}()
|
||||
testIndex(t)
|
||||
}
|
||||
|
||||
func TestNew32(t *testing.T) {
|
||||
test(t, func(x []byte) []int {
|
||||
sa := make([]int32, len(x))
|
||||
text_32(x, sa)
|
||||
out := make([]int, len(sa))
|
||||
for i, v := range sa {
|
||||
out[i] = int(v)
|
||||
}
|
||||
return out
|
||||
})
|
||||
}
|
||||
|
||||
func TestNew64(t *testing.T) {
|
||||
test(t, func(x []byte) []int {
|
||||
sa := make([]int64, len(x))
|
||||
text_64(x, sa)
|
||||
out := make([]int, len(sa))
|
||||
for i, v := range sa {
|
||||
out[i] = int(v)
|
||||
}
|
||||
return out
|
||||
})
|
||||
}
|
||||
|
||||
// test tests an arbitrary suffix array construction function.
|
||||
// Generates many inputs, builds and checks suffix arrays.
|
||||
func test(t *testing.T, build func([]byte) []int) {
|
||||
t.Run("ababab...", func(t *testing.T) {
|
||||
// Very repetitive input has numLMS = len(x)/2-1
|
||||
// at top level, the largest it can be.
|
||||
// But maxID is only two (aba and ab$).
|
||||
size := 100000
|
||||
if testing.Short() {
|
||||
size = 10000
|
||||
}
|
||||
x := make([]byte, size)
|
||||
for i := range x {
|
||||
x[i] = "ab"[i%2]
|
||||
}
|
||||
testSA(t, x, build)
|
||||
})
|
||||
|
||||
t.Run("forcealloc", func(t *testing.T) {
|
||||
// Construct a pathological input that forces
|
||||
// recurse_32 to allocate a new temporary buffer.
|
||||
// The input must have more than N/3 LMS-substrings,
|
||||
// which we arrange by repeating an SLSLSLSLSLSL pattern
|
||||
// like ababab... above, but then we must also arrange
|
||||
// for a large number of distinct LMS-substrings.
|
||||
// We use this pattern:
|
||||
// 1 255 1 254 1 253 1 ... 1 2 1 255 2 254 2 253 2 252 2 ...
|
||||
// This gives approximately 2¹⁵ distinct LMS-substrings.
|
||||
// We need to repeat at least one substring, though,
|
||||
// or else the recursion can be bypassed entirely.
|
||||
x := make([]byte, 100000, 100001)
|
||||
lo := byte(1)
|
||||
hi := byte(255)
|
||||
for i := range x {
|
||||
if i%2 == 0 {
|
||||
x[i] = lo
|
||||
} else {
|
||||
x[i] = hi
|
||||
hi--
|
||||
if hi <= lo {
|
||||
lo++
|
||||
if lo == 0 {
|
||||
lo = 1
|
||||
}
|
||||
hi = 255
|
||||
}
|
||||
}
|
||||
}
|
||||
x[:cap(x)][len(x)] = 0 // for sais.New
|
||||
testSA(t, x, build)
|
||||
})
|
||||
|
||||
t.Run("exhaustive2", func(t *testing.T) {
|
||||
// All inputs over {0,1} up to length 21.
|
||||
// Runs in about 10 seconds on my laptop.
|
||||
x := make([]byte, 30)
|
||||
numFail := 0
|
||||
for n := 0; n <= 21; n++ {
|
||||
if n > 12 && testing.Short() {
|
||||
break
|
||||
}
|
||||
x[n] = 0 // for sais.New
|
||||
testRec(t, x[:n], 0, 2, &numFail, build)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("exhaustive3", func(t *testing.T) {
|
||||
// All inputs over {0,1,2} up to length 14.
|
||||
// Runs in about 10 seconds on my laptop.
|
||||
x := make([]byte, 30)
|
||||
numFail := 0
|
||||
for n := 0; n <= 14; n++ {
|
||||
if n > 8 && testing.Short() {
|
||||
break
|
||||
}
|
||||
x[n] = 0 // for sais.New
|
||||
testRec(t, x[:n], 0, 3, &numFail, build)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// testRec fills x[i:] with all possible combinations of values in [1,max]
|
||||
// and then calls testSA(t, x, build) for each one.
|
||||
func testRec(t *testing.T, x []byte, i, max int, numFail *int, build func([]byte) []int) {
|
||||
if i < len(x) {
|
||||
for x[i] = 1; x[i] <= byte(max); x[i]++ {
|
||||
testRec(t, x, i+1, max, numFail, build)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if !testSA(t, x, build) {
|
||||
*numFail++
|
||||
if *numFail >= 10 {
|
||||
t.Errorf("stopping after %d failures", *numFail)
|
||||
t.FailNow()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// testSA tests the suffix array build function on the input x.
|
||||
// It constructs the suffix array and then checks that it is correct.
|
||||
func testSA(t *testing.T, x []byte, build func([]byte) []int) bool {
|
||||
defer func() {
|
||||
if e := recover(); e != nil {
|
||||
t.Logf("build %v", x)
|
||||
panic(e)
|
||||
}
|
||||
}()
|
||||
sa := build(x)
|
||||
if len(sa) != len(x) {
|
||||
t.Errorf("build %v: len(sa) = %d, want %d", x, len(sa), len(x))
|
||||
return false
|
||||
}
|
||||
for i := 0; i+1 < len(sa); i++ {
|
||||
if sa[i] < 0 || sa[i] >= len(x) || sa[i+1] < 0 || sa[i+1] >= len(x) {
|
||||
t.Errorf("build %s: sa out of range: %v\n", x, sa)
|
||||
return false
|
||||
}
|
||||
if bytes.Compare(x[sa[i]:], x[sa[i+1]:]) >= 0 {
|
||||
t.Errorf("build %v -> %v\nsa[%d:] = %d,%d out of order", x, sa, i, sa[i], sa[i+1])
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
var (
|
||||
benchdata = make([]byte, 1e6)
|
||||
benchrand = make([]byte, 1e6)
|
||||
)
|
||||
|
||||
// Of all possible inputs, the random bytes have the least amount of substring
|
||||
// repetition, and the repeated bytes have the most. For most algorithms,
|
||||
// the running time of every input will be between these two.
|
||||
func benchmarkNew(b *testing.B, random bool) {
|
||||
b.ReportAllocs()
|
||||
b.StopTimer()
|
||||
data := benchdata
|
||||
if random {
|
||||
data = benchrand
|
||||
if data[0] == 0 {
|
||||
for i := range data {
|
||||
data[i] = byte(rand.Intn(256))
|
||||
}
|
||||
}
|
||||
}
|
||||
b.StartTimer()
|
||||
b.SetBytes(int64(len(data)))
|
||||
for i := 0; i < b.N; i++ {
|
||||
New(data)
|
||||
}
|
||||
}
|
||||
|
||||
func makeText(name string) ([]byte, error) {
|
||||
var data []byte
|
||||
switch name {
|
||||
case "opticks":
|
||||
var err error
|
||||
data, err = os.ReadFile("../../testdata/Isaac.Newton-Opticks.txt")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
case "go":
|
||||
err := filepath.WalkDir("../..", func(path string, info fs.DirEntry, err error) error {
|
||||
if err == nil && strings.HasSuffix(path, ".go") && !info.IsDir() {
|
||||
file, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
data = append(data, file...)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
case "zero":
|
||||
data = make([]byte, 50e6)
|
||||
case "rand":
|
||||
data = make([]byte, 50e6)
|
||||
for i := range data {
|
||||
data[i] = byte(rand.Intn(256))
|
||||
}
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func setBits(bits int) (cleanup func()) {
|
||||
if bits == 32 {
|
||||
maxData32 = realMaxData32
|
||||
} else {
|
||||
maxData32 = -1 // force use of 64-bit code
|
||||
}
|
||||
return func() {
|
||||
maxData32 = realMaxData32
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNew(b *testing.B) {
|
||||
for _, text := range []string{"opticks", "go", "zero", "rand"} {
|
||||
b.Run("text="+text, func(b *testing.B) {
|
||||
data, err := makeText(text)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
if testing.Short() && len(data) > 5e6 {
|
||||
data = data[:5e6]
|
||||
}
|
||||
for _, size := range []int{100e3, 500e3, 1e6, 5e6, 10e6, 50e6} {
|
||||
if len(data) < size {
|
||||
continue
|
||||
}
|
||||
data := data[:size]
|
||||
name := fmt.Sprintf("%dK", size/1e3)
|
||||
if size >= 1e6 {
|
||||
name = fmt.Sprintf("%dM", size/1e6)
|
||||
}
|
||||
b.Run("size="+name, func(b *testing.B) {
|
||||
for _, bits := range []int{32, 64} {
|
||||
if ^uint(0) == 0xffffffff && bits == 64 {
|
||||
continue
|
||||
}
|
||||
b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) {
|
||||
cleanup := setBits(bits)
|
||||
defer cleanup()
|
||||
|
||||
b.SetBytes(int64(len(data)))
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
New(data)
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSaveRestore(b *testing.B) {
|
||||
r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence
|
||||
data := make([]byte, 1<<20) // 1MB of data to index
|
||||
for i := range data {
|
||||
data[i] = byte(r.Intn(256))
|
||||
}
|
||||
for _, bits := range []int{32, 64} {
|
||||
if ^uint(0) == 0xffffffff && bits == 64 {
|
||||
continue
|
||||
}
|
||||
b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) {
|
||||
cleanup := setBits(bits)
|
||||
defer cleanup()
|
||||
|
||||
b.StopTimer()
|
||||
x := New(data)
|
||||
size := testSaveRestore(nil, nil, x) // verify correctness
|
||||
buf := bytes.NewBuffer(make([]byte, size)) // avoid growing
|
||||
b.SetBytes(int64(size))
|
||||
b.StartTimer()
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
buf.Reset()
|
||||
if err := x.Write(buf); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
var y Index
|
||||
if err := y.Read(buf); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user