Initial commit: Go 1.23 release state

This commit is contained in:
Vorapol Rinsatitnon
2024-09-21 23:49:08 +10:00
commit 17cd57a668
13231 changed files with 3114330 additions and 0 deletions

976
src/regexp/all_test.go Normal file
View File

@@ -0,0 +1,976 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"reflect"
"regexp/syntax"
"slices"
"strings"
"testing"
"unicode/utf8"
)
var goodRe = []string{
``,
`.`,
`^.$`,
`a`,
`a*`,
`a+`,
`a?`,
`a|b`,
`a*|b*`,
`(a*|b)(c*|d)`,
`[a-z]`,
`[a-abc-c\-\]\[]`,
`[a-z]+`,
`[abc]`,
`[^1234]`,
`[^\n]`,
`\!\\`,
}
type stringError struct {
re string
err string
}
var badRe = []stringError{
{`*`, "missing argument to repetition operator: `*`"},
{`+`, "missing argument to repetition operator: `+`"},
{`?`, "missing argument to repetition operator: `?`"},
{`(abc`, "missing closing ): `(abc`"},
{`abc)`, "unexpected ): `abc)`"},
{`x[a-z`, "missing closing ]: `[a-z`"},
{`[z-a]`, "invalid character class range: `z-a`"},
{`abc\`, "trailing backslash at end of expression"},
{`a**`, "invalid nested repetition operator: `**`"},
{`a*+`, "invalid nested repetition operator: `*+`"},
{`\x`, "invalid escape sequence: `\\x`"},
{strings.Repeat(`\pL`, 27000), "expression too large"},
}
func compileTest(t *testing.T, expr string, error string) *Regexp {
re, err := Compile(expr)
if error == "" && err != nil {
t.Error("compiling `", expr, "`; unexpected error: ", err.Error())
}
if error != "" && err == nil {
t.Error("compiling `", expr, "`; missing error")
} else if error != "" && !strings.Contains(err.Error(), error) {
t.Error("compiling `", expr, "`; wrong error: ", err.Error(), "; want ", error)
}
return re
}
func TestGoodCompile(t *testing.T) {
for i := 0; i < len(goodRe); i++ {
compileTest(t, goodRe[i], "")
}
}
func TestBadCompile(t *testing.T) {
for i := 0; i < len(badRe); i++ {
compileTest(t, badRe[i].re, badRe[i].err)
}
}
func matchTest(t *testing.T, test *FindTest) {
re := compileTest(t, test.pat, "")
if re == nil {
return
}
m := re.MatchString(test.text)
if m != (len(test.matches) > 0) {
t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0)
}
// now try bytes
m = re.Match([]byte(test.text))
if m != (len(test.matches) > 0) {
t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
}
}
func TestMatch(t *testing.T) {
for _, test := range findTests {
matchTest(t, &test)
}
}
func matchFunctionTest(t *testing.T, test *FindTest) {
m, err := MatchString(test.pat, test.text)
if err == nil {
return
}
if m != (len(test.matches) > 0) {
t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
}
}
func TestMatchFunction(t *testing.T) {
for _, test := range findTests {
matchFunctionTest(t, &test)
}
}
func copyMatchTest(t *testing.T, test *FindTest) {
re := compileTest(t, test.pat, "")
if re == nil {
return
}
m1 := re.MatchString(test.text)
m2 := re.Copy().MatchString(test.text)
if m1 != m2 {
t.Errorf("Copied Regexp match failure on %s: original gave %t; copy gave %t; should be %t",
test, m1, m2, len(test.matches) > 0)
}
}
func TestCopyMatch(t *testing.T) {
for _, test := range findTests {
copyMatchTest(t, &test)
}
}
type ReplaceTest struct {
pattern, replacement, input, output string
}
var replaceTests = []ReplaceTest{
// Test empty input and/or replacement, with pattern that matches the empty string.
{"", "", "", ""},
{"", "x", "", "x"},
{"", "", "abc", "abc"},
{"", "x", "abc", "xaxbxcx"},
// Test empty input and/or replacement, with pattern that does not match the empty string.
{"b", "", "", ""},
{"b", "x", "", ""},
{"b", "", "abc", "ac"},
{"b", "x", "abc", "axc"},
{"y", "", "", ""},
{"y", "x", "", ""},
{"y", "", "abc", "abc"},
{"y", "x", "abc", "abc"},
// Multibyte characters -- verify that we don't try to match in the middle
// of a character.
{"[a-c]*", "x", "\u65e5", "x\u65e5x"},
{"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"},
// Start and end of a string.
{"^[a-c]*", "x", "abcdabc", "xdabc"},
{"[a-c]*$", "x", "abcdabc", "abcdx"},
{"^[a-c]*$", "x", "abcdabc", "abcdabc"},
{"^[a-c]*", "x", "abc", "x"},
{"[a-c]*$", "x", "abc", "x"},
{"^[a-c]*$", "x", "abc", "x"},
{"^[a-c]*", "x", "dabce", "xdabce"},
{"[a-c]*$", "x", "dabce", "dabcex"},
{"^[a-c]*$", "x", "dabce", "dabce"},
{"^[a-c]*", "x", "", "x"},
{"[a-c]*$", "x", "", "x"},
{"^[a-c]*$", "x", "", "x"},
{"^[a-c]+", "x", "abcdabc", "xdabc"},
{"[a-c]+$", "x", "abcdabc", "abcdx"},
{"^[a-c]+$", "x", "abcdabc", "abcdabc"},
{"^[a-c]+", "x", "abc", "x"},
{"[a-c]+$", "x", "abc", "x"},
{"^[a-c]+$", "x", "abc", "x"},
{"^[a-c]+", "x", "dabce", "dabce"},
{"[a-c]+$", "x", "dabce", "dabce"},
{"^[a-c]+$", "x", "dabce", "dabce"},
{"^[a-c]+", "x", "", ""},
{"[a-c]+$", "x", "", ""},
{"^[a-c]+$", "x", "", ""},
// Other cases.
{"abc", "def", "abcdefg", "defdefg"},
{"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"},
{"abc", "", "abcdabc", "d"},
{"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"},
{"abc", "d", "", ""},
{"abc", "d", "abc", "d"},
{".+", "x", "abc", "x"},
{"[a-c]*", "x", "def", "xdxexfx"},
{"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"},
{"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"},
// Substitutions
{"a+", "($0)", "banana", "b(a)n(a)n(a)"},
{"a+", "(${0})", "banana", "b(a)n(a)n(a)"},
{"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"},
{"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"},
{"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, world"},
{"hello, (.+)", "goodbye, $1x", "hello, world", "goodbye, "},
{"hello, (.+)", "goodbye, ${1}x", "hello, world", "goodbye, worldx"},
{"hello, (.+)", "<$0><$1><$2><$3>", "hello, world", "<hello, world><world><><>"},
{"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, world!"},
{"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, world"},
{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "hihihi"},
{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "byebyebye"},
{"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", ""},
{"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "hiyz"},
{"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $x"},
{"a+", "${oops", "aaa", "${oops"},
{"a+", "$$", "aaa", "$"},
{"a+", "$", "aaa", "$"},
// Substitution when subexpression isn't found
{"(x)?", "$1", "123", "123"},
{"abc", "$1", "123", "123"},
// Substitutions involving a (x){0}
{"(a)(b){0}(c)", ".$1|$3.", "xacxacx", "x.a|c.x.a|c.x"},
{"(a)(((b))){0}c", ".$1.", "xacxacx", "x.a.x.a.x"},
{"((a(b){0}){3}){5}(h)", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"},
{"((a(b){0}){3}){5}h", "y caramb$2", "say aaaaaaaaaaaaaaaah", "say ay caramba"},
}
var replaceLiteralTests = []ReplaceTest{
// Substitutions
{"a+", "($0)", "banana", "b($0)n($0)n($0)"},
{"a+", "(${0})", "banana", "b(${0})n(${0})n(${0})"},
{"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"},
{"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"},
{"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, ${1}"},
{"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, $noun!"},
{"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, ${noun}"},
{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "$x$x$x"},
{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "$x$x$x"},
{"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", "$xyz"},
{"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "${x}yz"},
{"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $$x"},
{"a+", "${oops", "aaa", "${oops"},
{"a+", "$$", "aaa", "$$"},
{"a+", "$", "aaa", "$"},
}
type ReplaceFuncTest struct {
pattern string
replacement func(string) string
input, output string
}
var replaceFuncTests = []ReplaceFuncTest{
{"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"},
{"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"},
{"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"},
}
func TestReplaceAll(t *testing.T) {
for _, tc := range replaceTests {
re, err := Compile(tc.pattern)
if err != nil {
t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
continue
}
actual := re.ReplaceAllString(tc.input, tc.replacement)
if actual != tc.output {
t.Errorf("%q.ReplaceAllString(%q,%q) = %q; want %q",
tc.pattern, tc.input, tc.replacement, actual, tc.output)
}
// now try bytes
actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement)))
if actual != tc.output {
t.Errorf("%q.ReplaceAll(%q,%q) = %q; want %q",
tc.pattern, tc.input, tc.replacement, actual, tc.output)
}
}
}
func TestReplaceAllLiteral(t *testing.T) {
// Run ReplaceAll tests that do not have $ expansions.
for _, tc := range replaceTests {
if strings.Contains(tc.replacement, "$") {
continue
}
re, err := Compile(tc.pattern)
if err != nil {
t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
continue
}
actual := re.ReplaceAllLiteralString(tc.input, tc.replacement)
if actual != tc.output {
t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q",
tc.pattern, tc.input, tc.replacement, actual, tc.output)
}
// now try bytes
actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement)))
if actual != tc.output {
t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q",
tc.pattern, tc.input, tc.replacement, actual, tc.output)
}
}
// Run literal-specific tests.
for _, tc := range replaceLiteralTests {
re, err := Compile(tc.pattern)
if err != nil {
t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
continue
}
actual := re.ReplaceAllLiteralString(tc.input, tc.replacement)
if actual != tc.output {
t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q",
tc.pattern, tc.input, tc.replacement, actual, tc.output)
}
// now try bytes
actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement)))
if actual != tc.output {
t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q",
tc.pattern, tc.input, tc.replacement, actual, tc.output)
}
}
}
func TestReplaceAllFunc(t *testing.T) {
for _, tc := range replaceFuncTests {
re, err := Compile(tc.pattern)
if err != nil {
t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
continue
}
actual := re.ReplaceAllStringFunc(tc.input, tc.replacement)
if actual != tc.output {
t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q",
tc.pattern, tc.input, actual, tc.output)
}
// now try bytes
actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) }))
if actual != tc.output {
t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q",
tc.pattern, tc.input, actual, tc.output)
}
}
}
type MetaTest struct {
pattern, output, literal string
isLiteral bool
}
var metaTests = []MetaTest{
{``, ``, ``, true},
{`foo`, `foo`, `foo`, true},
{`日本語+`, `日本語\+`, `日本語`, false},
{`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator
{`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators
{`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false},
}
var literalPrefixTests = []MetaTest{
// See golang.org/issue/11175.
// output is unused.
{`^0^0$`, ``, `0`, false},
{`^0^`, ``, ``, false},
{`^0$`, ``, `0`, true},
{`$0^`, ``, ``, false},
{`$0$`, ``, ``, false},
{`^^0$$`, ``, ``, false},
{`^$^$`, ``, ``, false},
{`$$0^^`, ``, ``, false},
{`a\x{fffd}b`, ``, `a`, false},
{`\x{fffd}b`, ``, ``, false},
{"\ufffd", ``, ``, false},
}
func TestQuoteMeta(t *testing.T) {
for _, tc := range metaTests {
// Verify that QuoteMeta returns the expected string.
quoted := QuoteMeta(tc.pattern)
if quoted != tc.output {
t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`",
tc.pattern, quoted, tc.output)
continue
}
// Verify that the quoted string is in fact treated as expected
// by Compile -- i.e. that it matches the original, unquoted string.
if tc.pattern != "" {
re, err := Compile(quoted)
if err != nil {
t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err)
continue
}
src := "abc" + tc.pattern + "def"
repl := "xyz"
replaced := re.ReplaceAllString(src, repl)
expected := "abcxyzdef"
if replaced != expected {
t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`",
tc.pattern, src, repl, replaced, expected)
}
}
}
}
func TestLiteralPrefix(t *testing.T) {
for _, tc := range append(metaTests, literalPrefixTests...) {
// Literal method needs to scan the pattern.
re := MustCompile(tc.pattern)
str, complete := re.LiteralPrefix()
if complete != tc.isLiteral {
t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral)
}
if str != tc.literal {
t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal)
}
}
}
type subexpIndex struct {
name string
index int
}
type subexpCase struct {
input string
num int
names []string
indices []subexpIndex
}
var emptySubexpIndices = []subexpIndex{{"", -1}, {"missing", -1}}
var subexpCases = []subexpCase{
{``, 0, nil, emptySubexpIndices},
{`.*`, 0, nil, emptySubexpIndices},
{`abba`, 0, nil, emptySubexpIndices},
{`ab(b)a`, 1, []string{"", ""}, emptySubexpIndices},
{`ab(.*)a`, 1, []string{"", ""}, emptySubexpIndices},
{`(.*)ab(.*)a`, 2, []string{"", "", ""}, emptySubexpIndices},
{`(.*)(ab)(.*)a`, 3, []string{"", "", "", ""}, emptySubexpIndices},
{`(.*)((a)b)(.*)a`, 4, []string{"", "", "", "", ""}, emptySubexpIndices},
{`(.*)(\(ab)(.*)a`, 3, []string{"", "", "", ""}, emptySubexpIndices},
{`(.*)(\(a\)b)(.*)a`, 3, []string{"", "", "", ""}, emptySubexpIndices},
{`(?P<foo>.*)(?P<bar>(a)b)(?P<foo>.*)a`, 4, []string{"", "foo", "bar", "", "foo"}, []subexpIndex{{"", -1}, {"missing", -1}, {"foo", 1}, {"bar", 2}}},
}
func TestSubexp(t *testing.T) {
for _, c := range subexpCases {
re := MustCompile(c.input)
n := re.NumSubexp()
if n != c.num {
t.Errorf("%q: NumSubexp = %d, want %d", c.input, n, c.num)
continue
}
names := re.SubexpNames()
if len(names) != 1+n {
t.Errorf("%q: len(SubexpNames) = %d, want %d", c.input, len(names), n)
continue
}
if c.names != nil {
for i := 0; i < 1+n; i++ {
if names[i] != c.names[i] {
t.Errorf("%q: SubexpNames[%d] = %q, want %q", c.input, i, names[i], c.names[i])
}
}
}
for _, subexp := range c.indices {
index := re.SubexpIndex(subexp.name)
if index != subexp.index {
t.Errorf("%q: SubexpIndex(%q) = %d, want %d", c.input, subexp.name, index, subexp.index)
}
}
}
}
var splitTests = []struct {
s string
r string
n int
out []string
}{
{"foo:and:bar", ":", -1, []string{"foo", "and", "bar"}},
{"foo:and:bar", ":", 1, []string{"foo:and:bar"}},
{"foo:and:bar", ":", 2, []string{"foo", "and:bar"}},
{"foo:and:bar", "foo", -1, []string{"", ":and:bar"}},
{"foo:and:bar", "bar", -1, []string{"foo:and:", ""}},
{"foo:and:bar", "baz", -1, []string{"foo:and:bar"}},
{"baabaab", "a", -1, []string{"b", "", "b", "", "b"}},
{"baabaab", "a*", -1, []string{"b", "b", "b"}},
{"baabaab", "ba*", -1, []string{"", "", "", ""}},
{"foobar", "f*b*", -1, []string{"", "o", "o", "a", "r"}},
{"foobar", "f+.*b+", -1, []string{"", "ar"}},
{"foobooboar", "o{2}", -1, []string{"f", "b", "boar"}},
{"a,b,c,d,e,f", ",", 3, []string{"a", "b", "c,d,e,f"}},
{"a,b,c,d,e,f", ",", 0, nil},
{",", ",", -1, []string{"", ""}},
{",,,", ",", -1, []string{"", "", "", ""}},
{"", ",", -1, []string{""}},
{"", ".*", -1, []string{""}},
{"", ".+", -1, []string{""}},
{"", "", -1, []string{}},
{"foobar", "", -1, []string{"f", "o", "o", "b", "a", "r"}},
{"abaabaccadaaae", "a*", 5, []string{"", "b", "b", "c", "cadaaae"}},
{":x:y:z:", ":", -1, []string{"", "x", "y", "z", ""}},
}
func TestSplit(t *testing.T) {
for i, test := range splitTests {
re, err := Compile(test.r)
if err != nil {
t.Errorf("#%d: %q: compile error: %s", i, test.r, err.Error())
continue
}
split := re.Split(test.s, test.n)
if !slices.Equal(split, test.out) {
t.Errorf("#%d: %q: got %q; want %q", i, test.r, split, test.out)
}
if QuoteMeta(test.r) == test.r {
strsplit := strings.SplitN(test.s, test.r, test.n)
if !slices.Equal(split, strsplit) {
t.Errorf("#%d: Split(%q, %q, %d): regexp vs strings mismatch\nregexp=%q\nstrings=%q", i, test.s, test.r, test.n, split, strsplit)
}
}
}
}
// The following sequence of Match calls used to panic. See issue #12980.
func TestParseAndCompile(t *testing.T) {
expr := "a$"
s := "a\nb"
for i, tc := range []struct {
reFlags syntax.Flags
expMatch bool
}{
{syntax.Perl | syntax.OneLine, false},
{syntax.Perl &^ syntax.OneLine, true},
} {
parsed, err := syntax.Parse(expr, tc.reFlags)
if err != nil {
t.Fatalf("%d: parse: %v", i, err)
}
re, err := Compile(parsed.String())
if err != nil {
t.Fatalf("%d: compile: %v", i, err)
}
if match := re.MatchString(s); match != tc.expMatch {
t.Errorf("%d: %q.MatchString(%q)=%t; expected=%t", i, re, s, match, tc.expMatch)
}
}
}
// Check that one-pass cutoff does trigger.
func TestOnePassCutoff(t *testing.T) {
re, err := syntax.Parse(`^x{1,1000}y{1,1000}$`, syntax.Perl)
if err != nil {
t.Fatalf("parse: %v", err)
}
p, err := syntax.Compile(re.Simplify())
if err != nil {
t.Fatalf("compile: %v", err)
}
if compileOnePass(p) != nil {
t.Fatalf("makeOnePass succeeded; wanted nil")
}
}
// Check that the same machine can be used with the standard matcher
// and then the backtracker when there are no captures.
func TestSwitchBacktrack(t *testing.T) {
re := MustCompile(`a|b`)
long := make([]byte, maxBacktrackVector+1)
// The following sequence of Match calls used to panic. See issue #10319.
re.Match(long) // triggers standard matcher
re.Match(long[:1]) // triggers backtracker
}
func BenchmarkFind(b *testing.B) {
b.StopTimer()
re := MustCompile("a+b+")
wantSubs := "aaabb"
s := []byte("acbb" + wantSubs + "dd")
b.StartTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
subs := re.Find(s)
if string(subs) != wantSubs {
b.Fatalf("Find(%q) = %q; want %q", s, subs, wantSubs)
}
}
}
func BenchmarkFindAllNoMatches(b *testing.B) {
re := MustCompile("a+b+")
s := []byte("acddee")
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
all := re.FindAll(s, -1)
if all != nil {
b.Fatalf("FindAll(%q) = %q; want nil", s, all)
}
}
}
func BenchmarkFindString(b *testing.B) {
b.StopTimer()
re := MustCompile("a+b+")
wantSubs := "aaabb"
s := "acbb" + wantSubs + "dd"
b.StartTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
subs := re.FindString(s)
if subs != wantSubs {
b.Fatalf("FindString(%q) = %q; want %q", s, subs, wantSubs)
}
}
}
func BenchmarkFindSubmatch(b *testing.B) {
b.StopTimer()
re := MustCompile("a(a+b+)b")
wantSubs := "aaabb"
s := []byte("acbb" + wantSubs + "dd")
b.StartTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
subs := re.FindSubmatch(s)
if string(subs[0]) != wantSubs {
b.Fatalf("FindSubmatch(%q)[0] = %q; want %q", s, subs[0], wantSubs)
}
if string(subs[1]) != "aab" {
b.Fatalf("FindSubmatch(%q)[1] = %q; want %q", s, subs[1], "aab")
}
}
}
func BenchmarkFindStringSubmatch(b *testing.B) {
b.StopTimer()
re := MustCompile("a(a+b+)b")
wantSubs := "aaabb"
s := "acbb" + wantSubs + "dd"
b.StartTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
subs := re.FindStringSubmatch(s)
if subs[0] != wantSubs {
b.Fatalf("FindStringSubmatch(%q)[0] = %q; want %q", s, subs[0], wantSubs)
}
if subs[1] != "aab" {
b.Fatalf("FindStringSubmatch(%q)[1] = %q; want %q", s, subs[1], "aab")
}
}
}
func BenchmarkLiteral(b *testing.B) {
x := strings.Repeat("x", 50) + "y"
b.StopTimer()
re := MustCompile("y")
b.StartTimer()
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
b.Fatalf("no match!")
}
}
}
func BenchmarkNotLiteral(b *testing.B) {
x := strings.Repeat("x", 50) + "y"
b.StopTimer()
re := MustCompile(".y")
b.StartTimer()
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
b.Fatalf("no match!")
}
}
}
func BenchmarkMatchClass(b *testing.B) {
b.StopTimer()
x := strings.Repeat("xxxx", 20) + "w"
re := MustCompile("[abcdw]")
b.StartTimer()
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
b.Fatalf("no match!")
}
}
}
func BenchmarkMatchClass_InRange(b *testing.B) {
b.StopTimer()
// 'b' is between 'a' and 'c', so the charclass
// range checking is no help here.
x := strings.Repeat("bbbb", 20) + "c"
re := MustCompile("[ac]")
b.StartTimer()
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
b.Fatalf("no match!")
}
}
}
func BenchmarkReplaceAll(b *testing.B) {
x := "abcdefghijklmnopqrstuvwxyz"
b.StopTimer()
re := MustCompile("[cjrw]")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.ReplaceAllString(x, "")
}
}
func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
b.StopTimer()
x := []byte("abcdefghijklmnopqrstuvwxyz")
re := MustCompile("^zbc(d|e)")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) {
b.StopTimer()
x := []byte("abcdefghijklmnopqrstuvwxyz")
for i := 0; i < 15; i++ {
x = append(x, x...)
}
re := MustCompile("^zbc(d|e)")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkAnchoredShortMatch(b *testing.B) {
b.StopTimer()
x := []byte("abcdefghijklmnopqrstuvwxyz")
re := MustCompile("^.bc(d|e)")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkAnchoredLongMatch(b *testing.B) {
b.StopTimer()
x := []byte("abcdefghijklmnopqrstuvwxyz")
for i := 0; i < 15; i++ {
x = append(x, x...)
}
re := MustCompile("^.bc(d|e)")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkOnePassShortA(b *testing.B) {
b.StopTimer()
x := []byte("abcddddddeeeededd")
re := MustCompile("^.bc(d|e)*$")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkNotOnePassShortA(b *testing.B) {
b.StopTimer()
x := []byte("abcddddddeeeededd")
re := MustCompile(".bc(d|e)*$")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkOnePassShortB(b *testing.B) {
b.StopTimer()
x := []byte("abcddddddeeeededd")
re := MustCompile("^.bc(?:d|e)*$")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkNotOnePassShortB(b *testing.B) {
b.StopTimer()
x := []byte("abcddddddeeeededd")
re := MustCompile(".bc(?:d|e)*$")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkOnePassLongPrefix(b *testing.B) {
b.StopTimer()
x := []byte("abcdefghijklmnopqrstuvwxyz")
re := MustCompile("^abcdefghijklmnopqrstuvwxyz.*$")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkOnePassLongNotPrefix(b *testing.B) {
b.StopTimer()
x := []byte("abcdefghijklmnopqrstuvwxyz")
re := MustCompile("^.bcdefghijklmnopqrstuvwxyz.*$")
b.StartTimer()
for i := 0; i < b.N; i++ {
re.Match(x)
}
}
func BenchmarkMatchParallelShared(b *testing.B) {
x := []byte("this is a long line that contains foo bar baz")
re := MustCompile("foo (ba+r)? baz")
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
re.Match(x)
}
})
}
func BenchmarkMatchParallelCopied(b *testing.B) {
x := []byte("this is a long line that contains foo bar baz")
re := MustCompile("foo (ba+r)? baz")
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
re := re.Copy()
for pb.Next() {
re.Match(x)
}
})
}
var sink string
func BenchmarkQuoteMetaAll(b *testing.B) {
specials := make([]byte, 0)
for i := byte(0); i < utf8.RuneSelf; i++ {
if special(i) {
specials = append(specials, i)
}
}
s := string(specials)
b.SetBytes(int64(len(s)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
sink = QuoteMeta(s)
}
}
func BenchmarkQuoteMetaNone(b *testing.B) {
s := "abcdefghijklmnopqrstuvwxyz"
b.SetBytes(int64(len(s)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
sink = QuoteMeta(s)
}
}
var compileBenchData = []struct{ name, re string }{
{"Onepass", `^a.[l-nA-Cg-j]?e$`},
{"Medium", `^((a|b|[d-z0-9])*(日){4,5}.)+$`},
{"Hard", strings.Repeat(`((abc)*|`, 50) + strings.Repeat(`)`, 50)},
}
func BenchmarkCompile(b *testing.B) {
for _, data := range compileBenchData {
b.Run(data.name, func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
if _, err := Compile(data.re); err != nil {
b.Fatal(err)
}
}
})
}
}
func TestDeepEqual(t *testing.T) {
re1 := MustCompile("a.*b.*c.*d")
re2 := MustCompile("a.*b.*c.*d")
if !reflect.DeepEqual(re1, re2) { // has always been true, since Go 1.
t.Errorf("DeepEqual(re1, re2) = false, want true")
}
re1.MatchString("abcdefghijklmn")
if !reflect.DeepEqual(re1, re2) {
t.Errorf("DeepEqual(re1, re2) = false, want true")
}
re2.MatchString("abcdefghijklmn")
if !reflect.DeepEqual(re1, re2) {
t.Errorf("DeepEqual(re1, re2) = false, want true")
}
re2.MatchString(strings.Repeat("abcdefghijklmn", 100))
if !reflect.DeepEqual(re1, re2) {
t.Errorf("DeepEqual(re1, re2) = false, want true")
}
}
var minInputLenTests = []struct {
Regexp string
min int
}{
{``, 0},
{`a`, 1},
{`aa`, 2},
{`(aa)a`, 3},
{`(?:aa)a`, 3},
{`a?a`, 1},
{`(aaa)|(aa)`, 2},
{`(aa)+a`, 3},
{`(aa)*a`, 1},
{`(aa){3,5}`, 6},
{`[a-z]`, 1},
{``, 3},
}
func TestMinInputLen(t *testing.T) {
for _, tt := range minInputLenTests {
re, _ := syntax.Parse(tt.Regexp, syntax.Perl)
m := minInputLen(re)
if m != tt.min {
t.Errorf("regexp %#q has minInputLen %d, should be %d", tt.Regexp, m, tt.min)
}
}
}
func TestUnmarshalText(t *testing.T) {
unmarshaled := new(Regexp)
for i := range goodRe {
re := compileTest(t, goodRe[i], "")
marshaled, err := re.MarshalText()
if err != nil {
t.Errorf("regexp %#q failed to marshal: %s", re, err)
continue
}
if err := unmarshaled.UnmarshalText(marshaled); err != nil {
t.Errorf("regexp %#q failed to unmarshal: %s", re, err)
continue
}
if unmarshaled.String() != goodRe[i] {
t.Errorf("UnmarshalText returned unexpected value: %s", unmarshaled.String())
}
}
t.Run("invalid pattern", func(t *testing.T) {
re := new(Regexp)
err := re.UnmarshalText([]byte(`\`))
if err == nil {
t.Error("unexpected success")
}
})
}

365
src/regexp/backtrack.go Normal file
View File

@@ -0,0 +1,365 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// backtrack is a regular expression search with submatch
// tracking for small regular expressions and texts. It allocates
// a bit vector with (length of input) * (length of prog) bits,
// to make sure it never explores the same (character position, instruction)
// state multiple times. This limits the search to run in time linear in
// the length of the test.
//
// backtrack is a fast replacement for the NFA code on small
// regexps when onepass cannot be used.
package regexp
import (
"regexp/syntax"
"sync"
)
// A job is an entry on the backtracker's job stack. It holds
// the instruction pc and the position in the input.
type job struct {
pc uint32
arg bool
pos int
}
const (
visitedBits = 32
maxBacktrackProg = 500 // len(prog.Inst) <= max
maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
)
// bitState holds state for the backtracker.
type bitState struct {
end int
cap []int
matchcap []int
jobs []job
visited []uint32
inputs inputs
}
var bitStatePool sync.Pool
func newBitState() *bitState {
b, ok := bitStatePool.Get().(*bitState)
if !ok {
b = new(bitState)
}
return b
}
func freeBitState(b *bitState) {
b.inputs.clear()
bitStatePool.Put(b)
}
// maxBitStateLen returns the maximum length of a string to search with
// the backtracker using prog.
func maxBitStateLen(prog *syntax.Prog) int {
if !shouldBacktrack(prog) {
return 0
}
return maxBacktrackVector / len(prog.Inst)
}
// shouldBacktrack reports whether the program is too
// long for the backtracker to run.
func shouldBacktrack(prog *syntax.Prog) bool {
return len(prog.Inst) <= maxBacktrackProg
}
// reset resets the state of the backtracker.
// end is the end position in the input.
// ncap is the number of captures.
func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) {
b.end = end
if cap(b.jobs) == 0 {
b.jobs = make([]job, 0, 256)
} else {
b.jobs = b.jobs[:0]
}
visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
if cap(b.visited) < visitedSize {
b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
} else {
b.visited = b.visited[:visitedSize]
clear(b.visited) // set to 0
}
if cap(b.cap) < ncap {
b.cap = make([]int, ncap)
} else {
b.cap = b.cap[:ncap]
}
for i := range b.cap {
b.cap[i] = -1
}
if cap(b.matchcap) < ncap {
b.matchcap = make([]int, ncap)
} else {
b.matchcap = b.matchcap[:ncap]
}
for i := range b.matchcap {
b.matchcap[i] = -1
}
}
// shouldVisit reports whether the combination of (pc, pos) has not
// been visited yet.
func (b *bitState) shouldVisit(pc uint32, pos int) bool {
n := uint(int(pc)*(b.end+1) + pos)
if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
return false
}
b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
return true
}
// push pushes (pc, pos, arg) onto the job stack if it should be
// visited.
func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) {
// Only check shouldVisit when arg is false.
// When arg is true, we are continuing a previous visit.
if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) {
b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
}
}
// tryBacktrack runs a backtracking search starting at pos.
func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
longest := re.longest
b.push(re, pc, pos, false)
for len(b.jobs) > 0 {
l := len(b.jobs) - 1
// Pop job off the stack.
pc := b.jobs[l].pc
pos := b.jobs[l].pos
arg := b.jobs[l].arg
b.jobs = b.jobs[:l]
// Optimization: rather than push and pop,
// code that is going to Push and continue
// the loop simply updates ip, p, and arg
// and jumps to CheckAndLoop. We have to
// do the ShouldVisit check that Push
// would have, but we avoid the stack
// manipulation.
goto Skip
CheckAndLoop:
if !b.shouldVisit(pc, pos) {
continue
}
Skip:
inst := &re.prog.Inst[pc]
switch inst.Op {
default:
panic("bad inst")
case syntax.InstFail:
panic("unexpected InstFail")
case syntax.InstAlt:
// Cannot just
// b.push(inst.Out, pos, false)
// b.push(inst.Arg, pos, false)
// If during the processing of inst.Out, we encounter
// inst.Arg via another path, we want to process it then.
// Pushing it here will inhibit that. Instead, re-push
// inst with arg==true as a reminder to push inst.Arg out
// later.
if arg {
// Finished inst.Out; try inst.Arg.
arg = false
pc = inst.Arg
goto CheckAndLoop
} else {
b.push(re, pc, pos, true)
pc = inst.Out
goto CheckAndLoop
}
case syntax.InstAltMatch:
// One opcode consumes runes; the other leads to match.
switch re.prog.Inst[inst.Out].Op {
case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
// inst.Arg is the match.
b.push(re, inst.Arg, pos, false)
pc = inst.Arg
pos = b.end
goto CheckAndLoop
}
// inst.Out is the match - non-greedy
b.push(re, inst.Out, b.end, false)
pc = inst.Out
goto CheckAndLoop
case syntax.InstRune:
r, width := i.step(pos)
if !inst.MatchRune(r) {
continue
}
pos += width
pc = inst.Out
goto CheckAndLoop
case syntax.InstRune1:
r, width := i.step(pos)
if r != inst.Rune[0] {
continue
}
pos += width
pc = inst.Out
goto CheckAndLoop
case syntax.InstRuneAnyNotNL:
r, width := i.step(pos)
if r == '\n' || r == endOfText {
continue
}
pos += width
pc = inst.Out
goto CheckAndLoop
case syntax.InstRuneAny:
r, width := i.step(pos)
if r == endOfText {
continue
}
pos += width
pc = inst.Out
goto CheckAndLoop
case syntax.InstCapture:
if arg {
// Finished inst.Out; restore the old value.
b.cap[inst.Arg] = pos
continue
} else {
if inst.Arg < uint32(len(b.cap)) {
// Capture pos to register, but save old value.
b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done.
b.cap[inst.Arg] = pos
}
pc = inst.Out
goto CheckAndLoop
}
case syntax.InstEmptyWidth:
flag := i.context(pos)
if !flag.match(syntax.EmptyOp(inst.Arg)) {
continue
}
pc = inst.Out
goto CheckAndLoop
case syntax.InstNop:
pc = inst.Out
goto CheckAndLoop
case syntax.InstMatch:
// We found a match. If the caller doesn't care
// where the match is, no point going further.
if len(b.cap) == 0 {
return true
}
// Record best match so far.
// Only need to check end point, because this entire
// call is only considering one start position.
if len(b.cap) > 1 {
b.cap[1] = pos
}
if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) {
copy(b.matchcap, b.cap)
}
// If going for first match, we're done.
if !longest {
return true
}
// If we used the entire text, no longer match is possible.
if pos == b.end {
return true
}
// Otherwise, continue on in hope of a longer match.
continue
}
}
return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0
}
// backtrack runs a backtracking search of prog on the input starting at pos.
func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int {
startCond := re.cond
if startCond == ^syntax.EmptyOp(0) { // impossible
return nil
}
if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
// Anchored match, past beginning of text.
return nil
}
b := newBitState()
i, end := b.inputs.init(nil, ib, is)
b.reset(re.prog, end, ncap)
// Anchored search must start at the beginning of the input
if startCond&syntax.EmptyBeginText != 0 {
if len(b.cap) > 0 {
b.cap[0] = pos
}
if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
freeBitState(b)
return nil
}
} else {
// Unanchored search, starting from each possible text position.
// Notice that we have to try the empty string at the end of
// the text, so the loop condition is pos <= end, not pos < end.
// This looks like it's quadratic in the size of the text,
// but we are not clearing visited between calls to TrySearch,
// so no work is duplicated and it ends up still being linear.
width := -1
for ; pos <= end && width != 0; pos += width {
if len(re.prefix) > 0 {
// Match requires literal prefix; fast search for it.
advance := i.index(re, pos)
if advance < 0 {
freeBitState(b)
return nil
}
pos += advance
}
if len(b.cap) > 0 {
b.cap[0] = pos
}
if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
// Match must be leftmost; done.
goto Match
}
_, width = i.step(pos)
}
freeBitState(b)
return nil
}
Match:
dstCap = append(dstCap, b.matchcap...)
freeBitState(b)
return dstCap
}

447
src/regexp/example_test.go Normal file
View File

@@ -0,0 +1,447 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp_test
import (
"fmt"
"regexp"
"strings"
)
func Example() {
// Compile the expression once, usually at init time.
// Use raw strings to avoid having to quote the backslashes.
var validID = regexp.MustCompile(`^[a-z]+\[[0-9]+\]$`)
fmt.Println(validID.MatchString("adam[23]"))
fmt.Println(validID.MatchString("eve[7]"))
fmt.Println(validID.MatchString("Job[48]"))
fmt.Println(validID.MatchString("snakey"))
// Output:
// true
// true
// false
// false
}
func ExampleMatch() {
matched, err := regexp.Match(`foo.*`, []byte(`seafood`))
fmt.Println(matched, err)
matched, err = regexp.Match(`bar.*`, []byte(`seafood`))
fmt.Println(matched, err)
matched, err = regexp.Match(`a(b`, []byte(`seafood`))
fmt.Println(matched, err)
// Output:
// true <nil>
// false <nil>
// false error parsing regexp: missing closing ): `a(b`
}
func ExampleMatchString() {
matched, err := regexp.MatchString(`foo.*`, "seafood")
fmt.Println(matched, err)
matched, err = regexp.MatchString(`bar.*`, "seafood")
fmt.Println(matched, err)
matched, err = regexp.MatchString(`a(b`, "seafood")
fmt.Println(matched, err)
// Output:
// true <nil>
// false <nil>
// false error parsing regexp: missing closing ): `a(b`
}
func ExampleQuoteMeta() {
fmt.Println(regexp.QuoteMeta(`Escaping symbols like: .+*?()|[]{}^$`))
// Output:
// Escaping symbols like: \.\+\*\?\(\)\|\[\]\{\}\^\$
}
func ExampleRegexp_Find() {
re := regexp.MustCompile(`foo.?`)
fmt.Printf("%q\n", re.Find([]byte(`seafood fool`)))
// Output:
// "food"
}
func ExampleRegexp_FindAll() {
re := regexp.MustCompile(`foo.?`)
fmt.Printf("%q\n", re.FindAll([]byte(`seafood fool`), -1))
// Output:
// ["food" "fool"]
}
func ExampleRegexp_FindAllSubmatch() {
re := regexp.MustCompile(`foo(.?)`)
fmt.Printf("%q\n", re.FindAllSubmatch([]byte(`seafood fool`), -1))
// Output:
// [["food" "d"] ["fool" "l"]]
}
func ExampleRegexp_FindSubmatch() {
re := regexp.MustCompile(`foo(.?)`)
fmt.Printf("%q\n", re.FindSubmatch([]byte(`seafood fool`)))
// Output:
// ["food" "d"]
}
func ExampleRegexp_Match() {
re := regexp.MustCompile(`foo.?`)
fmt.Println(re.Match([]byte(`seafood fool`)))
fmt.Println(re.Match([]byte(`something else`)))
// Output:
// true
// false
}
func ExampleRegexp_FindString() {
re := regexp.MustCompile(`foo.?`)
fmt.Printf("%q\n", re.FindString("seafood fool"))
fmt.Printf("%q\n", re.FindString("meat"))
// Output:
// "food"
// ""
}
func ExampleRegexp_FindStringIndex() {
re := regexp.MustCompile(`ab?`)
fmt.Println(re.FindStringIndex("tablett"))
fmt.Println(re.FindStringIndex("foo") == nil)
// Output:
// [1 3]
// true
}
func ExampleRegexp_FindStringSubmatch() {
re := regexp.MustCompile(`a(x*)b(y|z)c`)
fmt.Printf("%q\n", re.FindStringSubmatch("-axxxbyc-"))
fmt.Printf("%q\n", re.FindStringSubmatch("-abzc-"))
// Output:
// ["axxxbyc" "xxx" "y"]
// ["abzc" "" "z"]
}
func ExampleRegexp_FindAllString() {
re := regexp.MustCompile(`a.`)
fmt.Println(re.FindAllString("paranormal", -1))
fmt.Println(re.FindAllString("paranormal", 2))
fmt.Println(re.FindAllString("graal", -1))
fmt.Println(re.FindAllString("none", -1))
// Output:
// [ar an al]
// [ar an]
// [aa]
// []
}
func ExampleRegexp_FindAllStringSubmatch() {
re := regexp.MustCompile(`a(x*)b`)
fmt.Printf("%q\n", re.FindAllStringSubmatch("-ab-", -1))
fmt.Printf("%q\n", re.FindAllStringSubmatch("-axxb-", -1))
fmt.Printf("%q\n", re.FindAllStringSubmatch("-ab-axb-", -1))
fmt.Printf("%q\n", re.FindAllStringSubmatch("-axxb-ab-", -1))
// Output:
// [["ab" ""]]
// [["axxb" "xx"]]
// [["ab" ""] ["axb" "x"]]
// [["axxb" "xx"] ["ab" ""]]
}
func ExampleRegexp_FindAllStringSubmatchIndex() {
re := regexp.MustCompile(`a(x*)b`)
// Indices:
// 01234567 012345678
// -ab-axb- -axxb-ab-
fmt.Println(re.FindAllStringSubmatchIndex("-ab-", -1))
fmt.Println(re.FindAllStringSubmatchIndex("-axxb-", -1))
fmt.Println(re.FindAllStringSubmatchIndex("-ab-axb-", -1))
fmt.Println(re.FindAllStringSubmatchIndex("-axxb-ab-", -1))
fmt.Println(re.FindAllStringSubmatchIndex("-foo-", -1))
// Output:
// [[1 3 2 2]]
// [[1 5 2 4]]
// [[1 3 2 2] [4 7 5 6]]
// [[1 5 2 4] [6 8 7 7]]
// []
}
func ExampleRegexp_FindSubmatchIndex() {
re := regexp.MustCompile(`a(x*)b`)
// Indices:
// 01234567 012345678
// -ab-axb- -axxb-ab-
fmt.Println(re.FindSubmatchIndex([]byte("-ab-")))
fmt.Println(re.FindSubmatchIndex([]byte("-axxb-")))
fmt.Println(re.FindSubmatchIndex([]byte("-ab-axb-")))
fmt.Println(re.FindSubmatchIndex([]byte("-axxb-ab-")))
fmt.Println(re.FindSubmatchIndex([]byte("-foo-")))
// Output:
// [1 3 2 2]
// [1 5 2 4]
// [1 3 2 2]
// [1 5 2 4]
// []
}
func ExampleRegexp_Longest() {
re := regexp.MustCompile(`a(|b)`)
fmt.Println(re.FindString("ab"))
re.Longest()
fmt.Println(re.FindString("ab"))
// Output:
// a
// ab
}
func ExampleRegexp_MatchString() {
re := regexp.MustCompile(`(gopher){2}`)
fmt.Println(re.MatchString("gopher"))
fmt.Println(re.MatchString("gophergopher"))
fmt.Println(re.MatchString("gophergophergopher"))
// Output:
// false
// true
// true
}
func ExampleRegexp_NumSubexp() {
re0 := regexp.MustCompile(`a.`)
fmt.Printf("%d\n", re0.NumSubexp())
re := regexp.MustCompile(`(.*)((a)b)(.*)a`)
fmt.Println(re.NumSubexp())
// Output:
// 0
// 4
}
func ExampleRegexp_ReplaceAll() {
re := regexp.MustCompile(`a(x*)b`)
fmt.Printf("%s\n", re.ReplaceAll([]byte("-ab-axxb-"), []byte("T")))
fmt.Printf("%s\n", re.ReplaceAll([]byte("-ab-axxb-"), []byte("$1")))
fmt.Printf("%s\n", re.ReplaceAll([]byte("-ab-axxb-"), []byte("$1W")))
fmt.Printf("%s\n", re.ReplaceAll([]byte("-ab-axxb-"), []byte("${1}W")))
re2 := regexp.MustCompile(`a(?P<1W>x*)b`)
fmt.Printf("%s\n", re2.ReplaceAll([]byte("-ab-axxb-"), []byte("$1W")))
fmt.Printf("%s\n", re2.ReplaceAll([]byte("-ab-axxb-"), []byte("${1}W")))
// Output:
// -T-T-
// --xx-
// ---
// -W-xxW-
// --xx-
// -W-xxW-
}
func ExampleRegexp_ReplaceAllLiteralString() {
re := regexp.MustCompile(`a(x*)b`)
fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "T"))
fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "$1"))
fmt.Println(re.ReplaceAllLiteralString("-ab-axxb-", "${1}"))
// Output:
// -T-T-
// -$1-$1-
// -${1}-${1}-
}
func ExampleRegexp_ReplaceAllString() {
re := regexp.MustCompile(`a(x*)b`)
fmt.Println(re.ReplaceAllString("-ab-axxb-", "T"))
fmt.Println(re.ReplaceAllString("-ab-axxb-", "$1"))
fmt.Println(re.ReplaceAllString("-ab-axxb-", "$1W"))
fmt.Println(re.ReplaceAllString("-ab-axxb-", "${1}W"))
re2 := regexp.MustCompile(`a(?P<1W>x*)b`)
fmt.Printf("%s\n", re2.ReplaceAllString("-ab-axxb-", "$1W"))
fmt.Println(re.ReplaceAllString("-ab-axxb-", "${1}W"))
// Output:
// -T-T-
// --xx-
// ---
// -W-xxW-
// --xx-
// -W-xxW-
}
func ExampleRegexp_ReplaceAllStringFunc() {
re := regexp.MustCompile(`[^aeiou]`)
fmt.Println(re.ReplaceAllStringFunc("seafood fool", strings.ToUpper))
// Output:
// SeaFooD FooL
}
func ExampleRegexp_SubexpNames() {
re := regexp.MustCompile(`(?P<first>[a-zA-Z]+) (?P<last>[a-zA-Z]+)`)
fmt.Println(re.MatchString("Alan Turing"))
fmt.Printf("%q\n", re.SubexpNames())
reversed := fmt.Sprintf("${%s} ${%s}", re.SubexpNames()[2], re.SubexpNames()[1])
fmt.Println(reversed)
fmt.Println(re.ReplaceAllString("Alan Turing", reversed))
// Output:
// true
// ["" "first" "last"]
// ${last} ${first}
// Turing Alan
}
func ExampleRegexp_SubexpIndex() {
re := regexp.MustCompile(`(?P<first>[a-zA-Z]+) (?P<last>[a-zA-Z]+)`)
fmt.Println(re.MatchString("Alan Turing"))
matches := re.FindStringSubmatch("Alan Turing")
lastIndex := re.SubexpIndex("last")
fmt.Printf("last => %d\n", lastIndex)
fmt.Println(matches[lastIndex])
// Output:
// true
// last => 2
// Turing
}
func ExampleRegexp_Split() {
a := regexp.MustCompile(`a`)
fmt.Println(a.Split("banana", -1))
fmt.Println(a.Split("banana", 0))
fmt.Println(a.Split("banana", 1))
fmt.Println(a.Split("banana", 2))
zp := regexp.MustCompile(`z+`)
fmt.Println(zp.Split("pizza", -1))
fmt.Println(zp.Split("pizza", 0))
fmt.Println(zp.Split("pizza", 1))
fmt.Println(zp.Split("pizza", 2))
// Output:
// [b n n ]
// []
// [banana]
// [b nana]
// [pi a]
// []
// [pizza]
// [pi a]
}
func ExampleRegexp_Expand() {
content := []byte(`
# comment line
option1: value1
option2: value2
# another comment line
option3: value3
`)
// Regex pattern captures "key: value" pair from the content.
pattern := regexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`)
// Template to convert "key: value" to "key=value" by
// referencing the values captured by the regex pattern.
template := []byte("$key=$value\n")
result := []byte{}
// For each match of the regex in the content.
for _, submatches := range pattern.FindAllSubmatchIndex(content, -1) {
// Apply the captured submatches to the template and append the output
// to the result.
result = pattern.Expand(result, template, content, submatches)
}
fmt.Println(string(result))
// Output:
// option1=value1
// option2=value2
// option3=value3
}
func ExampleRegexp_ExpandString() {
content := `
# comment line
option1: value1
option2: value2
# another comment line
option3: value3
`
// Regex pattern captures "key: value" pair from the content.
pattern := regexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`)
// Template to convert "key: value" to "key=value" by
// referencing the values captured by the regex pattern.
template := "$key=$value\n"
result := []byte{}
// For each match of the regex in the content.
for _, submatches := range pattern.FindAllStringSubmatchIndex(content, -1) {
// Apply the captured submatches to the template and append the output
// to the result.
result = pattern.ExpandString(result, template, content, submatches)
}
fmt.Println(string(result))
// Output:
// option1=value1
// option2=value2
// option3=value3
}
func ExampleRegexp_FindIndex() {
content := []byte(`
# comment line
option1: value1
option2: value2
`)
// Regex pattern captures "key: value" pair from the content.
pattern := regexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`)
loc := pattern.FindIndex(content)
fmt.Println(loc)
fmt.Println(string(content[loc[0]:loc[1]]))
// Output:
// [18 33]
// option1: value1
}
func ExampleRegexp_FindAllSubmatchIndex() {
content := []byte(`
# comment line
option1: value1
option2: value2
`)
// Regex pattern captures "key: value" pair from the content.
pattern := regexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`)
allIndexes := pattern.FindAllSubmatchIndex(content, -1)
for _, loc := range allIndexes {
fmt.Println(loc)
fmt.Println(string(content[loc[0]:loc[1]]))
fmt.Println(string(content[loc[2]:loc[3]]))
fmt.Println(string(content[loc[4]:loc[5]]))
}
// Output:
// [18 33 18 25 27 33]
// option1: value1
// option1
// value1
// [35 50 35 42 44 50]
// option2: value2
// option2
// value2
}
func ExampleRegexp_FindAllIndex() {
content := []byte("London")
re := regexp.MustCompile(`o.`)
fmt.Println(re.FindAllIndex(content, 1))
fmt.Println(re.FindAllIndex(content, -1))
// Output:
// [[1 3]]
// [[1 3] [4 6]]
}

554
src/regexp/exec.go Normal file
View File

@@ -0,0 +1,554 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"io"
"regexp/syntax"
"sync"
)
// A queue is a 'sparse array' holding pending threads of execution.
// See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
type queue struct {
sparse []uint32
dense []entry
}
// An entry is an entry on a queue.
// It holds both the instruction pc and the actual thread.
// Some queue entries are just place holders so that the machine
// knows it has considered that pc. Such entries have t == nil.
type entry struct {
pc uint32
t *thread
}
// A thread is the state of a single path through the machine:
// an instruction and a corresponding capture array.
// See https://swtch.com/~rsc/regexp/regexp2.html
type thread struct {
inst *syntax.Inst
cap []int
}
// A machine holds all the state during an NFA simulation for p.
type machine struct {
re *Regexp // corresponding Regexp
p *syntax.Prog // compiled program
q0, q1 queue // two queues for runq, nextq
pool []*thread // pool of available threads
matched bool // whether a match was found
matchcap []int // capture information for the match
inputs inputs
}
type inputs struct {
// cached inputs, to avoid allocation
bytes inputBytes
string inputString
reader inputReader
}
func (i *inputs) newBytes(b []byte) input {
i.bytes.str = b
return &i.bytes
}
func (i *inputs) newString(s string) input {
i.string.str = s
return &i.string
}
func (i *inputs) newReader(r io.RuneReader) input {
i.reader.r = r
i.reader.atEOT = false
i.reader.pos = 0
return &i.reader
}
func (i *inputs) clear() {
// We need to clear 1 of these.
// Avoid the expense of clearing the others (pointer write barrier).
if i.bytes.str != nil {
i.bytes.str = nil
} else if i.reader.r != nil {
i.reader.r = nil
} else {
i.string.str = ""
}
}
func (i *inputs) init(r io.RuneReader, b []byte, s string) (input, int) {
if r != nil {
return i.newReader(r), 0
}
if b != nil {
return i.newBytes(b), len(b)
}
return i.newString(s), len(s)
}
func (m *machine) init(ncap int) {
for _, t := range m.pool {
t.cap = t.cap[:ncap]
}
m.matchcap = m.matchcap[:ncap]
}
// alloc allocates a new thread with the given instruction.
// It uses the free pool if possible.
func (m *machine) alloc(i *syntax.Inst) *thread {
var t *thread
if n := len(m.pool); n > 0 {
t = m.pool[n-1]
m.pool = m.pool[:n-1]
} else {
t = new(thread)
t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
}
t.inst = i
return t
}
// A lazyFlag is a lazily-evaluated syntax.EmptyOp,
// for checking zero-width flags like ^ $ \A \z \B \b.
// It records the pair of relevant runes and does not
// determine the implied flags until absolutely necessary
// (most of the time, that means never).
type lazyFlag uint64
func newLazyFlag(r1, r2 rune) lazyFlag {
return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2)))
}
func (f lazyFlag) match(op syntax.EmptyOp) bool {
if op == 0 {
return true
}
r1 := rune(f >> 32)
if op&syntax.EmptyBeginLine != 0 {
if r1 != '\n' && r1 >= 0 {
return false
}
op &^= syntax.EmptyBeginLine
}
if op&syntax.EmptyBeginText != 0 {
if r1 >= 0 {
return false
}
op &^= syntax.EmptyBeginText
}
if op == 0 {
return true
}
r2 := rune(f)
if op&syntax.EmptyEndLine != 0 {
if r2 != '\n' && r2 >= 0 {
return false
}
op &^= syntax.EmptyEndLine
}
if op&syntax.EmptyEndText != 0 {
if r2 >= 0 {
return false
}
op &^= syntax.EmptyEndText
}
if op == 0 {
return true
}
if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) {
op &^= syntax.EmptyWordBoundary
} else {
op &^= syntax.EmptyNoWordBoundary
}
return op == 0
}
// match runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
func (m *machine) match(i input, pos int) bool {
startCond := m.re.cond
if startCond == ^syntax.EmptyOp(0) { // impossible
return false
}
m.matched = false
for i := range m.matchcap {
m.matchcap[i] = -1
}
runq, nextq := &m.q0, &m.q1
r, r1 := endOfText, endOfText
width, width1 := 0, 0
r, width = i.step(pos)
if r != endOfText {
r1, width1 = i.step(pos + width)
}
var flag lazyFlag
if pos == 0 {
flag = newLazyFlag(-1, r)
} else {
flag = i.context(pos)
}
for {
if len(runq.dense) == 0 {
if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
// Anchored match, past beginning of text.
break
}
if m.matched {
// Have match; finished exploring alternatives.
break
}
if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
// Match requires literal prefix; fast search for it.
advance := i.index(m.re, pos)
if advance < 0 {
break
}
pos += advance
r, width = i.step(pos)
r1, width1 = i.step(pos + width)
}
}
if !m.matched {
if len(m.matchcap) > 0 {
m.matchcap[0] = pos
}
m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil)
}
flag = newLazyFlag(r, r1)
m.step(runq, nextq, pos, pos+width, r, &flag)
if width == 0 {
break
}
if len(m.matchcap) == 0 && m.matched {
// Found a match and not paying attention
// to where it is, so any match will do.
break
}
pos += width
r, width = r1, width1
if r != endOfText {
r1, width1 = i.step(pos + width)
}
runq, nextq = nextq, runq
}
m.clear(nextq)
return m.matched
}
// clear frees all threads on the thread queue.
func (m *machine) clear(q *queue) {
for _, d := range q.dense {
if d.t != nil {
m.pool = append(m.pool, d.t)
}
}
q.dense = q.dense[:0]
}
// step executes one step of the machine, running each of the threads
// on runq and appending new threads to nextq.
// The step processes the rune c (which may be endOfText),
// which starts at position pos and ends at nextPos.
// nextCond gives the setting for the empty-width flags after c.
func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) {
longest := m.re.longest
for j := 0; j < len(runq.dense); j++ {
d := &runq.dense[j]
t := d.t
if t == nil {
continue
}
if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
m.pool = append(m.pool, t)
continue
}
i := t.inst
add := false
switch i.Op {
default:
panic("bad inst")
case syntax.InstMatch:
if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
t.cap[1] = pos
copy(m.matchcap, t.cap)
}
if !longest {
// First-match mode: cut off all lower-priority threads.
for _, d := range runq.dense[j+1:] {
if d.t != nil {
m.pool = append(m.pool, d.t)
}
}
runq.dense = runq.dense[:0]
}
m.matched = true
case syntax.InstRune:
add = i.MatchRune(c)
case syntax.InstRune1:
add = c == i.Rune[0]
case syntax.InstRuneAny:
add = true
case syntax.InstRuneAnyNotNL:
add = c != '\n'
}
if add {
t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
}
if t != nil {
m.pool = append(m.pool, t)
}
}
runq.dense = runq.dense[:0]
}
// add adds an entry to q for pc, unless the q already has such an entry.
// It also recursively adds an entry for all instructions reachable from pc by following
// empty-width conditions satisfied by cond. pos gives the current position
// in the input.
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond *lazyFlag, t *thread) *thread {
Again:
if pc == 0 {
return t
}
if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
return t
}
j := len(q.dense)
q.dense = q.dense[:j+1]
d := &q.dense[j]
d.t = nil
d.pc = pc
q.sparse[pc] = uint32(j)
i := &m.p.Inst[pc]
switch i.Op {
default:
panic("unhandled")
case syntax.InstFail:
// nothing
case syntax.InstAlt, syntax.InstAltMatch:
t = m.add(q, i.Out, pos, cap, cond, t)
pc = i.Arg
goto Again
case syntax.InstEmptyWidth:
if cond.match(syntax.EmptyOp(i.Arg)) {
pc = i.Out
goto Again
}
case syntax.InstNop:
pc = i.Out
goto Again
case syntax.InstCapture:
if int(i.Arg) < len(cap) {
opos := cap[i.Arg]
cap[i.Arg] = pos
m.add(q, i.Out, pos, cap, cond, nil)
cap[i.Arg] = opos
} else {
pc = i.Out
goto Again
}
case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
if t == nil {
t = m.alloc(i)
} else {
t.inst = i
}
if len(cap) > 0 && &t.cap[0] != &cap[0] {
copy(t.cap, cap)
}
d.t = t
t = nil
}
return t
}
type onePassMachine struct {
inputs inputs
matchcap []int
}
var onePassPool sync.Pool
func newOnePassMachine() *onePassMachine {
m, ok := onePassPool.Get().(*onePassMachine)
if !ok {
m = new(onePassMachine)
}
return m
}
func freeOnePassMachine(m *onePassMachine) {
m.inputs.clear()
onePassPool.Put(m)
}
// doOnePass implements r.doExecute using the one-pass execution engine.
func (re *Regexp) doOnePass(ir io.RuneReader, ib []byte, is string, pos, ncap int, dstCap []int) []int {
startCond := re.cond
if startCond == ^syntax.EmptyOp(0) { // impossible
return nil
}
m := newOnePassMachine()
if cap(m.matchcap) < ncap {
m.matchcap = make([]int, ncap)
} else {
m.matchcap = m.matchcap[:ncap]
}
matched := false
for i := range m.matchcap {
m.matchcap[i] = -1
}
i, _ := m.inputs.init(ir, ib, is)
r, r1 := endOfText, endOfText
width, width1 := 0, 0
r, width = i.step(pos)
if r != endOfText {
r1, width1 = i.step(pos + width)
}
var flag lazyFlag
if pos == 0 {
flag = newLazyFlag(-1, r)
} else {
flag = i.context(pos)
}
pc := re.onepass.Start
inst := &re.onepass.Inst[pc]
// If there is a simple literal prefix, skip over it.
if pos == 0 && flag.match(syntax.EmptyOp(inst.Arg)) &&
len(re.prefix) > 0 && i.canCheckPrefix() {
// Match requires literal prefix; fast search for it.
if !i.hasPrefix(re) {
goto Return
}
pos += len(re.prefix)
r, width = i.step(pos)
r1, width1 = i.step(pos + width)
flag = i.context(pos)
pc = int(re.prefixEnd)
}
for {
inst = &re.onepass.Inst[pc]
pc = int(inst.Out)
switch inst.Op {
default:
panic("bad inst")
case syntax.InstMatch:
matched = true
if len(m.matchcap) > 0 {
m.matchcap[0] = 0
m.matchcap[1] = pos
}
goto Return
case syntax.InstRune:
if !inst.MatchRune(r) {
goto Return
}
case syntax.InstRune1:
if r != inst.Rune[0] {
goto Return
}
case syntax.InstRuneAny:
// Nothing
case syntax.InstRuneAnyNotNL:
if r == '\n' {
goto Return
}
// peek at the input rune to see which branch of the Alt to take
case syntax.InstAlt, syntax.InstAltMatch:
pc = int(onePassNext(inst, r))
continue
case syntax.InstFail:
goto Return
case syntax.InstNop:
continue
case syntax.InstEmptyWidth:
if !flag.match(syntax.EmptyOp(inst.Arg)) {
goto Return
}
continue
case syntax.InstCapture:
if int(inst.Arg) < len(m.matchcap) {
m.matchcap[inst.Arg] = pos
}
continue
}
if width == 0 {
break
}
flag = newLazyFlag(r, r1)
pos += width
r, width = r1, width1
if r != endOfText {
r1, width1 = i.step(pos + width)
}
}
Return:
if !matched {
freeOnePassMachine(m)
return nil
}
dstCap = append(dstCap, m.matchcap...)
freeOnePassMachine(m)
return dstCap
}
// doMatch reports whether either r, b or s match the regexp.
func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool {
return re.doExecute(r, b, s, 0, 0, nil) != nil
}
// doExecute finds the leftmost match in the input, appends the position
// of its subexpressions to dstCap and returns dstCap.
//
// nil is returned if no matches are found and non-nil if matches are found.
func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int {
if dstCap == nil {
// Make sure 'return dstCap' is non-nil.
dstCap = arrayNoInts[:0:0]
}
if r == nil && len(b)+len(s) < re.minInputLen {
return nil
}
if re.onepass != nil {
return re.doOnePass(r, b, s, pos, ncap, dstCap)
}
if r == nil && len(b)+len(s) < re.maxBitStateLen {
return re.backtrack(b, s, pos, ncap, dstCap)
}
m := re.get()
i, _ := m.inputs.init(r, b, s)
m.init(ncap)
if !m.match(i, pos) {
re.put(m)
return nil
}
dstCap = append(dstCap, m.matchcap...)
re.put(m)
return dstCap
}
// arrayNoInts is returned by doExecute match if nil dstCap is passed
// to it with ncap=0.
var arrayNoInts [0]int

20
src/regexp/exec2_test.go Normal file
View File

@@ -0,0 +1,20 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !race
package regexp
import (
"testing"
)
// This test is excluded when running under the race detector because
// it is a very expensive test and takes too long.
func TestRE2Exhaustive(t *testing.T) {
if testing.Short() {
t.Skip("skipping TestRE2Exhaustive during short test")
}
testRE2(t, "testdata/re2-exhaustive.txt.bz2")
}

736
src/regexp/exec_test.go Normal file
View File

@@ -0,0 +1,736 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"bufio"
"compress/bzip2"
"fmt"
"internal/testenv"
"io"
"os"
"path/filepath"
"regexp/syntax"
"slices"
"strconv"
"strings"
"testing"
"unicode/utf8"
)
// TestRE2 tests this package's regexp API against test cases
// considered during RE2's exhaustive tests, which run all possible
// regexps over a given set of atoms and operators, up to a given
// complexity, over all possible strings over a given alphabet,
// up to a given size. Rather than try to link with RE2, we read a
// log file containing the test cases and the expected matches.
// The log file, re2-exhaustive.txt, is generated by running 'make log'
// in the open source RE2 distribution https://github.com/google/re2/.
//
// The test file format is a sequence of stanzas like:
//
// strings
// "abc"
// "123x"
// regexps
// "[a-z]+"
// 0-3;0-3
// -;-
// "([0-9])([0-9])([0-9])"
// -;-
// -;0-3 0-1 1-2 2-3
//
// The stanza begins by defining a set of strings, quoted
// using Go double-quote syntax, one per line. Then the
// regexps section gives a sequence of regexps to run on
// the strings. In the block that follows a regexp, each line
// gives the semicolon-separated match results of running
// the regexp on the corresponding string.
// Each match result is either a single -, meaning no match, or a
// space-separated sequence of pairs giving the match and
// submatch indices. An unmatched subexpression formats
// its pair as a single - (not illustrated above). For now
// each regexp run produces two match results, one for a
// “full match” that restricts the regexp to matching the entire
// string or nothing, and one for a “partial match” that gives
// the leftmost first match found in the string.
//
// Lines beginning with # are comments. Lines beginning with
// a capital letter are test names printed during RE2's test suite
// and are echoed into t but otherwise ignored.
//
// At time of writing, re2-exhaustive.txt is 59 MB but compresses to 385 kB,
// so we store re2-exhaustive.txt.bz2 in the repository and decompress it on the fly.
func TestRE2Search(t *testing.T) {
testRE2(t, "testdata/re2-search.txt")
}
func testRE2(t *testing.T, file string) {
f, err := os.Open(file)
if err != nil {
t.Fatal(err)
}
defer f.Close()
var txt io.Reader
if strings.HasSuffix(file, ".bz2") {
z := bzip2.NewReader(f)
txt = z
file = file[:len(file)-len(".bz2")] // for error messages
} else {
txt = f
}
lineno := 0
scanner := bufio.NewScanner(txt)
var (
str []string
input []string
inStrings bool
re *Regexp
refull *Regexp
nfail int
ncase int
)
for lineno := 1; scanner.Scan(); lineno++ {
line := scanner.Text()
switch {
case line == "":
t.Fatalf("%s:%d: unexpected blank line", file, lineno)
case line[0] == '#':
continue
case 'A' <= line[0] && line[0] <= 'Z':
// Test name.
t.Logf("%s\n", line)
continue
case line == "strings":
str = str[:0]
inStrings = true
case line == "regexps":
inStrings = false
case line[0] == '"':
q, err := strconv.Unquote(line)
if err != nil {
// Fatal because we'll get out of sync.
t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err)
}
if inStrings {
str = append(str, q)
continue
}
// Is a regexp.
if len(input) != 0 {
t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q)
}
re, err = tryCompile(q)
if err != nil {
if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" {
// We don't and likely never will support \C; keep going.
continue
}
t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err)
if nfail++; nfail >= 100 {
t.Fatalf("stopping after %d errors", nfail)
}
continue
}
full := `\A(?:` + q + `)\z`
refull, err = tryCompile(full)
if err != nil {
// Fatal because q worked, so this should always work.
t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err)
}
input = str
case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
// A sequence of match results.
ncase++
if re == nil {
// Failed to compile: skip results.
continue
}
if len(input) == 0 {
t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno)
}
var text string
text, input = input[0], input[1:]
if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
// RE2's \B considers every byte position,
// so it sees 'not word boundary' in the
// middle of UTF-8 sequences. This package
// only considers the positions between runes,
// so it disagrees. Skip those cases.
continue
}
res := strings.Split(line, ";")
if len(res) != len(run) {
t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run))
}
for i := range res {
have, suffix := run[i](re, refull, text)
want := parseResult(t, file, lineno, res[i])
if !slices.Equal(have, want) {
t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want)
if nfail++; nfail >= 100 {
t.Fatalf("stopping after %d errors", nfail)
}
continue
}
b, suffix := match[i](re, refull, text)
if b != (want != nil) {
t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b)
if nfail++; nfail >= 100 {
t.Fatalf("stopping after %d errors", nfail)
}
continue
}
}
default:
t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line)
}
}
if err := scanner.Err(); err != nil {
t.Fatalf("%s:%d: %v", file, lineno, err)
}
if len(input) != 0 {
t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input))
}
t.Logf("%d cases tested", ncase)
}
var run = []func(*Regexp, *Regexp, string) ([]int, string){
runFull,
runPartial,
runFullLongest,
runPartialLongest,
}
func runFull(re, refull *Regexp, text string) ([]int, string) {
refull.longest = false
return refull.FindStringSubmatchIndex(text), "[full]"
}
func runPartial(re, refull *Regexp, text string) ([]int, string) {
re.longest = false
return re.FindStringSubmatchIndex(text), ""
}
func runFullLongest(re, refull *Regexp, text string) ([]int, string) {
refull.longest = true
return refull.FindStringSubmatchIndex(text), "[full,longest]"
}
func runPartialLongest(re, refull *Regexp, text string) ([]int, string) {
re.longest = true
return re.FindStringSubmatchIndex(text), "[longest]"
}
var match = []func(*Regexp, *Regexp, string) (bool, string){
matchFull,
matchPartial,
matchFullLongest,
matchPartialLongest,
}
func matchFull(re, refull *Regexp, text string) (bool, string) {
refull.longest = false
return refull.MatchString(text), "[full]"
}
func matchPartial(re, refull *Regexp, text string) (bool, string) {
re.longest = false
return re.MatchString(text), ""
}
func matchFullLongest(re, refull *Regexp, text string) (bool, string) {
refull.longest = true
return refull.MatchString(text), "[full,longest]"
}
func matchPartialLongest(re, refull *Regexp, text string) (bool, string) {
re.longest = true
return re.MatchString(text), "[longest]"
}
func isSingleBytes(s string) bool {
for _, c := range s {
if c >= utf8.RuneSelf {
return false
}
}
return true
}
func tryCompile(s string) (re *Regexp, err error) {
// Protect against panic during Compile.
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("panic: %v", r)
}
}()
return Compile(s)
}
func parseResult(t *testing.T, file string, lineno int, res string) []int {
// A single - indicates no match.
if res == "-" {
return nil
}
// Otherwise, a space-separated list of pairs.
n := 1
for j := 0; j < len(res); j++ {
if res[j] == ' ' {
n++
}
}
out := make([]int, 2*n)
i := 0
n = 0
for j := 0; j <= len(res); j++ {
if j == len(res) || res[j] == ' ' {
// Process a single pair. - means no submatch.
pair := res[i:j]
if pair == "-" {
out[n] = -1
out[n+1] = -1
} else {
loStr, hiStr, _ := strings.Cut(pair, "-")
lo, err1 := strconv.Atoi(loStr)
hi, err2 := strconv.Atoi(hiStr)
if err1 != nil || err2 != nil || lo > hi {
t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
}
out[n] = lo
out[n+1] = hi
}
n += 2
i = j + 1
}
}
return out
}
// TestFowler runs this package's regexp API against the
// POSIX regular expression tests collected by Glenn Fowler
// at http://www2.research.att.com/~astopen/testregex/testregex.html.
func TestFowler(t *testing.T) {
files, err := filepath.Glob("testdata/*.dat")
if err != nil {
t.Fatal(err)
}
for _, file := range files {
t.Log(file)
testFowler(t, file)
}
}
var notab = MustCompilePOSIX(`[^\t]+`)
func testFowler(t *testing.T, file string) {
f, err := os.Open(file)
if err != nil {
t.Error(err)
return
}
defer f.Close()
b := bufio.NewReader(f)
lineno := 0
lastRegexp := ""
Reading:
for {
lineno++
line, err := b.ReadString('\n')
if err != nil {
if err != io.EOF {
t.Errorf("%s:%d: %v", file, lineno, err)
}
break Reading
}
// http://www2.research.att.com/~astopen/man/man1/testregex.html
//
// INPUT FORMAT
// Input lines may be blank, a comment beginning with #, or a test
// specification. A specification is five fields separated by one
// or more tabs. NULL denotes the empty string and NIL denotes the
// 0 pointer.
if line[0] == '#' || line[0] == '\n' {
continue Reading
}
line = line[:len(line)-1]
field := notab.FindAllString(line, -1)
for i, f := range field {
if f == "NULL" {
field[i] = ""
}
if f == "NIL" {
t.Logf("%s:%d: skip: %s", file, lineno, line)
continue Reading
}
}
if len(field) == 0 {
continue Reading
}
// Field 1: the regex(3) flags to apply, one character per REG_feature
// flag. The test is skipped if REG_feature is not supported by the
// implementation. If the first character is not [BEASKLP] then the
// specification is a global control line. One or more of [BEASKLP] may be
// specified; the test will be repeated for each mode.
//
// B basic BRE (grep, ed, sed)
// E REG_EXTENDED ERE (egrep)
// A REG_AUGMENTED ARE (egrep with negation)
// S REG_SHELL SRE (sh glob)
// K REG_SHELL|REG_AUGMENTED KRE (ksh glob)
// L REG_LITERAL LRE (fgrep)
//
// a REG_LEFT|REG_RIGHT implicit ^...$
// b REG_NOTBOL lhs does not match ^
// c REG_COMMENT ignore space and #...\n
// d REG_SHELL_DOT explicit leading . match
// e REG_NOTEOL rhs does not match $
// f REG_MULTIPLE multiple \n separated patterns
// g FNM_LEADING_DIR testfnmatch only -- match until /
// h REG_MULTIREF multiple digit backref
// i REG_ICASE ignore case
// j REG_SPAN . matches \n
// k REG_ESCAPE \ to escape [...] delimiter
// l REG_LEFT implicit ^...
// m REG_MINIMAL minimal match
// n REG_NEWLINE explicit \n match
// o REG_ENCLOSED (|&) magic inside [@|&](...)
// p REG_SHELL_PATH explicit / match
// q REG_DELIMITED delimited pattern
// r REG_RIGHT implicit ...$
// s REG_SHELL_ESCAPED \ not special
// t REG_MUSTDELIM all delimiters must be specified
// u standard unspecified behavior -- errors not counted
// v REG_CLASS_ESCAPE \ special inside [...]
// w REG_NOSUB no subexpression match array
// x REG_LENIENT let some errors slide
// y REG_LEFT regexec() implicit ^...
// z REG_NULL NULL subexpressions ok
// $ expand C \c escapes in fields 2 and 3
// / field 2 is a regsubcomp() expression
// = field 3 is a regdecomp() expression
//
// Field 1 control lines:
//
// C set LC_COLLATE and LC_CTYPE to locale in field 2
//
// ?test ... output field 5 if passed and != EXPECTED, silent otherwise
// &test ... output field 5 if current and previous passed
// |test ... output field 5 if current passed and previous failed
// ; ... output field 2 if previous failed
// {test ... skip if failed until }
// } end of skip
//
// : comment comment copied as output NOTE
// :comment:test :comment: ignored
// N[OTE] comment comment copied as output NOTE
// T[EST] comment comment
//
// number use number for nmatch (20 by default)
flag := field[0]
switch flag[0] {
case '?', '&', '|', ';', '{', '}':
// Ignore all the control operators.
// Just run everything.
flag = flag[1:]
if flag == "" {
continue Reading
}
case ':':
var ok bool
if _, flag, ok = strings.Cut(flag[1:], ":"); !ok {
t.Logf("skip: %s", line)
continue Reading
}
case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
t.Logf("skip: %s", line)
continue Reading
}
// Can check field count now that we've handled the myriad comment formats.
if len(field) < 4 {
t.Errorf("%s:%d: too few fields: %s", file, lineno, line)
continue Reading
}
// Expand C escapes (a.k.a. Go escapes).
if strings.Contains(flag, "$") {
f := `"` + field[1] + `"`
if field[1], err = strconv.Unquote(f); err != nil {
t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
}
f = `"` + field[2] + `"`
if field[2], err = strconv.Unquote(f); err != nil {
t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
}
}
// Field 2: the regular expression pattern; SAME uses the pattern from
// the previous specification.
//
if field[1] == "SAME" {
field[1] = lastRegexp
}
lastRegexp = field[1]
// Field 3: the string to match.
text := field[2]
// Field 4: the test outcome...
ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3])
if !ok {
t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3])
continue Reading
}
// Field 5: optional comment appended to the report.
Testing:
// Run test once for each specified capital letter mode that we support.
for _, c := range flag {
pattern := field[1]
syn := syntax.POSIX | syntax.ClassNL
switch c {
default:
continue Testing
case 'E':
// extended regexp (what we support)
case 'L':
// literal
pattern = QuoteMeta(pattern)
}
for _, c := range flag {
switch c {
case 'i':
syn |= syntax.FoldCase
}
}
re, err := compile(pattern, syn, true)
if err != nil {
if shouldCompile {
t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern)
}
continue Testing
}
if !shouldCompile {
t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern)
continue Testing
}
match := re.MatchString(text)
if match != shouldMatch {
t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch)
continue Testing
}
have := re.FindStringSubmatchIndex(text)
if (len(have) > 0) != match {
t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have)
continue Testing
}
if len(have) > len(pos) {
have = have[:len(pos)]
}
if !slices.Equal(have, pos) {
t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos)
}
}
}
}
func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) {
// Field 4: the test outcome. This is either one of the posix error
// codes (with REG_ omitted) or the match array, a list of (m,n)
// entries with m and n being first and last+1 positions in the
// field 3 string, or NULL if REG_NOSUB is in effect and success
// is expected. BADPAT is acceptable in place of any regcomp(3)
// error code. The match[] array is initialized to (-2,-2) before
// each test. All array elements from 0 to nmatch-1 must be specified
// in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
// Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
// matched (?{...}) expression, where x is the text enclosed by {...},
// o is the expression ordinal counting from 1, and n is the length of
// the unmatched portion of the subject string. If x starts with a
// number then that is the return value of re_execf(), otherwise 0 is
// returned.
switch {
case s == "":
// Match with no position information.
ok = true
compiled = true
matched = true
return
case s == "NOMATCH":
// Match failure.
ok = true
compiled = true
matched = false
return
case 'A' <= s[0] && s[0] <= 'Z':
// All the other error codes are compile errors.
ok = true
compiled = false
return
}
compiled = true
var x []int
for s != "" {
var end byte = ')'
if len(x)%2 == 0 {
if s[0] != '(' {
ok = false
return
}
s = s[1:]
end = ','
}
i := 0
for i < len(s) && s[i] != end {
i++
}
if i == 0 || i == len(s) {
ok = false
return
}
var v = -1
var err error
if s[:i] != "?" {
v, err = strconv.Atoi(s[:i])
if err != nil {
ok = false
return
}
}
x = append(x, v)
s = s[i+1:]
}
if len(x)%2 != 0 {
ok = false
return
}
ok = true
matched = true
pos = x
return
}
var text []byte
func makeText(n int) []byte {
if len(text) >= n {
return text[:n]
}
text = make([]byte, n)
x := ^uint32(0)
for i := range text {
x += x
x ^= 1
if int32(x) < 0 {
x ^= 0x88888eef
}
if x%31 == 0 {
text[i] = '\n'
} else {
text[i] = byte(x%(0x7E+1-0x20) + 0x20)
}
}
return text
}
func BenchmarkMatch(b *testing.B) {
isRaceBuilder := strings.HasSuffix(testenv.Builder(), "-race")
for _, data := range benchData {
r := MustCompile(data.re)
for _, size := range benchSizes {
if (isRaceBuilder || testing.Short()) && size.n > 1<<10 {
continue
}
t := makeText(size.n)
b.Run(data.name+"/"+size.name, func(b *testing.B) {
b.SetBytes(int64(size.n))
for i := 0; i < b.N; i++ {
if r.Match(t) {
b.Fatal("match!")
}
}
})
}
}
}
func BenchmarkMatch_onepass_regex(b *testing.B) {
isRaceBuilder := strings.HasSuffix(testenv.Builder(), "-race")
r := MustCompile(`(?s)\A.*\z`)
if r.onepass == nil {
b.Fatalf("want onepass regex, but %q is not onepass", r)
}
for _, size := range benchSizes {
if (isRaceBuilder || testing.Short()) && size.n > 1<<10 {
continue
}
t := makeText(size.n)
b.Run(size.name, func(b *testing.B) {
b.SetBytes(int64(size.n))
b.ReportAllocs()
for i := 0; i < b.N; i++ {
if !r.Match(t) {
b.Fatal("not match!")
}
}
})
}
}
var benchData = []struct{ name, re string }{
{"Easy0", "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"},
{"Easy0i", "(?i)ABCDEFGHIJklmnopqrstuvwxyz$"},
{"Easy1", "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"},
{"Medium", "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"},
{"Hard", "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"},
{"Hard1", "ABCD|CDEF|EFGH|GHIJ|IJKL|KLMN|MNOP|OPQR|QRST|STUV|UVWX|WXYZ"},
}
var benchSizes = []struct {
name string
n int
}{
{"16", 16},
{"32", 32},
{"1K", 1 << 10},
{"32K", 32 << 10},
{"1M", 1 << 20},
{"32M", 32 << 20},
}
func TestLongest(t *testing.T) {
re, err := Compile(`a(|b)`)
if err != nil {
t.Fatal(err)
}
if g, w := re.FindString("ab"), "a"; g != w {
t.Errorf("first match was %q, want %q", g, w)
}
re.Longest()
if g, w := re.FindString("ab"), "ab"; g != w {
t.Errorf("longest match was %q, want %q", g, w)
}
}
// TestProgramTooLongForBacktrack tests that a regex which is too long
// for the backtracker still executes properly.
func TestProgramTooLongForBacktrack(t *testing.T) {
longRegex := MustCompile(`(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twentyone|twentytwo|twentythree|twentyfour|twentyfive|twentysix|twentyseven|twentyeight|twentynine|thirty|thirtyone|thirtytwo|thirtythree|thirtyfour|thirtyfive|thirtysix|thirtyseven|thirtyeight|thirtynine|forty|fortyone|fortytwo|fortythree|fortyfour|fortyfive|fortysix|fortyseven|fortyeight|fortynine|fifty|fiftyone|fiftytwo|fiftythree|fiftyfour|fiftyfive|fiftysix|fiftyseven|fiftyeight|fiftynine|sixty|sixtyone|sixtytwo|sixtythree|sixtyfour|sixtyfive|sixtysix|sixtyseven|sixtyeight|sixtynine|seventy|seventyone|seventytwo|seventythree|seventyfour|seventyfive|seventysix|seventyseven|seventyeight|seventynine|eighty|eightyone|eightytwo|eightythree|eightyfour|eightyfive|eightysix|eightyseven|eightyeight|eightynine|ninety|ninetyone|ninetytwo|ninetythree|ninetyfour|ninetyfive|ninetysix|ninetyseven|ninetyeight|ninetynine|onehundred)`)
if !longRegex.MatchString("two") {
t.Errorf("longRegex.MatchString(\"two\") was false, want true")
}
if longRegex.MatchString("xxx") {
t.Errorf("longRegex.MatchString(\"xxx\") was true, want false")
}
}

518
src/regexp/find_test.go Normal file
View File

@@ -0,0 +1,518 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"fmt"
"strings"
"testing"
)
// For each pattern/text pair, what is the expected output of each function?
// We can derive the textual results from the indexed results, the non-submatch
// results from the submatched results, the single results from the 'all' results,
// and the byte results from the string results. Therefore the table includes
// only the FindAllStringSubmatchIndex result.
type FindTest struct {
pat string
text string
matches [][]int
}
func (t FindTest) String() string {
return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text)
}
var findTests = []FindTest{
{``, ``, build(1, 0, 0)},
{`^abcdefg`, "abcdefg", build(1, 0, 7)},
{`a+`, "baaab", build(1, 1, 4)},
{"abcd..", "abcdef", build(1, 0, 6)},
{`a`, "a", build(1, 0, 1)},
{`x`, "y", nil},
{`b`, "abc", build(1, 1, 2)},
{`.`, "a", build(1, 0, 1)},
{`.*`, "abcdef", build(1, 0, 6)},
{`^`, "abcde", build(1, 0, 0)},
{`$`, "abcde", build(1, 5, 5)},
{`^abcd$`, "abcd", build(1, 0, 4)},
{`^bcd'`, "abcdef", nil},
{`^abcd$`, "abcde", nil},
{`a+`, "baaab", build(1, 1, 4)},
{`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)},
{`[a-z]+`, "abcd", build(1, 0, 4)},
{`[^a-z]+`, "ab1234cd", build(1, 2, 6)},
{`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)},
{`[^\n]+`, "abcd\n", build(1, 0, 4)},
{`[日本語]+`, "日本語日本語", build(1, 0, 18)},
{`日本語+`, "日本語", build(1, 0, 9)},
{`日本語+`, "日本語語語語", build(1, 0, 18)},
{`()`, "", build(1, 0, 0, 0, 0)},
{`(a)`, "a", build(1, 0, 1, 0, 1)},
{`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)},
{`(.*)`, "", build(1, 0, 0, 0, 0)},
{`(.*)`, "abcd", build(1, 0, 4, 0, 4)},
{`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)},
{`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)},
{`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)},
{`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)},
{`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)},
{`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)},
{`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)},
{`(.*).*`, "ab", build(1, 0, 2, 0, 2)},
{`[.]`, ".", build(1, 0, 1)},
{`/$`, "/abc/", build(1, 4, 5)},
{`/$`, "/abc", nil},
// multiple matches
{`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)},
{`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)},
{`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)},
{`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)},
{`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)},
// fixed bugs
{`ab$`, "cab", build(1, 1, 3)},
{`axxb$`, "axxcb", nil},
{`data`, "daXY data", build(1, 5, 9)},
{`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)},
{`zx+`, "zzx", build(1, 1, 3)},
{`ab$`, "abcab", build(1, 3, 5)},
{`(aa)*$`, "a", build(1, 1, 1, -1, -1)},
{`(?:.|(?:.a))`, "", nil},
{`(?:A(?:A|a))`, "Aa", build(1, 0, 2)},
{`(?:A|(?:A|a))`, "a", build(1, 0, 1)},
{`(a){0}`, "", build(1, 0, 0, -1, -1)},
{`(?-s)(?:(?:^).)`, "\n", nil},
{`(?s)(?:(?:^).)`, "\n", build(1, 0, 1)},
{`(?:(?:^).)`, "\n", nil},
{`\b`, "x", build(2, 0, 0, 1, 1)},
{`\b`, "xx", build(2, 0, 0, 2, 2)},
{`\b`, "x y", build(4, 0, 0, 1, 1, 2, 2, 3, 3)},
{`\b`, "xx yy", build(4, 0, 0, 2, 2, 3, 3, 5, 5)},
{`\B`, "x", nil},
{`\B`, "xx", build(1, 1, 1)},
{`\B`, "x y", nil},
{`\B`, "xx yy", build(2, 1, 1, 4, 4)},
{`(|a)*`, "aa", build(3, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2)},
// RE2 tests
{`[^\S\s]`, "abcd", nil},
{`[^\S[:space:]]`, "abcd", nil},
{`[^\D\d]`, "abcd", nil},
{`[^\D[:digit:]]`, "abcd", nil},
{`(?i)\W`, "x", nil},
{`(?i)\W`, "k", nil},
{`(?i)\W`, "s", nil},
// can backslash-escape any punctuation
{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,
`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
{`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`,
`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
{"\\`", "`", build(1, 0, 1)},
{"[\\`]+", "`", build(1, 0, 1)},
{"\ufffd", "\xff", build(1, 0, 1)},
{"\ufffd", "hello\xffworld", build(1, 5, 6)},
{`.*`, "hello\xffworld", build(1, 0, 11)},
{`\x{fffd}`, "\xc2\x00", build(1, 0, 1)},
{"[\ufffd]", "\xff", build(1, 0, 1)},
{`[\x{fffd}]`, "\xc2\x00", build(1, 0, 1)},
// long set of matches (longer than startSize)
{
".",
"qwertyuiopasdfghjklzxcvbnm1234567890",
build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20,
20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30,
30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36),
},
}
// build is a helper to construct a [][]int by extracting n sequences from x.
// This represents n matches with len(x)/n submatches each.
func build(n int, x ...int) [][]int {
ret := make([][]int, n)
runLength := len(x) / n
j := 0
for i := range ret {
ret[i] = make([]int, runLength)
copy(ret[i], x[j:])
j += runLength
if j > len(x) {
panic("invalid build entry")
}
}
return ret
}
// First the simple cases.
func TestFind(t *testing.T) {
for _, test := range findTests {
re := MustCompile(test.pat)
if re.String() != test.pat {
t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat)
}
result := re.Find([]byte(test.text))
switch {
case len(test.matches) == 0 && len(result) == 0:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
expect := test.text[test.matches[0][0]:test.matches[0][1]]
if len(result) != cap(result) {
t.Errorf("expected capacity %d got %d: %s", len(result), cap(result), test)
}
if expect != string(result) {
t.Errorf("expected %q got %q: %s", expect, result, test)
}
}
}
}
func TestFindString(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindString(test.text)
switch {
case len(test.matches) == 0 && len(result) == 0:
// ok
case test.matches == nil && result != "":
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == "":
// Tricky because an empty result has two meanings: no match or empty match.
if test.matches[0][0] != test.matches[0][1] {
t.Errorf("expected match; got none: %s", test)
}
case test.matches != nil && result != "":
expect := test.text[test.matches[0][0]:test.matches[0][1]]
if expect != result {
t.Errorf("expected %q got %q: %s", expect, result, test)
}
}
}
}
func testFindIndex(test *FindTest, result []int, t *testing.T) {
switch {
case len(test.matches) == 0 && len(result) == 0:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
expect := test.matches[0]
if expect[0] != result[0] || expect[1] != result[1] {
t.Errorf("expected %v got %v: %s", expect, result, test)
}
}
}
func TestFindIndex(t *testing.T) {
for _, test := range findTests {
testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t)
}
}
func TestFindStringIndex(t *testing.T) {
for _, test := range findTests {
testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t)
}
}
func TestFindReaderIndex(t *testing.T) {
for _, test := range findTests {
testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t)
}
}
// Now come the simple All cases.
func TestFindAll(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindAll([]byte(test.text), -1)
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Fatalf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
if len(test.matches) != len(result) {
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
continue
}
for k, e := range test.matches {
got := result[k]
if len(got) != cap(got) {
t.Errorf("match %d: expected capacity %d got %d: %s", k, len(got), cap(got), test)
}
expect := test.text[e[0]:e[1]]
if expect != string(got) {
t.Errorf("match %d: expected %q got %q: %s", k, expect, got, test)
}
}
}
}
}
func TestFindAllString(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindAllString(test.text, -1)
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
if len(test.matches) != len(result) {
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
continue
}
for k, e := range test.matches {
expect := test.text[e[0]:e[1]]
if expect != result[k] {
t.Errorf("expected %q got %q: %s", expect, result, test)
}
}
}
}
}
func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) {
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
if len(test.matches) != len(result) {
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
return
}
for k, e := range test.matches {
if e[0] != result[k][0] || e[1] != result[k][1] {
t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test)
}
}
}
}
func TestFindAllIndex(t *testing.T) {
for _, test := range findTests {
testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t)
}
}
func TestFindAllStringIndex(t *testing.T) {
for _, test := range findTests {
testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t)
}
}
// Now come the Submatch cases.
func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) {
if len(submatches) != len(result)*2 {
t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
return
}
for k := 0; k < len(submatches); k += 2 {
if submatches[k] == -1 {
if result[k/2] != nil {
t.Errorf("match %d: expected nil got %q: %s", n, result, test)
}
continue
}
got := result[k/2]
if len(got) != cap(got) {
t.Errorf("match %d: expected capacity %d got %d: %s", n, len(got), cap(got), test)
return
}
expect := test.text[submatches[k]:submatches[k+1]]
if expect != string(got) {
t.Errorf("match %d: expected %q got %q: %s", n, expect, got, test)
return
}
}
}
func TestFindSubmatch(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindSubmatch([]byte(test.text))
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
testSubmatchBytes(&test, 0, test.matches[0], result, t)
}
}
}
func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) {
if len(submatches) != len(result)*2 {
t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
return
}
for k := 0; k < len(submatches); k += 2 {
if submatches[k] == -1 {
if result[k/2] != "" {
t.Errorf("match %d: expected nil got %q: %s", n, result, test)
}
continue
}
expect := test.text[submatches[k]:submatches[k+1]]
if expect != result[k/2] {
t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
return
}
}
}
func TestFindStringSubmatch(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindStringSubmatch(test.text)
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
testSubmatchString(&test, 0, test.matches[0], result, t)
}
}
}
func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) {
if len(expect) != len(result) {
t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test)
return
}
for k, e := range expect {
if e != result[k] {
t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test)
}
}
}
func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) {
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
testSubmatchIndices(test, 0, test.matches[0], result, t)
}
}
func TestFindSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t)
}
}
func TestFindStringSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t)
}
}
func TestFindReaderSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t)
}
}
// Now come the monster AllSubmatch cases.
func TestFindAllSubmatch(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1)
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case len(test.matches) != len(result):
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
case test.matches != nil && result != nil:
for k, match := range test.matches {
testSubmatchBytes(&test, k, match, result[k], t)
}
}
}
}
func TestFindAllStringSubmatch(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1)
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case len(test.matches) != len(result):
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
case test.matches != nil && result != nil:
for k, match := range test.matches {
testSubmatchString(&test, k, match, result[k], t)
}
}
}
}
func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) {
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case len(test.matches) != len(result):
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
case test.matches != nil && result != nil:
for k, match := range test.matches {
testSubmatchIndices(test, k, match, result[k], t)
}
}
}
func TestFindAllSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t)
}
}
func TestFindAllStringSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t)
}
}

500
src/regexp/onepass.go Normal file
View File

@@ -0,0 +1,500 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"regexp/syntax"
"slices"
"strings"
"unicode"
"unicode/utf8"
)
// "One-pass" regexp execution.
// Some regexps can be analyzed to determine that they never need
// backtracking: they are guaranteed to run in one pass over the string
// without bothering to save all the usual NFA state.
// Detect those and execute them more quickly.
// A onePassProg is a compiled one-pass regular expression program.
// It is the same as syntax.Prog except for the use of onePassInst.
type onePassProg struct {
Inst []onePassInst
Start int // index of start instruction
NumCap int // number of InstCapture insts in re
}
// A onePassInst is a single instruction in a one-pass regular expression program.
// It is the same as syntax.Inst except for the new 'Next' field.
type onePassInst struct {
syntax.Inst
Next []uint32
}
// onePassPrefix returns a literal string that all matches for the
// regexp must start with. Complete is true if the prefix
// is the entire match. Pc is the index of the last rune instruction
// in the string. The onePassPrefix skips over the mandatory
// EmptyBeginText.
func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
i := &p.Inst[p.Start]
if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
return "", i.Op == syntax.InstMatch, uint32(p.Start)
}
pc = i.Out
i = &p.Inst[pc]
for i.Op == syntax.InstNop {
pc = i.Out
i = &p.Inst[pc]
}
// Avoid allocation of buffer if prefix is empty.
if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
return "", i.Op == syntax.InstMatch, uint32(p.Start)
}
// Have prefix; gather characters.
var buf strings.Builder
for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError {
buf.WriteRune(i.Rune[0])
pc, i = i.Out, &p.Inst[i.Out]
}
if i.Op == syntax.InstEmptyWidth &&
syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
p.Inst[i.Out].Op == syntax.InstMatch {
complete = true
}
return buf.String(), complete, pc
}
// onePassNext selects the next actionable state of the prog, based on the input character.
// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
// One of the alternates may ultimately lead without input to end of line. If the instruction
// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
func onePassNext(i *onePassInst, r rune) uint32 {
next := i.MatchRunePos(r)
if next >= 0 {
return i.Next[next]
}
if i.Op == syntax.InstAltMatch {
return i.Out
}
return 0
}
func iop(i *syntax.Inst) syntax.InstOp {
op := i.Op
switch op {
case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
op = syntax.InstRune
}
return op
}
// Sparse Array implementation is used as a queueOnePass.
type queueOnePass struct {
sparse []uint32
dense []uint32
size, nextIndex uint32
}
func (q *queueOnePass) empty() bool {
return q.nextIndex >= q.size
}
func (q *queueOnePass) next() (n uint32) {
n = q.dense[q.nextIndex]
q.nextIndex++
return
}
func (q *queueOnePass) clear() {
q.size = 0
q.nextIndex = 0
}
func (q *queueOnePass) contains(u uint32) bool {
if u >= uint32(len(q.sparse)) {
return false
}
return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
}
func (q *queueOnePass) insert(u uint32) {
if !q.contains(u) {
q.insertNew(u)
}
}
func (q *queueOnePass) insertNew(u uint32) {
if u >= uint32(len(q.sparse)) {
return
}
q.sparse[u] = q.size
q.dense[q.size] = u
q.size++
}
func newQueue(size int) (q *queueOnePass) {
return &queueOnePass{
sparse: make([]uint32, size),
dense: make([]uint32, size),
}
}
// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
// NextIp array with the single element mergeFailed is returned.
// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
const mergeFailed = uint32(0xffffffff)
var (
noRune = []rune{}
noNext = []uint32{mergeFailed}
)
func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
leftLen := len(*leftRunes)
rightLen := len(*rightRunes)
if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
panic("mergeRuneSets odd length []rune")
}
var (
lx, rx int
)
merged := make([]rune, 0)
next := make([]uint32, 0)
ok := true
defer func() {
if !ok {
merged = nil
next = nil
}
}()
ix := -1
extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
return false
}
merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
*newLow += 2
ix += 2
next = append(next, pc)
return true
}
for lx < leftLen || rx < rightLen {
switch {
case rx >= rightLen:
ok = extend(&lx, leftRunes, leftPC)
case lx >= leftLen:
ok = extend(&rx, rightRunes, rightPC)
case (*rightRunes)[rx] < (*leftRunes)[lx]:
ok = extend(&rx, rightRunes, rightPC)
default:
ok = extend(&lx, leftRunes, leftPC)
}
if !ok {
return noRune, noNext
}
}
return merged, next
}
// cleanupOnePass drops working memory, and restores certain shortcut instructions.
func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
for ix, instOriginal := range original.Inst {
switch instOriginal.Op {
case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
prog.Inst[ix].Next = nil
case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
prog.Inst[ix].Next = nil
prog.Inst[ix] = onePassInst{Inst: instOriginal}
}
}
}
// onePassCopy creates a copy of the original Prog, as we'll be modifying it.
func onePassCopy(prog *syntax.Prog) *onePassProg {
p := &onePassProg{
Start: prog.Start,
NumCap: prog.NumCap,
Inst: make([]onePassInst, len(prog.Inst)),
}
for i, inst := range prog.Inst {
p.Inst[i] = onePassInst{Inst: inst}
}
// rewrites one or more common Prog constructs that enable some otherwise
// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
// ip A, that points to ips B & C.
// A:BC + B:DA => A:BC + B:CD
// A:BC + B:DC => A:DC + B:DC
for pc := range p.Inst {
switch p.Inst[pc].Op {
default:
continue
case syntax.InstAlt, syntax.InstAltMatch:
// A:Bx + B:Ay
p_A_Other := &p.Inst[pc].Out
p_A_Alt := &p.Inst[pc].Arg
// make sure a target is another Alt
instAlt := p.Inst[*p_A_Alt]
if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
instAlt = p.Inst[*p_A_Alt]
if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
continue
}
}
instOther := p.Inst[*p_A_Other]
// Analyzing both legs pointing to Alts is for another day
if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
// too complicated
continue
}
// simple empty transition loop
// A:BC + B:DA => A:BC + B:DC
p_B_Alt := &p.Inst[*p_A_Alt].Out
p_B_Other := &p.Inst[*p_A_Alt].Arg
patch := false
if instAlt.Out == uint32(pc) {
patch = true
} else if instAlt.Arg == uint32(pc) {
patch = true
p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
}
if patch {
*p_B_Alt = *p_A_Other
}
// empty transition to common target
// A:BC + B:DC => A:DC + B:DC
if *p_A_Other == *p_B_Alt {
*p_A_Alt = *p_B_Other
}
}
}
return p
}
var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
var anyRune = []rune{0, unicode.MaxRune}
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
// the match engine can always tell which branch to take. The routine may modify
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
// onepass Prog, the Prog nil is returned. makeOnePass is recursive
// to the size of the Prog.
func makeOnePass(p *onePassProg) *onePassProg {
// If the machine is very long, it's not worth the time to check if we can use one pass.
if len(p.Inst) >= 1000 {
return nil
}
var (
instQueue = newQueue(len(p.Inst))
visitQueue = newQueue(len(p.Inst))
check func(uint32, []bool) bool
onePassRunes = make([][]rune, len(p.Inst))
)
// check that paths from Alt instructions are unambiguous, and rebuild the new
// program as a onepass program
check = func(pc uint32, m []bool) (ok bool) {
ok = true
inst := &p.Inst[pc]
if visitQueue.contains(pc) {
return
}
visitQueue.insert(pc)
switch inst.Op {
case syntax.InstAlt, syntax.InstAltMatch:
ok = check(inst.Out, m) && check(inst.Arg, m)
// check no-input paths to InstMatch
matchOut := m[inst.Out]
matchArg := m[inst.Arg]
if matchOut && matchArg {
ok = false
break
}
// Match on empty goes in inst.Out
if matchArg {
inst.Out, inst.Arg = inst.Arg, inst.Out
matchOut, matchArg = matchArg, matchOut
}
if matchOut {
m[pc] = true
inst.Op = syntax.InstAltMatch
}
// build a dispatch operator from the two legs of the alt.
onePassRunes[pc], inst.Next = mergeRuneSets(
&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
ok = false
break
}
case syntax.InstCapture, syntax.InstNop:
ok = check(inst.Out, m)
m[pc] = m[inst.Out]
// pass matching runes back through these no-ops.
onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
for i := range inst.Next {
inst.Next[i] = inst.Out
}
case syntax.InstEmptyWidth:
ok = check(inst.Out, m)
m[pc] = m[inst.Out]
onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
for i := range inst.Next {
inst.Next[i] = inst.Out
}
case syntax.InstMatch, syntax.InstFail:
m[pc] = inst.Op == syntax.InstMatch
case syntax.InstRune:
m[pc] = false
if len(inst.Next) > 0 {
break
}
instQueue.insert(inst.Out)
if len(inst.Rune) == 0 {
onePassRunes[pc] = []rune{}
inst.Next = []uint32{inst.Out}
break
}
runes := make([]rune, 0)
if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
r0 := inst.Rune[0]
runes = append(runes, r0, r0)
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
runes = append(runes, r1, r1)
}
slices.Sort(runes)
} else {
runes = append(runes, inst.Rune...)
}
onePassRunes[pc] = runes
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
for i := range inst.Next {
inst.Next[i] = inst.Out
}
inst.Op = syntax.InstRune
case syntax.InstRune1:
m[pc] = false
if len(inst.Next) > 0 {
break
}
instQueue.insert(inst.Out)
runes := []rune{}
// expand case-folded runes
if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
r0 := inst.Rune[0]
runes = append(runes, r0, r0)
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
runes = append(runes, r1, r1)
}
slices.Sort(runes)
} else {
runes = append(runes, inst.Rune[0], inst.Rune[0])
}
onePassRunes[pc] = runes
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
for i := range inst.Next {
inst.Next[i] = inst.Out
}
inst.Op = syntax.InstRune
case syntax.InstRuneAny:
m[pc] = false
if len(inst.Next) > 0 {
break
}
instQueue.insert(inst.Out)
onePassRunes[pc] = append([]rune{}, anyRune...)
inst.Next = []uint32{inst.Out}
case syntax.InstRuneAnyNotNL:
m[pc] = false
if len(inst.Next) > 0 {
break
}
instQueue.insert(inst.Out)
onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
for i := range inst.Next {
inst.Next[i] = inst.Out
}
}
return
}
instQueue.clear()
instQueue.insert(uint32(p.Start))
m := make([]bool, len(p.Inst))
for !instQueue.empty() {
visitQueue.clear()
pc := instQueue.next()
if !check(pc, m) {
p = nil
break
}
}
if p != nil {
for i := range p.Inst {
p.Inst[i].Rune = onePassRunes[i]
}
}
return p
}
// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
// can be recharacterized as a one-pass regexp program, or syntax.nil if the
// Prog cannot be converted. For a one pass prog, the fundamental condition that must
// be true is: at any InstAlt, there must be no ambiguity about what branch to take.
func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
if prog.Start == 0 {
return nil
}
// onepass regexp is anchored
if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
return nil
}
// every instruction leading to InstMatch must be EmptyEndText
for _, inst := range prog.Inst {
opOut := prog.Inst[inst.Out].Op
switch inst.Op {
default:
if opOut == syntax.InstMatch {
return nil
}
case syntax.InstAlt, syntax.InstAltMatch:
if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
return nil
}
case syntax.InstEmptyWidth:
if opOut == syntax.InstMatch {
if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
continue
}
return nil
}
}
}
// Creates a slightly optimized copy of the original Prog
// that cleans up some Prog idioms that block valid onepass programs
p = onePassCopy(prog)
// checkAmbiguity on InstAlts, build onepass Prog if possible
p = makeOnePass(p)
if p != nil {
cleanupOnePass(p, prog)
}
return p
}

225
src/regexp/onepass_test.go Normal file
View File

@@ -0,0 +1,225 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"regexp/syntax"
"slices"
"strings"
"testing"
)
var runeMergeTests = []struct {
left, right, merged []rune
next []uint32
leftPC, rightPC uint32
}{
{
// empty rhs
[]rune{69, 69},
[]rune{},
[]rune{69, 69},
[]uint32{1},
1, 2,
},
{
// identical runes, identical targets
[]rune{69, 69},
[]rune{69, 69},
[]rune{},
[]uint32{mergeFailed},
1, 1,
},
{
// identical runes, different targets
[]rune{69, 69},
[]rune{69, 69},
[]rune{},
[]uint32{mergeFailed},
1, 2,
},
{
// append right-first
[]rune{69, 69},
[]rune{71, 71},
[]rune{69, 69, 71, 71},
[]uint32{1, 2},
1, 2,
},
{
// append, left-first
[]rune{71, 71},
[]rune{69, 69},
[]rune{69, 69, 71, 71},
[]uint32{2, 1},
1, 2,
},
{
// successful interleave
[]rune{60, 60, 71, 71, 101, 101},
[]rune{69, 69, 88, 88},
[]rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101},
[]uint32{1, 2, 1, 2, 1},
1, 2,
},
{
// left surrounds right
[]rune{69, 74},
[]rune{71, 71},
[]rune{},
[]uint32{mergeFailed},
1, 2,
},
{
// right surrounds left
[]rune{69, 74},
[]rune{68, 75},
[]rune{},
[]uint32{mergeFailed},
1, 2,
},
{
// overlap at interval begin
[]rune{69, 74},
[]rune{74, 75},
[]rune{},
[]uint32{mergeFailed},
1, 2,
},
{
// overlap ar interval end
[]rune{69, 74},
[]rune{65, 69},
[]rune{},
[]uint32{mergeFailed},
1, 2,
},
{
// overlap from above
[]rune{69, 74},
[]rune{71, 74},
[]rune{},
[]uint32{mergeFailed},
1, 2,
},
{
// overlap from below
[]rune{69, 74},
[]rune{65, 71},
[]rune{},
[]uint32{mergeFailed},
1, 2,
},
{
// out of order []rune
[]rune{69, 74, 60, 65},
[]rune{66, 67},
[]rune{},
[]uint32{mergeFailed},
1, 2,
},
}
func TestMergeRuneSet(t *testing.T) {
for ix, test := range runeMergeTests {
merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC)
if !slices.Equal(merged, test.merged) {
t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged)
}
if !slices.Equal(next, test.next) {
t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next)
}
}
}
var onePassTests = []struct {
re string
isOnePass bool
}{
{`^(?:a|(?:a*))$`, false},
{`^(?:(a)|(?:a*))$`, false},
{`^(?:(?:(?:.(?:$))?))$`, true},
{`^abcd$`, true},
{`^(?:(?:a{0,})*?)$`, false},
{`^(?:(?:a+)*)$`, true},
{`^(?:(?:a|(?:aa)))$`, true},
{`^(?:[^\s\S])$`, true},
{`^(?:(?:a{3,4}){0,})$`, false},
{`^(?:(?:(?:a*)+))$`, true},
{`^[a-c]+$`, true},
{`^[a-c]*$`, true},
{`^(?:a*)$`, true},
{`^(?:(?:aa)|a)$`, true},
{`^[a-c]*`, false},
{`^...$`, true},
{`^(?:a|(?:aa))$`, true},
{`^a((b))c$`, true},
{`^a.[l-nA-Cg-j]?e$`, true},
{`^a((b))$`, true},
{`^a(?:(b)|(c))c$`, true},
{`^a(?:(b*)|(c))c$`, false},
{`^a(?:b|c)$`, true},
{`^a(?:b?|c)$`, true},
{`^a(?:b?|c?)$`, false},
{`^a(?:b?|c+)$`, true},
{`^a(?:b+|(bc))d$`, false},
{`^a(?:bc)+$`, true},
{`^a(?:[bcd])+$`, true},
{`^a((?:[bcd])+)$`, true},
{`^a(:?b|c)*d$`, true},
{`^.bc(d|e)*$`, true},
{`^(?:(?:aa)|.)$`, false},
{`^(?:(?:a{1,2}){1,2})$`, false},
{`^l` + strings.Repeat("o", 2<<8) + `ng$`, true},
}
func TestCompileOnePass(t *testing.T) {
var (
p *syntax.Prog
re *syntax.Regexp
err error
)
for _, test := range onePassTests {
if re, err = syntax.Parse(test.re, syntax.Perl); err != nil {
t.Errorf("Parse(%q) got err:%s, want success", test.re, err)
continue
}
// needs to be done before compile...
re = re.Simplify()
if p, err = syntax.Compile(re); err != nil {
t.Errorf("Compile(%q) got err:%s, want success", test.re, err)
continue
}
isOnePass := compileOnePass(p) != nil
if isOnePass != test.isOnePass {
t.Errorf("CompileOnePass(%q) got isOnePass=%v, expected %v", test.re, isOnePass, test.isOnePass)
}
}
}
// TODO(cespare): Unify with onePassTests and rationalize one-pass test cases.
var onePassTests1 = []struct {
re string
match string
}{
{`^a(/b+(#c+)*)*$`, "a/b#c"}, // golang.org/issue/11905
}
func TestRunOnePass(t *testing.T) {
for _, test := range onePassTests1 {
re, err := Compile(test.re)
if err != nil {
t.Errorf("Compile(%q): got err: %s", test.re, err)
continue
}
if re.onepass == nil {
t.Errorf("Compile(%q): got nil, want one-pass", test.re)
continue
}
if !re.MatchString(test.match) {
t.Errorf("onepass %q did not match %q", test.re, test.match)
}
}
}

1299
src/regexp/regexp.go Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,296 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import "unicode"
// A patchList is a list of instruction pointers that need to be filled in (patched).
// Because the pointers haven't been filled in yet, we can reuse their storage
// to hold the list. It's kind of sleazy, but works well in practice.
// See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
//
// These aren't really pointers: they're integers, so we can reinterpret them
// this way without using package unsafe. A value l.head denotes
// p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1).
// head == 0 denotes the empty list, okay because we start every program
// with a fail instruction, so we'll never want to point at its output link.
type patchList struct {
head, tail uint32
}
func makePatchList(n uint32) patchList {
return patchList{n, n}
}
func (l patchList) patch(p *Prog, val uint32) {
head := l.head
for head != 0 {
i := &p.Inst[head>>1]
if head&1 == 0 {
head = i.Out
i.Out = val
} else {
head = i.Arg
i.Arg = val
}
}
}
func (l1 patchList) append(p *Prog, l2 patchList) patchList {
if l1.head == 0 {
return l2
}
if l2.head == 0 {
return l1
}
i := &p.Inst[l1.tail>>1]
if l1.tail&1 == 0 {
i.Out = l2.head
} else {
i.Arg = l2.head
}
return patchList{l1.head, l2.tail}
}
// A frag represents a compiled program fragment.
type frag struct {
i uint32 // index of first instruction
out patchList // where to record end instruction
nullable bool // whether fragment can match empty string
}
type compiler struct {
p *Prog
}
// Compile compiles the regexp into a program to be executed.
// The regexp should have been simplified already (returned from re.Simplify).
func Compile(re *Regexp) (*Prog, error) {
var c compiler
c.init()
f := c.compile(re)
f.out.patch(c.p, c.inst(InstMatch).i)
c.p.Start = int(f.i)
return c.p, nil
}
func (c *compiler) init() {
c.p = new(Prog)
c.p.NumCap = 2 // implicit ( and ) for whole match $0
c.inst(InstFail)
}
var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
var anyRune = []rune{0, unicode.MaxRune}
func (c *compiler) compile(re *Regexp) frag {
switch re.Op {
case OpNoMatch:
return c.fail()
case OpEmptyMatch:
return c.nop()
case OpLiteral:
if len(re.Rune) == 0 {
return c.nop()
}
var f frag
for j := range re.Rune {
f1 := c.rune(re.Rune[j:j+1], re.Flags)
if j == 0 {
f = f1
} else {
f = c.cat(f, f1)
}
}
return f
case OpCharClass:
return c.rune(re.Rune, re.Flags)
case OpAnyCharNotNL:
return c.rune(anyRuneNotNL, 0)
case OpAnyChar:
return c.rune(anyRune, 0)
case OpBeginLine:
return c.empty(EmptyBeginLine)
case OpEndLine:
return c.empty(EmptyEndLine)
case OpBeginText:
return c.empty(EmptyBeginText)
case OpEndText:
return c.empty(EmptyEndText)
case OpWordBoundary:
return c.empty(EmptyWordBoundary)
case OpNoWordBoundary:
return c.empty(EmptyNoWordBoundary)
case OpCapture:
bra := c.cap(uint32(re.Cap << 1))
sub := c.compile(re.Sub[0])
ket := c.cap(uint32(re.Cap<<1 | 1))
return c.cat(c.cat(bra, sub), ket)
case OpStar:
return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
case OpPlus:
return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
case OpQuest:
return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
case OpConcat:
if len(re.Sub) == 0 {
return c.nop()
}
var f frag
for i, sub := range re.Sub {
if i == 0 {
f = c.compile(sub)
} else {
f = c.cat(f, c.compile(sub))
}
}
return f
case OpAlternate:
var f frag
for _, sub := range re.Sub {
f = c.alt(f, c.compile(sub))
}
return f
}
panic("regexp: unhandled case in compile")
}
func (c *compiler) inst(op InstOp) frag {
// TODO: impose length limit
f := frag{i: uint32(len(c.p.Inst)), nullable: true}
c.p.Inst = append(c.p.Inst, Inst{Op: op})
return f
}
func (c *compiler) nop() frag {
f := c.inst(InstNop)
f.out = makePatchList(f.i << 1)
return f
}
func (c *compiler) fail() frag {
return frag{}
}
func (c *compiler) cap(arg uint32) frag {
f := c.inst(InstCapture)
f.out = makePatchList(f.i << 1)
c.p.Inst[f.i].Arg = arg
if c.p.NumCap < int(arg)+1 {
c.p.NumCap = int(arg) + 1
}
return f
}
func (c *compiler) cat(f1, f2 frag) frag {
// concat of failure is failure
if f1.i == 0 || f2.i == 0 {
return frag{}
}
// TODO: elide nop
f1.out.patch(c.p, f2.i)
return frag{f1.i, f2.out, f1.nullable && f2.nullable}
}
func (c *compiler) alt(f1, f2 frag) frag {
// alt of failure is other
if f1.i == 0 {
return f2
}
if f2.i == 0 {
return f1
}
f := c.inst(InstAlt)
i := &c.p.Inst[f.i]
i.Out = f1.i
i.Arg = f2.i
f.out = f1.out.append(c.p, f2.out)
f.nullable = f1.nullable || f2.nullable
return f
}
func (c *compiler) quest(f1 frag, nongreedy bool) frag {
f := c.inst(InstAlt)
i := &c.p.Inst[f.i]
if nongreedy {
i.Arg = f1.i
f.out = makePatchList(f.i << 1)
} else {
i.Out = f1.i
f.out = makePatchList(f.i<<1 | 1)
}
f.out = f.out.append(c.p, f1.out)
return f
}
// loop returns the fragment for the main loop of a plus or star.
// For plus, it can be used after changing the entry to f1.i.
// For star, it can be used directly when f1 can't match an empty string.
// (When f1 can match an empty string, f1* must be implemented as (f1+)?
// to get the priority match order correct.)
func (c *compiler) loop(f1 frag, nongreedy bool) frag {
f := c.inst(InstAlt)
i := &c.p.Inst[f.i]
if nongreedy {
i.Arg = f1.i
f.out = makePatchList(f.i << 1)
} else {
i.Out = f1.i
f.out = makePatchList(f.i<<1 | 1)
}
f1.out.patch(c.p, f.i)
return f
}
func (c *compiler) star(f1 frag, nongreedy bool) frag {
if f1.nullable {
// Use (f1+)? to get priority match order correct.
// See golang.org/issue/46123.
return c.quest(c.plus(f1, nongreedy), nongreedy)
}
return c.loop(f1, nongreedy)
}
func (c *compiler) plus(f1 frag, nongreedy bool) frag {
return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable}
}
func (c *compiler) empty(op EmptyOp) frag {
f := c.inst(InstEmptyWidth)
c.p.Inst[f.i].Arg = uint32(op)
f.out = makePatchList(f.i << 1)
return f
}
func (c *compiler) rune(r []rune, flags Flags) frag {
f := c.inst(InstRune)
f.nullable = false
i := &c.p.Inst[f.i]
i.Rune = r
flags &= FoldCase // only relevant flag is FoldCase
if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
// and sometimes not even that
flags &^= FoldCase
}
i.Arg = uint32(flags)
f.out = makePatchList(f.i << 1)
// Special cases for exec machine.
switch {
case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
i.Op = InstRune1
case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
i.Op = InstRuneAny
case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
i.Op = InstRuneAnyNotNL
}
return f
}

142
src/regexp/syntax/doc.go Normal file
View File

@@ -0,0 +1,142 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by mksyntaxgo from the RE2 distribution. DO NOT EDIT.
/*
Package syntax parses regular expressions into parse trees and compiles
parse trees into programs. Most clients of regular expressions will use the
facilities of package [regexp] (such as [regexp.Compile] and [regexp.Match]) instead of this package.
# Syntax
The regular expression syntax understood by this package when parsing with the [Perl] flag is as follows.
Parts of the syntax can be disabled by passing alternate flags to [Parse].
Single characters:
. any character, possibly including newline (flag s=true)
[xyz] character class
[^xyz] negated character class
\d Perl character class
\D negated Perl character class
[[:alpha:]] ASCII character class
[[:^alpha:]] negated ASCII character class
\pN Unicode character class (one-letter name)
\p{Greek} Unicode character class
\PN negated Unicode character class (one-letter name)
\P{Greek} negated Unicode character class
Composites:
xy x followed by y
x|y x or y (prefer x)
Repetitions:
x* zero or more x, prefer more
x+ one or more x, prefer more
x? zero or one x, prefer one
x{n,m} n or n+1 or ... or m x, prefer more
x{n,} n or more x, prefer more
x{n} exactly n x
x*? zero or more x, prefer fewer
x+? one or more x, prefer fewer
x?? zero or one x, prefer zero
x{n,m}? n or n+1 or ... or m x, prefer fewer
x{n,}? n or more x, prefer fewer
x{n}? exactly n x
Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n}
reject forms that create a minimum or maximum repetition count above 1000.
Unlimited repetitions are not subject to this restriction.
Grouping:
(re) numbered capturing group (submatch)
(?P<name>re) named & numbered capturing group (submatch)
(?<name>re) named & numbered capturing group (submatch)
(?:re) non-capturing group
(?flags) set flags within current group; non-capturing
(?flags:re) set flags during re; non-capturing
Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:
i case-insensitive (default false)
m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false)
s let . match \n (default false)
U ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false)
Empty strings:
^ at beginning of text or line (flag m=true)
$ at end of text (like \z not \Z) or line (flag m=true)
\A at beginning of text
\b at ASCII word boundary (\w on one side and \W, \A, or \z on the other)
\B not at ASCII word boundary
\z at end of text
Escape sequences:
\a bell (== \007)
\f form feed (== \014)
\t horizontal tab (== \011)
\n newline (== \012)
\r carriage return (== \015)
\v vertical tab character (== \013)
\* literal *, for any punctuation character *
\123 octal character code (up to three digits)
\x7F hex character code (exactly two digits)
\x{10FFFF} hex character code
\Q...\E literal text ... even if ... has punctuation
Character class elements:
x single character
A-Z character range (inclusive)
\d Perl character class
[:foo:] ASCII character class foo
\p{Foo} Unicode character class Foo
\pF Unicode character class F (one-letter name)
Named character classes as character class elements:
[\d] digits (== \d)
[^\d] not digits (== \D)
[\D] not digits (== \D)
[^\D] not not digits (== \d)
[[:name:]] named ASCII class inside character class (== [:name:])
[^[:name:]] named ASCII class inside negated character class (== [:^name:])
[\p{Name}] named Unicode property inside character class (== \p{Name})
[^\p{Name}] named Unicode property inside negated character class (== \P{Name})
Perl character classes (all ASCII-only):
\d digits (== [0-9])
\D not digits (== [^0-9])
\s whitespace (== [\t\n\f\r ])
\S not whitespace (== [^\t\n\f\r ])
\w word characters (== [0-9A-Za-z_])
\W not word characters (== [^0-9A-Za-z_])
ASCII character classes:
[[:alnum:]] alphanumeric (== [0-9A-Za-z])
[[:alpha:]] alphabetic (== [A-Za-z])
[[:ascii:]] ASCII (== [\x00-\x7F])
[[:blank:]] blank (== [\t ])
[[:cntrl:]] control (== [\x00-\x1F\x7F])
[[:digit:]] digits (== [0-9])
[[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
[[:lower:]] lower case (== [a-z])
[[:print:]] printable (== [ -~] == [ [:graph:]])
[[:punct:]] punctuation (== [!-/:-@[-`{-~])
[[:space:]] whitespace (== [\t\n\v\f\r ])
[[:upper:]] upper case (== [A-Z])
[[:word:]] word characters (== [0-9A-Za-z_])
[[:xdigit:]] hex digit (== [0-9A-Fa-f])
Unicode character classes are those in [unicode.Categories] and [unicode.Scripts].
*/
package syntax

View File

@@ -0,0 +1,128 @@
#!/usr/bin/perl
# Copyright 2008 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Modified version of RE2's make_perl_groups.pl.
# Generate table entries giving character ranges
# for POSIX/Perl character classes. Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.
use strict;
use warnings;
my @posixclasses = (
"[:alnum:]",
"[:alpha:]",
"[:ascii:]",
"[:blank:]",
"[:cntrl:]",
"[:digit:]",
"[:graph:]",
"[:lower:]",
"[:print:]",
"[:punct:]",
"[:space:]",
"[:upper:]",
"[:word:]",
"[:xdigit:]",
);
my @perlclasses = (
"\\d",
"\\s",
"\\w",
);
my %overrides = (
# Prior to Perl 5.18, \s did not match vertical tab.
# RE2 preserves that original behaviour.
"\\s:11" => 0,
);
sub ComputeClass($) {
my @ranges;
my ($class) = @_;
my $regexp = "[$class]";
my $start = -1;
for (my $i=0; $i<=129; $i++) {
if ($i == 129) { $i = 256; }
if ($i <= 128 && ($overrides{"$class:$i"} // chr($i) =~ $regexp)) {
if ($start < 0) {
$start = $i;
}
} else {
if ($start >= 0) {
push @ranges, [$start, $i-1];
}
$start = -1;
}
}
return @ranges;
}
sub PrintClass($$@) {
my ($cname, $name, @ranges) = @_;
print "var code$cname = []rune{ /* $name */\n";
for (my $i=0; $i<@ranges; $i++) {
my @a = @{$ranges[$i]};
printf "\t0x%x, 0x%x,\n", $a[0], $a[1];
}
print "}\n\n";
my $n = @ranges;
my $negname = $name;
if ($negname =~ /:/) {
$negname =~ s/:/:^/;
} else {
$negname =~ y/a-z/A-Z/;
}
return "\t`$name`: {+1, code$cname},\n" .
"\t`$negname`: {-1, code$cname},\n";
}
my $gen = 0;
sub PrintClasses($@) {
my ($cname, @classes) = @_;
my @entries;
foreach my $cl (@classes) {
my @ranges = ComputeClass($cl);
push @entries, PrintClass(++$gen, $cl, @ranges);
}
print "var ${cname}Group = map[string]charGroup{\n";
foreach my $e (@entries) {
print $e;
}
print "}\n";
my $count = @entries;
}
# Prepare gofmt command
my $gofmt;
if (@ARGV > 0 && $ARGV[0] =~ /\.go$/) {
# Send the output of gofmt to the given file
open($gofmt, '|-', 'gofmt >'.$ARGV[0]) or die;
} else {
open($gofmt, '|-', 'gofmt') or die;
}
# Redirect STDOUT to gofmt input
select $gofmt;
print <<EOF;
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by make_perl_groups.pl; DO NOT EDIT.
package syntax
EOF
PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);

View File

@@ -0,0 +1,52 @@
// Code generated by "stringer -type Op -trimprefix Op"; DO NOT EDIT.
package syntax
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[OpNoMatch-1]
_ = x[OpEmptyMatch-2]
_ = x[OpLiteral-3]
_ = x[OpCharClass-4]
_ = x[OpAnyCharNotNL-5]
_ = x[OpAnyChar-6]
_ = x[OpBeginLine-7]
_ = x[OpEndLine-8]
_ = x[OpBeginText-9]
_ = x[OpEndText-10]
_ = x[OpWordBoundary-11]
_ = x[OpNoWordBoundary-12]
_ = x[OpCapture-13]
_ = x[OpStar-14]
_ = x[OpPlus-15]
_ = x[OpQuest-16]
_ = x[OpRepeat-17]
_ = x[OpConcat-18]
_ = x[OpAlternate-19]
_ = x[opPseudo-128]
}
const (
_Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate"
_Op_name_1 = "opPseudo"
)
var (
_Op_index_0 = [...]uint8{0, 7, 17, 24, 33, 45, 52, 61, 68, 77, 84, 96, 110, 117, 121, 125, 130, 136, 142, 151}
)
func (i Op) String() string {
switch {
case 1 <= i && i <= 19:
i -= 1
return _Op_name_0[_Op_index_0[i]:_Op_index_0[i+1]]
case i == 128:
return _Op_name_1
default:
return "Op(" + strconv.FormatInt(int64(i), 10) + ")"
}
}

2134
src/regexp/syntax/parse.go Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,628 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import (
"fmt"
"strings"
"testing"
"unicode"
)
type parseTest struct {
Regexp string
Dump string
}
var parseTests = []parseTest{
// Base cases
{`a`, `lit{a}`},
{`a.`, `cat{lit{a}dot{}}`},
{`a.b`, `cat{lit{a}dot{}lit{b}}`},
{`ab`, `str{ab}`},
{`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
{`abc`, `str{abc}`},
{`a|^`, `alt{lit{a}bol{}}`},
{`a|b`, `cc{0x61-0x62}`},
{`(a)`, `cap{lit{a}}`},
{`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
{`a*`, `star{lit{a}}`},
{`a+`, `plus{lit{a}}`},
{`a?`, `que{lit{a}}`},
{`a{2}`, `rep{2,2 lit{a}}`},
{`a{2,3}`, `rep{2,3 lit{a}}`},
{`a{2,}`, `rep{2,-1 lit{a}}`},
{`a*?`, `nstar{lit{a}}`},
{`a+?`, `nplus{lit{a}}`},
{`a??`, `nque{lit{a}}`},
{`a{2}?`, `nrep{2,2 lit{a}}`},
{`a{2,3}?`, `nrep{2,3 lit{a}}`},
{`a{2,}?`, `nrep{2,-1 lit{a}}`},
// Malformed { } are treated as literals.
{`x{1001`, `str{x{1001}`},
{`x{9876543210`, `str{x{9876543210}`},
{`x{9876543210,`, `str{x{9876543210,}`},
{`x{2,1`, `str{x{2,1}`},
{`x{1,9876543210`, `str{x{1,9876543210}`},
{``, `emp{}`},
{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
{`|x|`, `alt{emp{}lit{x}emp{}}`},
{`.`, `dot{}`},
{`^`, `bol{}`},
{`$`, `eol{}`},
{`\|`, `lit{|}`},
{`\(`, `lit{(}`},
{`\)`, `lit{)}`},
{`\*`, `lit{*}`},
{`\+`, `lit{+}`},
{`\?`, `lit{?}`},
{`{`, `lit{{}`},
{`}`, `lit{}}`},
{`\.`, `lit{.}`},
{`\^`, `lit{^}`},
{`\$`, `lit{$}`},
{`\\`, `lit{\}`},
{`[ace]`, `cc{0x61 0x63 0x65}`},
{`[abc]`, `cc{0x61-0x63}`},
{`[a-z]`, `cc{0x61-0x7a}`},
{`[a]`, `lit{a}`},
{`\-`, `lit{-}`},
{`-`, `lit{-}`},
{`\_`, `lit{_}`},
{`abc`, `str{abc}`},
{`abc|def`, `alt{str{abc}str{def}}`},
{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
// Posix and Perl extensions
{`[[:lower:]]`, `cc{0x61-0x7a}`},
{`[a-z]`, `cc{0x61-0x7a}`},
{`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
{`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
{`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
{`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
{`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
{`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
{`\d`, `cc{0x30-0x39}`},
{`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
{`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
{`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
{`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
{`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
{`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
{`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
{`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
// { `\C`, `byte{}` }, // probably never
// Unicode, negatives, and a double negative.
{`\p{Braille}`, `cc{0x2800-0x28ff}`},
{`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
{`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
{`\P{^Braille}`, `cc{0x2800-0x28ff}`},
{`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
{`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
{`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
{`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
{`\p{Any}`, `dot{}`},
{`\p{^Any}`, `cc{}`},
// Hex, octal.
{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
{`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},
// More interesting regular expressions.
{`a{,2}`, `str{a{,2}}`},
{`\.\^\$\\`, `str{.^$\}`},
{`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
{`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
{`a*{`, `cat{star{lit{a}}lit{{}}`},
// Test precedences
{`(?:ab)*`, `star{str{ab}}`},
{`(ab)*`, `star{cap{str{ab}}}`},
{`ab|cd`, `alt{str{ab}str{cd}}`},
{`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},
// Test flattening.
{`(?:a)`, `lit{a}`},
{`(?:ab)(?:cd)`, `str{abcd}`},
{`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
{`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
{`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
{`a|.`, `dot{}`},
{`.|a`, `dot{}`},
{`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
{`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},
// Test Perl quoted literals
{`\Q+|*?{[\E`, `str{+|*?{[}`},
{`\Q+\E+`, `plus{lit{+}}`},
{`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`},
{`\Q\\E`, `lit{\}`},
{`\Q\\\E`, `str{\\}`},
// Test Perl \A and \z
{`(?m)^`, `bol{}`},
{`(?m)$`, `eol{}`},
{`(?-m)^`, `bot{}`},
{`(?-m)$`, `eot{}`},
{`(?m)\A`, `bot{}`},
{`(?m)\z`, `eot{\z}`},
{`(?-m)\A`, `bot{}`},
{`(?-m)\z`, `eot{\z}`},
// Test named captures
{`(?P<name>a)`, `cap{name:lit{a}}`},
{`(?<name>a)`, `cap{name:lit{a}}`},
// Case-folded literals
{`[Aa]`, `litfold{A}`},
{`[\x{100}\x{101}]`, `litfold{Ā}`},
{`[Δδ]`, `litfold{Δ}`},
// Strings
{`abcde`, `str{abcde}`},
{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
// Factoring.
{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},
// Bug fixes.
{`(?:.)`, `dot{}`},
{`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
{`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
{`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
{`(?:A|a)`, `litfold{A}`},
{`A|(?:A|a)`, `litfold{A}`},
{`(?s).`, `dot{}`},
{`(?-s).`, `dnl{}`},
{`(?:(?:^).)`, `cat{bol{}dot{}}`},
{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
{`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`},
// RE2 prefix_tests
{`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
{`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
{`abc|abd|aef|bcx|bcy`,
`alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
`cat{str{bc}cc{0x78-0x79}}}`},
{`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
{`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
{`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
{`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
{`x{2}|x{2}[0-9]`,
`cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
{`x{2}y|x{2}[0-9]y`,
`cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
{`a.*?c|a.*?b`,
`cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},
// Valid repetitions.
{`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
{`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``},
// Valid nesting.
{strings.Repeat("(", 999) + strings.Repeat(")", 999), ``},
{strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``},
{"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all
}
const testFlags = MatchNL | PerlX | UnicodeGroups
func TestParseSimple(t *testing.T) {
testParseDump(t, parseTests, testFlags)
}
var foldcaseTests = []parseTest{
{`AbCdE`, `strfold{ABCDE}`},
{`[Aa]`, `litfold{A}`},
{`a`, `litfold{A}`},
// 0x17F is an old English long s (looks like an f) and folds to s.
// 0x212A is the Kelvin symbol and folds to k.
{`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
{`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
{`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
}
func TestParseFoldCase(t *testing.T) {
testParseDump(t, foldcaseTests, FoldCase)
}
var literalTests = []parseTest{
{"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
}
func TestParseLiteral(t *testing.T) {
testParseDump(t, literalTests, Literal)
}
var matchnlTests = []parseTest{
{`.`, `dot{}`},
{"\n", "lit{\n}"},
{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
{`[a\n]`, `cc{0xa 0x61}`},
}
func TestParseMatchNL(t *testing.T) {
testParseDump(t, matchnlTests, MatchNL)
}
var nomatchnlTests = []parseTest{
{`.`, `dnl{}`},
{"\n", "lit{\n}"},
{`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
{`[a\n]`, `cc{0xa 0x61}`},
}
func TestParseNoMatchNL(t *testing.T) {
testParseDump(t, nomatchnlTests, 0)
}
// Test Parse -> Dump.
func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
for _, tt := range tests {
re, err := Parse(tt.Regexp, flags)
if err != nil {
t.Errorf("Parse(%#q): %v", tt.Regexp, err)
continue
}
if tt.Dump == "" {
// It parsed. That's all we care about.
continue
}
d := dump(re)
if d != tt.Dump {
t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
}
}
}
// dump prints a string representation of the regexp showing
// the structure explicitly.
func dump(re *Regexp) string {
var b strings.Builder
dumpRegexp(&b, re)
return b.String()
}
var opNames = []string{
OpNoMatch: "no",
OpEmptyMatch: "emp",
OpLiteral: "lit",
OpCharClass: "cc",
OpAnyCharNotNL: "dnl",
OpAnyChar: "dot",
OpBeginLine: "bol",
OpEndLine: "eol",
OpBeginText: "bot",
OpEndText: "eot",
OpWordBoundary: "wb",
OpNoWordBoundary: "nwb",
OpCapture: "cap",
OpStar: "star",
OpPlus: "plus",
OpQuest: "que",
OpRepeat: "rep",
OpConcat: "cat",
OpAlternate: "alt",
}
// dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
// It is used during testing to distinguish between parses that might print
// the same using re's String method.
func dumpRegexp(b *strings.Builder, re *Regexp) {
if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
fmt.Fprintf(b, "op%d", re.Op)
} else {
switch re.Op {
default:
b.WriteString(opNames[re.Op])
case OpStar, OpPlus, OpQuest, OpRepeat:
if re.Flags&NonGreedy != 0 {
b.WriteByte('n')
}
b.WriteString(opNames[re.Op])
case OpLiteral:
if len(re.Rune) > 1 {
b.WriteString("str")
} else {
b.WriteString("lit")
}
if re.Flags&FoldCase != 0 {
for _, r := range re.Rune {
if unicode.SimpleFold(r) != r {
b.WriteString("fold")
break
}
}
}
}
}
b.WriteByte('{')
switch re.Op {
case OpEndText:
if re.Flags&WasDollar == 0 {
b.WriteString(`\z`)
}
case OpLiteral:
for _, r := range re.Rune {
b.WriteRune(r)
}
case OpConcat, OpAlternate:
for _, sub := range re.Sub {
dumpRegexp(b, sub)
}
case OpStar, OpPlus, OpQuest:
dumpRegexp(b, re.Sub[0])
case OpRepeat:
fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
dumpRegexp(b, re.Sub[0])
case OpCapture:
if re.Name != "" {
b.WriteString(re.Name)
b.WriteByte(':')
}
dumpRegexp(b, re.Sub[0])
case OpCharClass:
sep := ""
for i := 0; i < len(re.Rune); i += 2 {
b.WriteString(sep)
sep = " "
lo, hi := re.Rune[i], re.Rune[i+1]
if lo == hi {
fmt.Fprintf(b, "%#x", lo)
} else {
fmt.Fprintf(b, "%#x-%#x", lo, hi)
}
}
}
b.WriteByte('}')
}
func mkCharClass(f func(rune) bool) string {
re := &Regexp{Op: OpCharClass}
lo := rune(-1)
for i := rune(0); i <= unicode.MaxRune; i++ {
if f(i) {
if lo < 0 {
lo = i
}
} else {
if lo >= 0 {
re.Rune = append(re.Rune, lo, i-1)
lo = -1
}
}
}
if lo >= 0 {
re.Rune = append(re.Rune, lo, unicode.MaxRune)
}
return dump(re)
}
func isUpperFold(r rune) bool {
if unicode.IsUpper(r) {
return true
}
c := unicode.SimpleFold(r)
for c != r {
if unicode.IsUpper(c) {
return true
}
c = unicode.SimpleFold(c)
}
return false
}
func TestFoldConstants(t *testing.T) {
last := rune(-1)
for i := rune(0); i <= unicode.MaxRune; i++ {
if unicode.SimpleFold(i) == i {
continue
}
if last == -1 && minFold != i {
t.Errorf("minFold=%#U should be %#U", minFold, i)
}
last = i
}
if maxFold != last {
t.Errorf("maxFold=%#U should be %#U", maxFold, last)
}
}
func TestAppendRangeCollapse(t *testing.T) {
// AppendRange should collapse each of the new ranges
// into the earlier ones (it looks back two ranges), so that
// the slice never grows very large.
// Note that we are not calling cleanClass.
var r []rune
for i := rune('A'); i <= 'Z'; i++ {
r = appendRange(r, i, i)
r = appendRange(r, i+'a'-'A', i+'a'-'A')
}
if string(r) != "AZaz" {
t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
}
}
var invalidRegexps = []string{
`(`,
`)`,
`(a`,
`a)`,
`(a))`,
`(a|b|`,
`a|b|)`,
`(a|b|))`,
`(a|b`,
`a|b)`,
`(a|b))`,
`[a-z`,
`([a-z)`,
`[a-z)`,
`([a-z]))`,
`x{1001}`,
`x{9876543210}`,
`x{2,1}`,
`x{1,9876543210}`,
"\xff", // Invalid UTF-8
"[\xff]",
"[\\\xff]",
"\\\xff",
`(?P<name>a`,
`(?P<name>`,
`(?P<name`,
`(?P<x y>a)`,
`(?P<>a)`,
`(?<name>a`,
`(?<name>`,
`(?<name`,
`(?<x y>a)`,
`(?<>a)`,
`[a-Z]`,
`(?i)[a-Z]`,
`\Q\E*`,
`a{100000}`, // too much repetition
`a{100000,}`, // too much repetition
"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", // too much repetition
strings.Repeat("(", 1000) + strings.Repeat(")", 1000), // too deep
strings.Repeat("(?:", 1000) + strings.Repeat(")*", 1000), // too deep
"(" + strings.Repeat("(xx?)", 1000) + "){1000}", // too long
strings.Repeat("(xx?){1000}", 1000), // too long
strings.Repeat(`\pL`, 27000), // too many runes
}
var onlyPerl = []string{
`[a-b-c]`,
`\Qabc\E`,
`\Q*+?{[\E`,
`\Q\\E`,
`\Q\\\E`,
`\Q\\\\E`,
`\Q\\\\\E`,
`(?:a)`,
`(?P<name>a)`,
}
var onlyPOSIX = []string{
"a++",
"a**",
"a?*",
"a+*",
"a{1}*",
".{1}{2}.{3}",
}
func TestParseInvalidRegexps(t *testing.T) {
for _, regexp := range invalidRegexps {
if re, err := Parse(regexp, Perl); err == nil {
t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
}
if re, err := Parse(regexp, POSIX); err == nil {
t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
}
}
for _, regexp := range onlyPerl {
if _, err := Parse(regexp, Perl); err != nil {
t.Errorf("Parse(%#q, Perl): %v", regexp, err)
}
if re, err := Parse(regexp, POSIX); err == nil {
t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
}
}
for _, regexp := range onlyPOSIX {
if re, err := Parse(regexp, Perl); err == nil {
t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
}
if _, err := Parse(regexp, POSIX); err != nil {
t.Errorf("Parse(%#q, POSIX): %v", regexp, err)
}
}
}
func TestToStringEquivalentParse(t *testing.T) {
for _, tt := range parseTests {
re, err := Parse(tt.Regexp, testFlags)
if err != nil {
t.Errorf("Parse(%#q): %v", tt.Regexp, err)
continue
}
if tt.Dump == "" {
// It parsed. That's all we care about.
continue
}
d := dump(re)
if d != tt.Dump {
t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
continue
}
s := re.String()
if s != tt.Regexp {
// If ToString didn't return the original regexp,
// it must have found one with fewer parens.
// Unfortunately we can't check the length here, because
// ToString produces "\\{" for a literal brace,
// but "{" is a shorter equivalent in some contexts.
nre, err := Parse(s, testFlags)
if err != nil {
t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err)
continue
}
nd := dump(nre)
if d != nd {
t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd)
}
ns := nre.String()
if s != ns {
t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns)
}
}
}
}
var stringTests = []struct {
re string
out string
}{
{`x(?i:ab*c|d?e)1`, `x(?i:AB*C|D?E)1`},
{`x(?i:ab*cd?e)1`, `x(?i:AB*CD?E)1`},
{`0(?i:ab*c|d?e)1`, `(?i:0(?:AB*C|D?E)1)`},
{`0(?i:ab*cd?e)1`, `(?i:0AB*CD?E1)`},
{`x(?i:ab*c|d?e)`, `x(?i:AB*C|D?E)`},
{`x(?i:ab*cd?e)`, `x(?i:AB*CD?E)`},
{`0(?i:ab*c|d?e)`, `(?i:0(?:AB*C|D?E))`},
{`0(?i:ab*cd?e)`, `(?i:0AB*CD?E)`},
{`(?i:ab*c|d?e)1`, `(?i:(?:AB*C|D?E)1)`},
{`(?i:ab*cd?e)1`, `(?i:AB*CD?E1)`},
{`(?i:ab)[123](?i:cd)`, `(?i:AB[1-3]CD)`},
{`(?i:ab*c|d?e)`, `(?i:AB*C|D?E)`},
{`[Aa][Bb]`, `(?i:AB)`},
{`[Aa][Bb]*[Cc]`, `(?i:AB*C)`},
{`A(?:[Bb][Cc]|[Dd])[Zz]`, `A(?i:(?:BC|D)Z)`},
{`[Aa](?:[Bb][Cc]|[Dd])Z`, `(?i:A(?:BC|D))Z`},
}
func TestString(t *testing.T) {
for _, tt := range stringTests {
re, err := Parse(tt.re, Perl)
if err != nil {
t.Errorf("Parse(%#q): %v", tt.re, err)
continue
}
out := re.String()
if out != tt.out {
t.Errorf("Parse(%#q).String() = %#q, want %#q", tt.re, out, tt.out)
}
}
}

View File

@@ -0,0 +1,133 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by make_perl_groups.pl; DO NOT EDIT.
package syntax
var code1 = []rune{ /* \d */
0x30, 0x39,
}
var code2 = []rune{ /* \s */
0x9, 0xa,
0xc, 0xd,
0x20, 0x20,
}
var code3 = []rune{ /* \w */
0x30, 0x39,
0x41, 0x5a,
0x5f, 0x5f,
0x61, 0x7a,
}
var perlGroup = map[string]charGroup{
`\d`: {+1, code1},
`\D`: {-1, code1},
`\s`: {+1, code2},
`\S`: {-1, code2},
`\w`: {+1, code3},
`\W`: {-1, code3},
}
var code4 = []rune{ /* [:alnum:] */
0x30, 0x39,
0x41, 0x5a,
0x61, 0x7a,
}
var code5 = []rune{ /* [:alpha:] */
0x41, 0x5a,
0x61, 0x7a,
}
var code6 = []rune{ /* [:ascii:] */
0x0, 0x7f,
}
var code7 = []rune{ /* [:blank:] */
0x9, 0x9,
0x20, 0x20,
}
var code8 = []rune{ /* [:cntrl:] */
0x0, 0x1f,
0x7f, 0x7f,
}
var code9 = []rune{ /* [:digit:] */
0x30, 0x39,
}
var code10 = []rune{ /* [:graph:] */
0x21, 0x7e,
}
var code11 = []rune{ /* [:lower:] */
0x61, 0x7a,
}
var code12 = []rune{ /* [:print:] */
0x20, 0x7e,
}
var code13 = []rune{ /* [:punct:] */
0x21, 0x2f,
0x3a, 0x40,
0x5b, 0x60,
0x7b, 0x7e,
}
var code14 = []rune{ /* [:space:] */
0x9, 0xd,
0x20, 0x20,
}
var code15 = []rune{ /* [:upper:] */
0x41, 0x5a,
}
var code16 = []rune{ /* [:word:] */
0x30, 0x39,
0x41, 0x5a,
0x5f, 0x5f,
0x61, 0x7a,
}
var code17 = []rune{ /* [:xdigit:] */
0x30, 0x39,
0x41, 0x46,
0x61, 0x66,
}
var posixGroup = map[string]charGroup{
`[:alnum:]`: {+1, code4},
`[:^alnum:]`: {-1, code4},
`[:alpha:]`: {+1, code5},
`[:^alpha:]`: {-1, code5},
`[:ascii:]`: {+1, code6},
`[:^ascii:]`: {-1, code6},
`[:blank:]`: {+1, code7},
`[:^blank:]`: {-1, code7},
`[:cntrl:]`: {+1, code8},
`[:^cntrl:]`: {-1, code8},
`[:digit:]`: {+1, code9},
`[:^digit:]`: {-1, code9},
`[:graph:]`: {+1, code10},
`[:^graph:]`: {-1, code10},
`[:lower:]`: {+1, code11},
`[:^lower:]`: {-1, code11},
`[:print:]`: {+1, code12},
`[:^print:]`: {-1, code12},
`[:punct:]`: {+1, code13},
`[:^punct:]`: {-1, code13},
`[:space:]`: {+1, code14},
`[:^space:]`: {-1, code14},
`[:upper:]`: {+1, code15},
`[:^upper:]`: {-1, code15},
`[:word:]`: {+1, code16},
`[:^word:]`: {-1, code16},
`[:xdigit:]`: {+1, code17},
`[:^xdigit:]`: {-1, code17},
}

349
src/regexp/syntax/prog.go Normal file
View File

@@ -0,0 +1,349 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import (
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
// Compiled program.
// May not belong in this package, but convenient for now.
// A Prog is a compiled regular expression program.
type Prog struct {
Inst []Inst
Start int // index of start instruction
NumCap int // number of InstCapture insts in re
}
// An InstOp is an instruction opcode.
type InstOp uint8
const (
InstAlt InstOp = iota
InstAltMatch
InstCapture
InstEmptyWidth
InstMatch
InstFail
InstNop
InstRune
InstRune1
InstRuneAny
InstRuneAnyNotNL
)
var instOpNames = []string{
"InstAlt",
"InstAltMatch",
"InstCapture",
"InstEmptyWidth",
"InstMatch",
"InstFail",
"InstNop",
"InstRune",
"InstRune1",
"InstRuneAny",
"InstRuneAnyNotNL",
}
func (i InstOp) String() string {
if uint(i) >= uint(len(instOpNames)) {
return ""
}
return instOpNames[i]
}
// An EmptyOp specifies a kind or mixture of zero-width assertions.
type EmptyOp uint8
const (
EmptyBeginLine EmptyOp = 1 << iota
EmptyEndLine
EmptyBeginText
EmptyEndText
EmptyWordBoundary
EmptyNoWordBoundary
)
// EmptyOpContext returns the zero-width assertions
// satisfied at the position between the runes r1 and r2.
// Passing r1 == -1 indicates that the position is
// at the beginning of the text.
// Passing r2 == -1 indicates that the position is
// at the end of the text.
func EmptyOpContext(r1, r2 rune) EmptyOp {
var op EmptyOp = EmptyNoWordBoundary
var boundary byte
switch {
case IsWordChar(r1):
boundary = 1
case r1 == '\n':
op |= EmptyBeginLine
case r1 < 0:
op |= EmptyBeginText | EmptyBeginLine
}
switch {
case IsWordChar(r2):
boundary ^= 1
case r2 == '\n':
op |= EmptyEndLine
case r2 < 0:
op |= EmptyEndText | EmptyEndLine
}
if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
}
return op
}
// IsWordChar reports whether r is considered a “word character”
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) bool {
// Test for lowercase letters first, as these occur more
// frequently than uppercase letters in common cases.
return 'a' <= r && r <= 'z' || 'A' <= r && r <= 'Z' || '0' <= r && r <= '9' || r == '_'
}
// An Inst is a single instruction in a regular expression program.
type Inst struct {
Op InstOp
Out uint32 // all but InstMatch, InstFail
Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
Rune []rune
}
func (p *Prog) String() string {
var b strings.Builder
dumpProg(&b, p)
return b.String()
}
// skipNop follows any no-op or capturing instructions.
func (p *Prog) skipNop(pc uint32) *Inst {
i := &p.Inst[pc]
for i.Op == InstNop || i.Op == InstCapture {
i = &p.Inst[i.Out]
}
return i
}
// op returns i.Op but merges all the Rune special cases into InstRune
func (i *Inst) op() InstOp {
op := i.Op
switch op {
case InstRune1, InstRuneAny, InstRuneAnyNotNL:
op = InstRune
}
return op
}
// Prefix returns a literal string that all matches for the
// regexp must start with. Complete is true if the prefix
// is the entire match.
func (p *Prog) Prefix() (prefix string, complete bool) {
i := p.skipNop(uint32(p.Start))
// Avoid allocation of buffer if prefix is empty.
if i.op() != InstRune || len(i.Rune) != 1 {
return "", i.Op == InstMatch
}
// Have prefix; gather characters.
var buf strings.Builder
for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError {
buf.WriteRune(i.Rune[0])
i = p.skipNop(i.Out)
}
return buf.String(), i.Op == InstMatch
}
// StartCond returns the leading empty-width conditions that must
// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
func (p *Prog) StartCond() EmptyOp {
var flag EmptyOp
pc := uint32(p.Start)
i := &p.Inst[pc]
Loop:
for {
switch i.Op {
case InstEmptyWidth:
flag |= EmptyOp(i.Arg)
case InstFail:
return ^EmptyOp(0)
case InstCapture, InstNop:
// skip
default:
break Loop
}
pc = i.Out
i = &p.Inst[pc]
}
return flag
}
const noMatch = -1
// MatchRune reports whether the instruction matches (and consumes) r.
// It should only be called when i.Op == [InstRune].
func (i *Inst) MatchRune(r rune) bool {
return i.MatchRunePos(r) != noMatch
}
// MatchRunePos checks whether the instruction matches (and consumes) r.
// If so, MatchRunePos returns the index of the matching rune pair
// (or, when len(i.Rune) == 1, rune singleton).
// If not, MatchRunePos returns -1.
// MatchRunePos should only be called when i.Op == [InstRune].
func (i *Inst) MatchRunePos(r rune) int {
rune := i.Rune
switch len(rune) {
case 0:
return noMatch
case 1:
// Special case: single-rune slice is from literal string, not char class.
r0 := rune[0]
if r == r0 {
return 0
}
if Flags(i.Arg)&FoldCase != 0 {
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
if r == r1 {
return 0
}
}
}
return noMatch
case 2:
if r >= rune[0] && r <= rune[1] {
return 0
}
return noMatch
case 4, 6, 8:
// Linear search for a few pairs.
// Should handle ASCII well.
for j := 0; j < len(rune); j += 2 {
if r < rune[j] {
return noMatch
}
if r <= rune[j+1] {
return j / 2
}
}
return noMatch
}
// Otherwise binary search.
lo := 0
hi := len(rune) / 2
for lo < hi {
m := int(uint(lo+hi) >> 1)
if c := rune[2*m]; c <= r {
if r <= rune[2*m+1] {
return m
}
lo = m + 1
} else {
hi = m
}
}
return noMatch
}
// MatchEmptyWidth reports whether the instruction matches
// an empty string between the runes before and after.
// It should only be called when i.Op == [InstEmptyWidth].
func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
switch EmptyOp(i.Arg) {
case EmptyBeginLine:
return before == '\n' || before == -1
case EmptyEndLine:
return after == '\n' || after == -1
case EmptyBeginText:
return before == -1
case EmptyEndText:
return after == -1
case EmptyWordBoundary:
return IsWordChar(before) != IsWordChar(after)
case EmptyNoWordBoundary:
return IsWordChar(before) == IsWordChar(after)
}
panic("unknown empty width arg")
}
func (i *Inst) String() string {
var b strings.Builder
dumpInst(&b, i)
return b.String()
}
func bw(b *strings.Builder, args ...string) {
for _, s := range args {
b.WriteString(s)
}
}
func dumpProg(b *strings.Builder, p *Prog) {
for j := range p.Inst {
i := &p.Inst[j]
pc := strconv.Itoa(j)
if len(pc) < 3 {
b.WriteString(" "[len(pc):])
}
if j == p.Start {
pc += "*"
}
bw(b, pc, "\t")
dumpInst(b, i)
bw(b, "\n")
}
}
func u32(i uint32) string {
return strconv.FormatUint(uint64(i), 10)
}
func dumpInst(b *strings.Builder, i *Inst) {
switch i.Op {
case InstAlt:
bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
case InstAltMatch:
bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
case InstCapture:
bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
case InstEmptyWidth:
bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
case InstMatch:
bw(b, "match")
case InstFail:
bw(b, "fail")
case InstNop:
bw(b, "nop -> ", u32(i.Out))
case InstRune:
if i.Rune == nil {
// shouldn't happen
bw(b, "rune <nil>")
}
bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
if Flags(i.Arg)&FoldCase != 0 {
bw(b, "/i")
}
bw(b, " -> ", u32(i.Out))
case InstRune1:
bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
case InstRuneAny:
bw(b, "any -> ", u32(i.Out))
case InstRuneAnyNotNL:
bw(b, "anynotnl -> ", u32(i.Out))
}
}

View File

@@ -0,0 +1,144 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import "testing"
var compileTests = []struct {
Regexp string
Prog string
}{
{"a", ` 0 fail
1* rune1 "a" -> 2
2 match
`},
{"[A-M][n-z]", ` 0 fail
1* rune "AM" -> 2
2 rune "nz" -> 3
3 match
`},
{"", ` 0 fail
1* nop -> 2
2 match
`},
{"a?", ` 0 fail
1 rune1 "a" -> 3
2* alt -> 1, 3
3 match
`},
{"a??", ` 0 fail
1 rune1 "a" -> 3
2* alt -> 3, 1
3 match
`},
{"a+", ` 0 fail
1* rune1 "a" -> 2
2 alt -> 1, 3
3 match
`},
{"a+?", ` 0 fail
1* rune1 "a" -> 2
2 alt -> 3, 1
3 match
`},
{"a*", ` 0 fail
1 rune1 "a" -> 2
2* alt -> 1, 3
3 match
`},
{"a*?", ` 0 fail
1 rune1 "a" -> 2
2* alt -> 3, 1
3 match
`},
{"a+b+", ` 0 fail
1* rune1 "a" -> 2
2 alt -> 1, 3
3 rune1 "b" -> 4
4 alt -> 3, 5
5 match
`},
{"(a+)(b+)", ` 0 fail
1* cap 2 -> 2
2 rune1 "a" -> 3
3 alt -> 2, 4
4 cap 3 -> 5
5 cap 4 -> 6
6 rune1 "b" -> 7
7 alt -> 6, 8
8 cap 5 -> 9
9 match
`},
{"a+|b+", ` 0 fail
1 rune1 "a" -> 2
2 alt -> 1, 6
3 rune1 "b" -> 4
4 alt -> 3, 6
5* alt -> 1, 3
6 match
`},
{"A[Aa]", ` 0 fail
1* rune1 "A" -> 2
2 rune "A"/i -> 3
3 match
`},
{"(?:(?:^).)", ` 0 fail
1* empty 4 -> 2
2 anynotnl -> 3
3 match
`},
{"(?:|a)+", ` 0 fail
1 nop -> 4
2 rune1 "a" -> 4
3* alt -> 1, 2
4 alt -> 3, 5
5 match
`},
{"(?:|a)*", ` 0 fail
1 nop -> 4
2 rune1 "a" -> 4
3 alt -> 1, 2
4 alt -> 3, 6
5* alt -> 3, 6
6 match
`},
}
func TestCompile(t *testing.T) {
for _, tt := range compileTests {
re, _ := Parse(tt.Regexp, Perl)
p, _ := Compile(re)
s := p.String()
if s != tt.Prog {
t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog)
}
}
}
func BenchmarkEmptyOpContext(b *testing.B) {
for i := 0; i < b.N; i++ {
var r1 rune = -1
for _, r2 := range "foo, bar, baz\nsome input text.\n" {
EmptyOpContext(r1, r2)
r1 = r2
}
EmptyOpContext(r1, -1)
}
}
var sink any
func BenchmarkIsWordChar(b *testing.B) {
const chars = "Don't communicate by sharing memory, share memory by communicating."
for i := 0; i < b.N; i++ {
for _, r := range chars {
sink = IsWordChar(r)
}
}
if sink == nil {
b.Fatal("Benchmark did not run")
}
sink = nil
}

464
src/regexp/syntax/regexp.go Normal file
View File

@@ -0,0 +1,464 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Note to implementers:
// In this package, re is always a *Regexp and r is always a rune.
import (
"slices"
"strconv"
"strings"
"unicode"
)
// A Regexp is a node in a regular expression syntax tree.
type Regexp struct {
Op Op // operator
Flags Flags
Sub []*Regexp // subexpressions, if any
Sub0 [1]*Regexp // storage for short Sub
Rune []rune // matched runes, for OpLiteral, OpCharClass
Rune0 [2]rune // storage for short Rune
Min, Max int // min, max for OpRepeat
Cap int // capturing index, for OpCapture
Name string // capturing name, for OpCapture
}
//go:generate stringer -type Op -trimprefix Op
// An Op is a single regular expression operator.
type Op uint8
// Operators are listed in precedence order, tightest binding to weakest.
// Character class operators are listed simplest to most complex
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
const (
OpNoMatch Op = 1 + iota // matches no strings
OpEmptyMatch // matches empty string
OpLiteral // matches Runes sequence
OpCharClass // matches Runes interpreted as range pair list
OpAnyCharNotNL // matches any character except newline
OpAnyChar // matches any character
OpBeginLine // matches empty string at beginning of line
OpEndLine // matches empty string at end of line
OpBeginText // matches empty string at beginning of text
OpEndText // matches empty string at end of text
OpWordBoundary // matches word boundary `\b`
OpNoWordBoundary // matches word non-boundary `\B`
OpCapture // capturing subexpression with index Cap, optional name Name
OpStar // matches Sub[0] zero or more times
OpPlus // matches Sub[0] one or more times
OpQuest // matches Sub[0] zero or one times
OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
OpConcat // matches concatenation of Subs
OpAlternate // matches alternation of Subs
)
const opPseudo Op = 128 // where pseudo-ops start
// Equal reports whether x and y have identical structure.
func (x *Regexp) Equal(y *Regexp) bool {
if x == nil || y == nil {
return x == y
}
if x.Op != y.Op {
return false
}
switch x.Op {
case OpEndText:
// The parse flags remember whether this is \z or \Z.
if x.Flags&WasDollar != y.Flags&WasDollar {
return false
}
case OpLiteral, OpCharClass:
return slices.Equal(x.Rune, y.Rune)
case OpAlternate, OpConcat:
return slices.EqualFunc(x.Sub, y.Sub, (*Regexp).Equal)
case OpStar, OpPlus, OpQuest:
if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
case OpRepeat:
if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
case OpCapture:
if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
}
return true
}
// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
type printFlags uint8
const (
flagI printFlags = 1 << iota // (?i:
flagM // (?m:
flagS // (?s:
flagOff // )
flagPrec // (?: )
negShift = 5 // flagI<<negShift is (?-i:
)
// addSpan enables the flags f around start..last,
// by setting flags[start] = f and flags[last] = flagOff.
func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
if *flags == nil {
*flags = make(map[*Regexp]printFlags)
}
(*flags)[start] = f
(*flags)[last] |= flagOff // maybe start==last
}
// calcFlags calculates the flags to print around each subexpression in re,
// storing that information in (*flags)[sub] for each affected subexpression.
// The first time an entry needs to be written to *flags, calcFlags allocates the map.
// calcFlags also calculates the flags that must be active or can't be active
// around re and returns those flags.
func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
switch re.Op {
default:
return 0, 0
case OpLiteral:
// If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
// according to whether (?i) is active.
// If literal is not fold-sensitive, return 0, 0.
for _, r := range re.Rune {
if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
if re.Flags&FoldCase != 0 {
return flagI, 0
} else {
return 0, flagI
}
}
}
return 0, 0
case OpCharClass:
// If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
// If literal is not fold-sensitive, return 0, 0.
for i := 0; i < len(re.Rune); i += 2 {
lo := max(minFold, re.Rune[i])
hi := min(maxFold, re.Rune[i+1])
for r := lo; r <= hi; r++ {
for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
return 0, flagI
}
}
}
}
return 0, 0
case OpAnyCharNotNL: // (?-s).
return 0, flagS
case OpAnyChar: // (?s).
return flagS, 0
case OpBeginLine, OpEndLine: // (?m)^ (?m)$
return flagM, 0
case OpEndText:
if re.Flags&WasDollar != 0 { // (?-m)$
return 0, flagM
}
return 0, 0
case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
return calcFlags(re.Sub[0], flags)
case OpConcat, OpAlternate:
// Gather the must and cant for each subexpression.
// When we find a conflicting subexpression, insert the necessary
// flags around the previously identified span and start over.
var must, cant, allCant printFlags
start := 0
last := 0
did := false
for i, sub := range re.Sub {
subMust, subCant := calcFlags(sub, flags)
if must&subCant != 0 || subMust&cant != 0 {
if must != 0 {
addSpan(re.Sub[start], re.Sub[last], must, flags)
}
must = 0
cant = 0
start = i
did = true
}
must |= subMust
cant |= subCant
allCant |= subCant
if subMust != 0 {
last = i
}
if must == 0 && start == i {
start++
}
}
if !did {
// No conflicts: pass the accumulated must and cant upward.
return must, cant
}
if must != 0 {
// Conflicts found; need to finish final span.
addSpan(re.Sub[start], re.Sub[last], must, flags)
}
return 0, allCant
}
}
// writeRegexp writes the Perl syntax for the regular expression re to b.
func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
f |= flags[re]
if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
// flagPrec is redundant with other flags being added and terminated
f &^= flagPrec
}
if f&^(flagOff|flagPrec) != 0 {
b.WriteString(`(?`)
if f&flagI != 0 {
b.WriteString(`i`)
}
if f&flagM != 0 {
b.WriteString(`m`)
}
if f&flagS != 0 {
b.WriteString(`s`)
}
if f&((flagM|flagS)<<negShift) != 0 {
b.WriteString(`-`)
if f&(flagM<<negShift) != 0 {
b.WriteString(`m`)
}
if f&(flagS<<negShift) != 0 {
b.WriteString(`s`)
}
}
b.WriteString(`:`)
}
if f&flagOff != 0 {
defer b.WriteString(`)`)
}
if f&flagPrec != 0 {
b.WriteString(`(?:`)
defer b.WriteString(`)`)
}
switch re.Op {
default:
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
case OpNoMatch:
b.WriteString(`[^\x00-\x{10FFFF}]`)
case OpEmptyMatch:
b.WriteString(`(?:)`)
case OpLiteral:
for _, r := range re.Rune {
escape(b, r, false)
}
case OpCharClass:
if len(re.Rune)%2 != 0 {
b.WriteString(`[invalid char class]`)
break
}
b.WriteRune('[')
if len(re.Rune) == 0 {
b.WriteString(`^\x00-\x{10FFFF}`)
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
// Contains 0 and MaxRune. Probably a negated class.
// Print the gaps.
b.WriteRune('^')
for i := 1; i < len(re.Rune)-1; i += 2 {
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
escape(b, lo, lo == '-')
if lo != hi {
if hi != lo+1 {
b.WriteRune('-')
}
escape(b, hi, hi == '-')
}
}
} else {
for i := 0; i < len(re.Rune); i += 2 {
lo, hi := re.Rune[i], re.Rune[i+1]
escape(b, lo, lo == '-')
if lo != hi {
if hi != lo+1 {
b.WriteRune('-')
}
escape(b, hi, hi == '-')
}
}
}
b.WriteRune(']')
case OpAnyCharNotNL, OpAnyChar:
b.WriteString(`.`)
case OpBeginLine:
b.WriteString(`^`)
case OpEndLine:
b.WriteString(`$`)
case OpBeginText:
b.WriteString(`\A`)
case OpEndText:
if re.Flags&WasDollar != 0 {
b.WriteString(`$`)
} else {
b.WriteString(`\z`)
}
case OpWordBoundary:
b.WriteString(`\b`)
case OpNoWordBoundary:
b.WriteString(`\B`)
case OpCapture:
if re.Name != "" {
b.WriteString(`(?P<`)
b.WriteString(re.Name)
b.WriteRune('>')
} else {
b.WriteRune('(')
}
if re.Sub[0].Op != OpEmptyMatch {
writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
}
b.WriteRune(')')
case OpStar, OpPlus, OpQuest, OpRepeat:
p := printFlags(0)
sub := re.Sub[0]
if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
p = flagPrec
}
writeRegexp(b, sub, p, flags)
switch re.Op {
case OpStar:
b.WriteRune('*')
case OpPlus:
b.WriteRune('+')
case OpQuest:
b.WriteRune('?')
case OpRepeat:
b.WriteRune('{')
b.WriteString(strconv.Itoa(re.Min))
if re.Max != re.Min {
b.WriteRune(',')
if re.Max >= 0 {
b.WriteString(strconv.Itoa(re.Max))
}
}
b.WriteRune('}')
}
if re.Flags&NonGreedy != 0 {
b.WriteRune('?')
}
case OpConcat:
for _, sub := range re.Sub {
p := printFlags(0)
if sub.Op == OpAlternate {
p = flagPrec
}
writeRegexp(b, sub, p, flags)
}
case OpAlternate:
for i, sub := range re.Sub {
if i > 0 {
b.WriteRune('|')
}
writeRegexp(b, sub, 0, flags)
}
}
}
func (re *Regexp) String() string {
var b strings.Builder
var flags map[*Regexp]printFlags
must, cant := calcFlags(re, &flags)
must |= (cant &^ flagI) << negShift
if must != 0 {
must |= flagOff
}
writeRegexp(&b, re, must, flags)
return b.String()
}
const meta = `\.+*?()|[]{}^$`
func escape(b *strings.Builder, r rune, force bool) {
if unicode.IsPrint(r) {
if strings.ContainsRune(meta, r) || force {
b.WriteRune('\\')
}
b.WriteRune(r)
return
}
switch r {
case '\a':
b.WriteString(`\a`)
case '\f':
b.WriteString(`\f`)
case '\n':
b.WriteString(`\n`)
case '\r':
b.WriteString(`\r`)
case '\t':
b.WriteString(`\t`)
case '\v':
b.WriteString(`\v`)
default:
if r < 0x100 {
b.WriteString(`\x`)
s := strconv.FormatInt(int64(r), 16)
if len(s) == 1 {
b.WriteRune('0')
}
b.WriteString(s)
break
}
b.WriteString(`\x{`)
b.WriteString(strconv.FormatInt(int64(r), 16))
b.WriteString(`}`)
}
}
// MaxCap walks the regexp to find the maximum capture index.
func (re *Regexp) MaxCap() int {
m := 0
if re.Op == OpCapture {
m = re.Cap
}
for _, sub := range re.Sub {
if n := sub.MaxCap(); m < n {
m = n
}
}
return m
}
// CapNames walks the regexp to find the names of capturing groups.
func (re *Regexp) CapNames() []string {
names := make([]string, re.MaxCap()+1)
re.capNames(names)
return names
}
func (re *Regexp) capNames(names []string) {
if re.Op == OpCapture {
names[re.Cap] = re.Name
}
for _, sub := range re.Sub {
sub.capNames(names)
}
}

View File

@@ -0,0 +1,151 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Simplify returns a regexp equivalent to re but without counted repetitions
// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
// The resulting regexp will execute correctly but its string representation
// will not produce the same parse tree, because capturing parentheses
// may have been duplicated or removed. For example, the simplified form
// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
// The returned regexp may share structure with or be the original.
func (re *Regexp) Simplify() *Regexp {
if re == nil {
return nil
}
switch re.Op {
case OpCapture, OpConcat, OpAlternate:
// Simplify children, building new Regexp if children change.
nre := re
for i, sub := range re.Sub {
nsub := sub.Simplify()
if nre == re && nsub != sub {
// Start a copy.
nre = new(Regexp)
*nre = *re
nre.Rune = nil
nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
}
if nre != re {
nre.Sub = append(nre.Sub, nsub)
}
}
return nre
case OpStar, OpPlus, OpQuest:
sub := re.Sub[0].Simplify()
return simplify1(re.Op, re.Flags, sub, re)
case OpRepeat:
// Special special case: x{0} matches the empty string
// and doesn't even need to consider x.
if re.Min == 0 && re.Max == 0 {
return &Regexp{Op: OpEmptyMatch}
}
// The fun begins.
sub := re.Sub[0].Simplify()
// x{n,} means at least n matches of x.
if re.Max == -1 {
// Special case: x{0,} is x*.
if re.Min == 0 {
return simplify1(OpStar, re.Flags, sub, nil)
}
// Special case: x{1,} is x+.
if re.Min == 1 {
return simplify1(OpPlus, re.Flags, sub, nil)
}
// General case: x{4,} is xxxx+.
nre := &Regexp{Op: OpConcat}
nre.Sub = nre.Sub0[:0]
for i := 0; i < re.Min-1; i++ {
nre.Sub = append(nre.Sub, sub)
}
nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
return nre
}
// Special case x{0} handled above.
// Special case: x{1} is just x.
if re.Min == 1 && re.Max == 1 {
return sub
}
// General case: x{n,m} means n copies of x and m copies of x?
// The machine will do less work if we nest the final m copies,
// so that x{2,5} = xx(x(x(x)?)?)?
// Build leading prefix: xx.
var prefix *Regexp
if re.Min > 0 {
prefix = &Regexp{Op: OpConcat}
prefix.Sub = prefix.Sub0[:0]
for i := 0; i < re.Min; i++ {
prefix.Sub = append(prefix.Sub, sub)
}
}
// Build and attach suffix: (x(x(x)?)?)?
if re.Max > re.Min {
suffix := simplify1(OpQuest, re.Flags, sub, nil)
for i := re.Min + 1; i < re.Max; i++ {
nre2 := &Regexp{Op: OpConcat}
nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
suffix = simplify1(OpQuest, re.Flags, nre2, nil)
}
if prefix == nil {
return suffix
}
prefix.Sub = append(prefix.Sub, suffix)
}
if prefix != nil {
return prefix
}
// Some degenerate case like min > max or min < max < 0.
// Handle as impossible match.
return &Regexp{Op: OpNoMatch}
}
return re
}
// simplify1 implements Simplify for the unary OpStar,
// OpPlus, and OpQuest operators. It returns the simple regexp
// equivalent to
//
// Regexp{Op: op, Flags: flags, Sub: {sub}}
//
// under the assumption that sub is already simple, and
// without first allocating that structure. If the regexp
// to be returned turns out to be equivalent to re, simplify1
// returns re instead.
//
// simplify1 is factored out of Simplify because the implementation
// for other operators generates these unary expressions.
// Letting them call simplify1 makes sure the expressions they
// generate are simple.
func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if sub.Op == OpEmptyMatch {
return sub
}
// The operators are idempotent if the flags match.
if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy {
return sub
}
if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] {
return re
}
re = &Regexp{Op: op, Flags: flags}
re.Sub = append(re.Sub0[:0], sub)
return re
}

View File

@@ -0,0 +1,153 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import "testing"
var simplifyTests = []struct {
Regexp string
Simple string
}{
// Already-simple constructs
{`a`, `a`},
{`ab`, `ab`},
{`a|b`, `[ab]`},
{`ab|cd`, `ab|cd`},
{`(ab)*`, `(ab)*`},
{`(ab)+`, `(ab)+`},
{`(ab)?`, `(ab)?`},
{`.`, `(?s:.)`},
{`^`, `(?m:^)`},
{`$`, `(?m:$)`},
{`[ac]`, `[ac]`},
{`[^ac]`, `[^ac]`},
// Posix character classes
{`[[:alnum:]]`, `[0-9A-Za-z]`},
{`[[:alpha:]]`, `[A-Za-z]`},
{`[[:blank:]]`, `[\t ]`},
{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
{`[[:digit:]]`, `[0-9]`},
{`[[:graph:]]`, `[!-~]`},
{`[[:lower:]]`, `[a-z]`},
{`[[:print:]]`, `[ -~]`},
{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
{`[[:space:]]`, `[\t-\r ]`},
{`[[:upper:]]`, `[A-Z]`},
{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
// Perl character classes
{`\d`, `[0-9]`},
{`\s`, `[\t\n\f\r ]`},
{`\w`, `[0-9A-Z_a-z]`},
{`\D`, `[^0-9]`},
{`\S`, `[^\t\n\f\r ]`},
{`\W`, `[^0-9A-Z_a-z]`},
{`[\d]`, `[0-9]`},
{`[\s]`, `[\t\n\f\r ]`},
{`[\w]`, `[0-9A-Z_a-z]`},
{`[\D]`, `[^0-9]`},
{`[\S]`, `[^\t\n\f\r ]`},
{`[\W]`, `[^0-9A-Z_a-z]`},
// Posix repetitions
{`a{1}`, `a`},
{`a{2}`, `aa`},
{`a{5}`, `aaaaa`},
{`a{0,1}`, `a?`},
// The next three are illegible because Simplify inserts (?:)
// parens instead of () parens to avoid creating extra
// captured subexpressions. The comments show a version with fewer parens.
{`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)?
{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)?
{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
{`a{0,2}`, `(?:aa?)?`}, // (aa?)?
{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)?
{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)?
{`a{0,}`, `a*`},
{`a{1,}`, `a+`},
{`a{2,}`, `aa+`},
{`a{5,}`, `aaaaa+`},
// Test that operators simplify their arguments.
{`(?:a{1,}){1,}`, `a+`},
{`(a{1,}b{1,})`, `(a+b+)`},
{`a{1,}|b{1,}`, `a+|b+`},
{`(?:a{1,})*`, `(?:a+)*`},
{`(?:a{1,})+`, `a+`},
{`(?:a{1,})?`, `(?:a+)?`},
{``, `(?:)`},
{`a{0}`, `(?:)`},
// Character class simplification
{`[ab]`, `[ab]`},
{`[abc]`, `[a-c]`},
{`[a-za-za-z]`, `[a-z]`},
{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
{`[ABCDEFGH]`, `[A-H]`},
{`[AB-CD-EF-GH]`, `[A-H]`},
{`[W-ZP-XE-R]`, `[E-Z]`},
{`[a-ee-gg-m]`, `[a-m]`},
{`[a-ea-ha-m]`, `[a-m]`},
{`[a-ma-ha-e]`, `[a-m]`},
{`[a-zA-Z0-9 -~]`, `[ -~]`},
// Empty character classes
{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
// Full character classes
{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
// Unicode case folding.
{`(?i)A`, `(?i:A)`},
{`(?i)a`, `(?i:A)`},
{`(?i)[A]`, `(?i:A)`},
{`(?i)[a]`, `(?i:A)`},
{`(?i)K`, `(?i:K)`},
{`(?i)k`, `(?i:K)`},
{`(?i)\x{212a}`, "(?i:K)"},
{`(?i)[K]`, "[Kk\u212A]"},
{`(?i)[k]`, "[Kk\u212A]"},
{`(?i)[\x{212a}]`, "[Kk\u212A]"},
{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
// Empty string as a regular expression.
// The empty string must be preserved inside parens in order
// to make submatches work right, so these tests are less
// interesting than they might otherwise be. String inserts
// explicit (?:) in place of non-parenthesized empty strings,
// to make them easier to spot for other parsers.
{`(a|b|c|)`, `([a-c]|(?:))`},
{`(a|b|)`, `([ab]|(?:))`},
{`(|)`, `()`},
{`a()`, `a()`},
{`(()|())`, `(()|())`},
{`(a|)`, `(a|(?:))`},
{`ab()cd()`, `ab()cd()`},
{`()`, `()`},
{`()*`, `()*`},
{`()+`, `()+`},
{`()?`, `()?`},
{`(){0}`, `(?:)`},
{`(){1}`, `()`},
{`(){1,}`, `()+`},
{`(){0,2}`, `(?:()()?)?`},
}
func TestSimplify(t *testing.T) {
for _, tt := range simplifyTests {
re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
if err != nil {
t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
continue
}
s := re.Simplify().String()
if s != tt.Simple {
t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
}
}
}

24
src/regexp/testdata/README vendored Normal file
View File

@@ -0,0 +1,24 @@
AT&T POSIX Test Files
See textregex.c for copyright + license.
testregex.c http://www2.research.att.com/~gsf/testregex/testregex.c
basic.dat http://www2.research.att.com/~gsf/testregex/basic.dat
nullsubexpr.dat http://www2.research.att.com/~gsf/testregex/nullsubexpr.dat
repetition.dat http://www2.research.att.com/~gsf/testregex/repetition.dat
The test data has been edited to reflect RE2/Go differences:
* In a star of a possibly empty match like (a*)* matching x,
the no match case runs the starred subexpression zero times,
not once. This is consistent with (a*)* matching a, which
runs the starred subexpression one time, not twice.
* The submatch choice is first match, not the POSIX rule.
Such changes are marked with 'RE2/Go'.
RE2 Test Files
re2-exhaustive.txt.bz2 and re2-search.txt are built by running
'make log' in the RE2 distribution https://github.com/google/re2/
The exhaustive file is compressed because it is huge.

217
src/regexp/testdata/basic.dat vendored Normal file
View File

@@ -0,0 +1,217 @@
NOTE all standard compliant implementations should pass these : 2002-05-31
BE abracadabra$ abracadabracadabra (7,18)
BE a...b abababbb (2,7)
BE XXXXXX ..XXXXXX (2,8)
E \) () (1,2)
BE a] a]a (0,2)
B } } (0,1)
E \} } (0,1)
BE \] ] (0,1)
B ] ] (0,1)
E ] ] (0,1)
B { { (0,1)
B } } (0,1)
BE ^a ax (0,1)
BE \^a a^a (1,3)
BE a\^ a^ (0,2)
BE a$ aa (1,2)
BE a\$ a$ (0,2)
BE ^$ NULL (0,0)
E $^ NULL (0,0)
E a($) aa (1,2)(2,2)
E a*(^a) aa (0,1)(0,1)
E (..)*(...)* a (0,0)
E (..)*(...)* abcd (0,4)(2,4)
E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
E (ab)c|abc abc (0,3)(0,2)
E a{0}b ab (1,2)
E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
E a{9876543210} NULL BADBR
E ((a|a)|a) a (0,1)(0,1)(0,1)
E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
E a*(a.|aa) aaaa (0,4)(2,4)
E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
E (a|b)?.* b (0,1)(0,1)
E (a|b)c|a(b|c) ac (0,2)(0,1)
E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
E (a|b)*c|(a|ab)*c xc (1,2)
E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
E a?(ab|ba)ab abab (0,4)(0,2)
E a?(ac{0}b|ba)ab abab (0,4)(0,2)
E ab|abab abbabab (0,2)
E aba|bab|bba baaabbbaba (5,8)
E aba|bab baaabbbaba (6,9)
E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
E ab|a xabc (1,3)
E ab|a xxabc (2,4)
Ei (Ab|cD)* aBcD (0,4)(2,4)
BE [^-] --a (2,3)
BE [a-]* --a (0,3)
BE [a-m-]* --amoma-- (0,4)
E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
{E [[:upper:]] A (0,1) [[<element>]] not supported
E [[:lower:]]+ `az{ (1,3)
E [[:upper:]]+ @AZ[ (1,3)
# No collation in Go
#BE [[-]] [[-]] (2,4)
#BE [[.NIL.]] NULL ECOLLATE
#BE [[=aleph=]] NULL ECOLLATE
}
BE$ \n \n (0,1)
BEn$ \n \n (0,1)
BE$ [^a] \n (0,1)
BE$ \na \na (0,2)
E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
BE xxx xxx (0,3)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
BE$ .* \x01\xff (0,2)
E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
E a*a*a*a*a*b aaaaaaaaab (0,10)
BE ^ NULL (0,0)
BE $ NULL (0,0)
BE ^$ NULL (0,0)
BE ^a$ a (0,1)
BE abc abc (0,3)
BE abc xabcy (1,4)
BE abc ababc (2,5)
BE ab*c abc (0,3)
BE ab*bc abc (0,3)
BE ab*bc abbc (0,4)
BE ab*bc abbbbc (0,6)
E ab+bc abbc (0,4)
E ab+bc abbbbc (0,6)
E ab?bc abbc (0,4)
E ab?bc abc (0,3)
E ab?c abc (0,3)
BE ^abc$ abc (0,3)
BE ^abc abcc (0,3)
BE abc$ aabc (1,4)
BE ^ abc (0,0)
BE $ abc (3,3)
BE a.c abc (0,3)
BE a.c axc (0,3)
BE a.*c axyzc (0,5)
BE a[bc]d abd (0,3)
BE a[b-d]e ace (0,3)
BE a[b-d] aac (1,3)
BE a[-b] a- (0,2)
BE a[b-] a- (0,2)
BE a] a] (0,2)
BE a[]]b a]b (0,3)
BE a[^bc]d aed (0,3)
BE a[^-b]c adc (0,3)
BE a[^]b]c adc (0,3)
E ab|cd abc (0,2)
E ab|cd abcd (0,2)
E a\(b a(b (0,3)
E a\(*b ab (0,2)
E a\(*b a((b (0,4)
E ((a)) abc (0,1)(0,1)(0,1)
E (a)b(c) abc (0,3)(0,1)(2,3)
E a+b+c aabbabc (4,7)
E a* aaa (0,3)
E (a*)* - (0,0)(0,0)
E (a*)+ - (0,0)(0,0)
E (a*|b)* - (0,0)(0,0)
E (a+|b)* ab (0,2)(1,2)
E (a+|b)+ ab (0,2)(1,2)
E (a+|b)? ab (0,1)(0,1)
BE [^ab]* cde (0,3)
E (^)* - (0,0)(0,0)
BE a* NULL (0,0)
E ([abc])*d abbbcd (0,6)(4,5)
E ([abc])*bcd abcd (0,4)(0,1)
E a|b|c|d|e e (0,1)
E (a|b|c|d|e)f ef (0,2)(0,1)
E ((a*|b))* - (0,0)(0,0)(0,0)
BE abcd*efg abcdefg (0,7)
BE ab* xabyabbbz (1,3)
BE ab* xayabbbz (1,2)
E (ab|cd)e abcde (2,5)(2,4)
BE [abhgefdc]ij hij (0,3)
E (a|b)c*d abcd (1,4)(1,2)
E (ab|ab*)bc abc (0,3)(0,1)
E a([bc]*)c* abc (0,3)(1,3)
E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
E a[bcd]*dcdcde adcdcde (0,7)
E (ab|a)b*c abc (0,3)(0,2)
E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
E ^a(bc+|b[eh])g|.h$ abh (1,3)
E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
BE multiple words multiple words yeah (0,14)
E (.*)c(.*) abcde (0,5)(0,2)(3,5)
BE abcd abcd (0,4)
E a(bc)d abcd (0,4)(1,3)
E a[-]?c ac (0,3)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
E a+(b|c)*d+ aabcdd (0,6)(3,4)
E ^.+$ vivi (0,4)
E ^(.+)$ vivi (0,4)(0,4)
E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
E ((foo)|bar)!bas bar!bas (0,7)(0,3)
E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
E (foo|(bar))!bas foo!bas (0,7)(0,3)
E (foo|bar)!bas bar!bas (0,7)(0,3)
E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
E (foo|bar)!bas foo!bas (0,7)(0,3)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
E .*(/XXX).* /XXX (0,4)(0,4)
E .*(\\XXX).* \XXX (0,4)(0,4)
E \\XXX \XXX (0,4)
E .*(/000).* /000 (0,4)(0,4)
E .*(\\000).* \000 (0,4)(0,4)
E \\000 \000 (0,4)

73
src/regexp/testdata/nullsubexpr.dat vendored Normal file
View File

@@ -0,0 +1,73 @@
NOTE null subexpression matches : 2002-06-06
E (a*)* a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a+)* a (0,1)(0,1)
E SAME x (0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a+)+ a (0,1)(0,1)
E SAME x NOMATCH
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)* a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([^b]*)* a (0,1)(0,1)
E SAME b (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaab (0,6)(0,6)
E ([ab]*)* a (0,1)(0,1)
E SAME aaaaaa (0,6)(0,6)
E SAME ababab (0,6)(0,6)
E SAME bababa (0,6)(0,6)
E SAME b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
E SAME aaaabcde (0,5)(0,5)
E ([^a]*)* b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
E SAME aaaaaa (0,0)(0,0)
E ([^ab]*)* ccccxx (0,6)(0,6)
E SAME ababab (0,0)(0,0)
E ((z)+|a)* zabcde (0,2)(1,2)
#{E a+? aaaaaa (0,1) no *? +? minimal match ops
#E (a) aaa (0,1)(0,1)
#E (a*?) aaa (0,0)(0,0)
#E (a)*? aaa (0,0)
#E (a*?)*? aaa (0,0)
#}
B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
E (a*)*(x) x (0,1)(0,0)(0,1)
E (a*)*(x) ax (0,2)(0,1)(1,2)
E (a*)*(x) axa (0,2)(0,1)(1,2)
E (a*)+(x) x (0,1)(0,0)(0,1)
E (a*)+(x) ax (0,2)(0,1)(1,2)
E (a*)+(x) axa (0,2)(0,1)(1,2)
E (a*){2}(x) x (0,1)(0,0)(0,1)
E (a*){2}(x) ax (0,2)(1,1)(1,2)
E (a*){2}(x) axa (0,2)(1,1)(1,2)

Binary file not shown.

3779
src/regexp/testdata/re2-search.txt vendored Normal file

File diff suppressed because it is too large Load Diff

163
src/regexp/testdata/repetition.dat vendored Normal file
View File

@@ -0,0 +1,163 @@
NOTE implicit vs. explicit repetitions : 2009-02-02
# Glenn Fowler <gsf@research.att.com>
# conforming matches (column 4) must match one of the following BREs
# NOMATCH
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
# i.e., each 3-tuple has two identical elements and one (?,?)
E ((..)|(.)) NULL NOMATCH
E ((..)|(.))((..)|(.)) NULL NOMATCH
E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
E ((..)|(.)){1} NULL NOMATCH
E ((..)|(.)){2} NULL NOMATCH
E ((..)|(.)){3} NULL NOMATCH
E ((..)|(.))* NULL (0,0)
E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.))((..)|(.)) a NOMATCH
E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.)){2} a NOMATCH
E ((..)|(.)){3} a NOMATCH
E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
E ((..)|(.)){3} aa NOMATCH
E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
# Linux/GLIBC gets the {8,} and {8,8} wrong.
:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
# These test a fixed bug in my regex-tdfa that did not keep the expanded
# form properly grouped, so right association did the wrong thing with
# these ambiguous patterns (crafted just to test my code when I became
# suspicious of my implementation). The first subexpression should use
# "ab" then "a" then "bcd".
# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
# results like (0,6)(4,5)(6,6).
:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
# The above worked on Linux/GLIBC but the following often fail.
# They also trip up OS X / FreeBSD / NetBSD:
#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go

2286
src/regexp/testdata/testregex.c vendored Normal file

File diff suppressed because it is too large Load Diff