Initial commit: Go 1.23 release state

This commit is contained in:
Vorapol Rinsatitnon
2024-09-21 23:49:08 +10:00
commit 17cd57a668
13231 changed files with 3114330 additions and 0 deletions

20
src/unicode/casetables.go Normal file
View File

@@ -0,0 +1,20 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// TODO: This file contains the special casing rules for Turkish and Azeri only.
// It should encompass all the languages with special casing rules
// and be generated automatically, but that requires some API
// development first.
package unicode
// TurkishCase is the SpecialCase holding the Turkish mappings for the dotted
// and dotless forms of the letter I. It is initialized from _TurkishCase.
var TurkishCase SpecialCase = _TurkishCase
// _TurkishCase lists one single-rune CaseRange per affected letter. Each
// d{...} holds the deltas added to the rune to reach its upper, lower and
// title case forms, in that order; a zero delta leaves the rune unchanged.
var _TurkishCase = SpecialCase{
// U+0049 'I': lower case is U+0131.
CaseRange{0x0049, 0x0049, d{0, 0x131 - 0x49, 0}},
// U+0069 'i': upper and title case are U+0130.
CaseRange{0x0069, 0x0069, d{0x130 - 0x69, 0, 0x130 - 0x69}},
// U+0130: lower case is U+0069 'i'.
CaseRange{0x0130, 0x0130, d{0, 0x69 - 0x130, 0}},
// U+0131: upper and title case are U+0049 'I'.
CaseRange{0x0131, 0x0131, d{0x49 - 0x131, 0, 0x49 - 0x131}},
}
// AzeriCase is initialized from the same table as TurkishCase.
var AzeriCase SpecialCase = _TurkishCase

13
src/unicode/digit.go Normal file
View File

@@ -0,0 +1,13 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode
// IsDigit reports whether r is a decimal digit.
func IsDigit(r rune) bool {
	if r > MaxLatin1 {
		// Beyond Latin-1 the answer comes from the Digit table.
		return isExcludingLatin(Digit, r)
	}
	// Up to MaxLatin1 (negative runes included) only '0' through '9' qualify.
	return r >= '0' && r <= '9'
}

126
src/unicode/digit_test.go Normal file
View File

@@ -0,0 +1,126 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode_test
import (
"testing"
. "unicode"
)
// testDigit lists runes for which TestDigit expects IsDigit to report true.
var testDigit = []rune{
0x0030,
0x0039,
0x0661,
0x06F1,
0x07C9,
0x0966,
0x09EF,
0x0A66,
0x0AEF,
0x0B66,
0x0B6F,
0x0BE6,
0x0BEF,
0x0C66,
0x0CEF,
0x0D66,
0x0D6F,
0x0E50,
0x0E59,
0x0ED0,
0x0ED9,
0x0F20,
0x0F29,
0x1040,
0x1049,
0x1090,
0x1091,
0x1099,
0x17E0,
0x17E9,
0x1810,
0x1819,
0x1946,
0x194F,
0x19D0,
0x19D9,
0x1B50,
0x1B59,
0x1BB0,
0x1BB9,
0x1C40,
0x1C49,
0x1C50,
0x1C59,
0xA620,
0xA629,
0xA8D0,
0xA8D9,
0xA900,
0xA909,
0xAA50,
0xAA59,
0xFF10,
0xFF19,
0x104A1,
0x1D7CE,
}
// testLetter lists runes for which TestDigit expects IsDigit to report false.
var testLetter = []rune{
0x0041,
0x0061,
0x00AA,
0x00BA,
0x00C8,
0x00DB,
0x00F9,
0x02EC,
0x0535,
0x06E6,
0x093D,
0x0A15,
0x0B99,
0x0DC0,
0x0EDD,
0x1000,
0x1200,
0x1312,
0x1401,
0x1885,
0x2C00,
0xA800,
0xF900,
0xFA30,
0xFFDA,
0xFFDC,
0x10000,
0x10300,
0x10400,
0x20000,
0x2F800,
0x2FA1D,
}
// TestDigit checks IsDigit against both fixture lists: every rune in
// testDigit must be reported as a digit and no rune in testLetter may be.
func TestDigit(t *testing.T) {
	for _, digit := range testDigit {
		if IsDigit(digit) {
			continue
		}
		t.Errorf("IsDigit(U+%04X) = false, want true", digit)
	}
	for _, letter := range testLetter {
		if !IsDigit(letter) {
			continue
		}
		t.Errorf("IsDigit(U+%04X) = true, want false", letter)
	}
}
// Test that the Latin-1 special case in IsDigit gives the same answer as a
// lookup in the Digit table.
func TestDigitOptimization(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		fast, table := IsDigit(r), Is(Digit, r)
		if fast != table {
			t.Errorf("IsDigit(U+%04X) disagrees with Is(Digit)", r)
		}
	}
}

256
src/unicode/example_test.go Normal file
View File

@@ -0,0 +1,256 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode_test
import (
"fmt"
"unicode"
)
// Functions starting with "Is" can be used to inspect which table of range a
// rune belongs to. Note that runes may fit into more than one range.
func Example_is() {

	// constant with mixed type runes
	const mixed = "\b5Ὂg̀9! ℃ᾭG"
	for _, c := range mixed {
		fmt.Printf("For %q:\n", c)
		if unicode.IsControl(c) {
			fmt.Println("\tis control rune")
		}
		if unicode.IsDigit(c) {
			fmt.Println("\tis digit rune")
		}
		if unicode.IsGraphic(c) {
			fmt.Println("\tis graphic rune")
		}
		if unicode.IsLetter(c) {
			fmt.Println("\tis letter rune")
		}
		if unicode.IsLower(c) {
			fmt.Println("\tis lower case rune")
		}
		if unicode.IsMark(c) {
			fmt.Println("\tis mark rune")
		}
		if unicode.IsNumber(c) {
			fmt.Println("\tis number rune")
		}
		if unicode.IsPrint(c) {
			fmt.Println("\tis printable rune")
		}
		if !unicode.IsPrint(c) {
			fmt.Println("\tis not printable rune")
		}
		if unicode.IsPunct(c) {
			fmt.Println("\tis punct rune")
		}
		if unicode.IsSpace(c) {
			fmt.Println("\tis space rune")
		}
		if unicode.IsSymbol(c) {
			fmt.Println("\tis symbol rune")
		}
		if unicode.IsTitle(c) {
			fmt.Println("\tis title case rune")
		}
		if unicode.IsUpper(c) {
			fmt.Println("\tis upper case rune")
		}
	}

	// Every property line below starts with the tab that the matching
	// Println call emits; without it the expected output cannot match.

	// Output:
	// For '\b':
	// 	is control rune
	// 	is not printable rune
	// For '5':
	// 	is digit rune
	// 	is graphic rune
	// 	is number rune
	// 	is printable rune
	// For 'Ὂ':
	// 	is graphic rune
	// 	is letter rune
	// 	is printable rune
	// 	is upper case rune
	// For 'g':
	// 	is graphic rune
	// 	is letter rune
	// 	is lower case rune
	// 	is printable rune
	// For '̀':
	// 	is graphic rune
	// 	is mark rune
	// 	is printable rune
	// For '9':
	// 	is digit rune
	// 	is graphic rune
	// 	is number rune
	// 	is printable rune
	// For '!':
	// 	is graphic rune
	// 	is printable rune
	// 	is punct rune
	// For ' ':
	// 	is graphic rune
	// 	is printable rune
	// 	is space rune
	// For '℃':
	// 	is graphic rune
	// 	is printable rune
	// 	is symbol rune
	// For 'ᾭ':
	// 	is graphic rune
	// 	is letter rune
	// 	is printable rune
	// 	is title case rune
	// For 'G':
	// 	is graphic rune
	// 	is letter rune
	// 	is printable rune
	// 	is upper case rune
}
// ExampleSimpleFold prints the SimpleFold result for members of the A/a and
// K/k/Kelvin-sign orbits and for a rune that has no case variants.
func ExampleSimpleFold() {
	fmt.Printf("%#U\n", unicode.SimpleFold('A'))      // 'a'
	fmt.Printf("%#U\n", unicode.SimpleFold('a'))      // 'A'
	fmt.Printf("%#U\n", unicode.SimpleFold('K'))      // 'k'
	fmt.Printf("%#U\n", unicode.SimpleFold('k'))      // '\u212A' (Kelvin symbol, K)
	fmt.Printf("%#U\n", unicode.SimpleFold('\u212A')) // 'K'
	fmt.Printf("%#U\n", unicode.SimpleFold('1'))      // '1'

	// The fourth line shows the Kelvin sign U+212A itself between the
	// quotes, because %#U prints the character after its code point.

	// Output:
	// U+0061 'a'
	// U+0041 'A'
	// U+006B 'k'
	// U+212A 'K'
	// U+004B 'K'
	// U+0031 '1'
}
// ExampleTo maps lower case 'g' and then upper case 'G' to each of the three
// cases in turn.
func ExampleTo() {
	for _, r := range []rune{'g', 'G'} {
		for _, c := range []int{unicode.UpperCase, unicode.LowerCase, unicode.TitleCase} {
			fmt.Printf("%#U\n", unicode.To(c, r))
		}
	}
	// Output:
	// U+0047 'G'
	// U+0067 'g'
	// U+0047 'G'
	// U+0047 'G'
	// U+0067 'g'
	// U+0047 'G'
}
// ExampleToLower lowers an upper case ASCII letter.
func ExampleToLower() {
	lowered := unicode.ToLower('G')
	fmt.Printf("%#U\n", lowered)
	// Output:
	// U+0067 'g'
}
// ExampleToTitle maps a lower case ASCII letter to title case.
func ExampleToTitle() {
	titled := unicode.ToTitle('g')
	fmt.Printf("%#U\n", titled)
	// Output:
	// U+0047 'G'
}
// ExampleToUpper raises a lower case ASCII letter.
func ExampleToUpper() {
	raised := unicode.ToUpper('g')
	fmt.Printf("%#U\n", raised)
	// Output:
	// U+0047 'G'
}
// ExampleSpecialCase applies the Turkish mappings to 'i' and to 'İ',
// printing the lower, title and upper case form of each.
func ExampleSpecialCase() {
	turkish := unicode.TurkishCase
	for _, r := range []rune{'i', 'İ'} {
		fmt.Printf("%#U\n", turkish.ToLower(r))
		fmt.Printf("%#U\n", turkish.ToTitle(r))
		fmt.Printf("%#U\n", turkish.ToUpper(r))
	}
	// Output:
	// U+0069 'i'
	// U+0130 'İ'
	// U+0130 'İ'
	// U+0069 'i'
	// U+0130 'İ'
	// U+0130 'İ'
}
// ExampleIsDigit tests a non-ASCII digit and then a letter.
func ExampleIsDigit() {
	for _, r := range []rune{'৩', 'A'} {
		fmt.Printf("%t\n", unicode.IsDigit(r))
	}
	// Output:
	// true
	// false
}
// ExampleIsNumber tests a Roman numeral rune and then a letter.
func ExampleIsNumber() {
	for _, r := range []rune{'Ⅷ', 'A'} {
		fmt.Printf("%t\n", unicode.IsNumber(r))
	}
	// Output:
	// true
	// false
}
// ExampleIsLetter tests a letter and then a digit.
func ExampleIsLetter() {
	for _, r := range []rune{'A', '7'} {
		fmt.Printf("%t\n", unicode.IsLetter(r))
	}
	// Output:
	// true
	// false
}
// ExampleIsLower tests a lower case letter and then an upper case one.
func ExampleIsLower() {
	for _, r := range []rune{'a', 'A'} {
		fmt.Printf("%t\n", unicode.IsLower(r))
	}
	// Output:
	// true
	// false
}
// ExampleIsUpper tests an upper case letter and then a lower case one.
func ExampleIsUpper() {
	for _, r := range []rune{'A', 'a'} {
		fmt.Printf("%t\n", unicode.IsUpper(r))
	}
	// Output:
	// true
	// false
}
// ExampleIsTitle tests a title case digraph and then a lower case letter.
func ExampleIsTitle() {
	for _, r := range []rune{'ǅ', 'a'} {
		fmt.Printf("%t\n", unicode.IsTitle(r))
	}
	// Output:
	// true
	// false
}
// ExampleIsSpace tests three ASCII white space runes and then a letter.
func ExampleIsSpace() {
	for _, r := range []rune{' ', '\n', '\t', 'a'} {
		fmt.Printf("%t\n", unicode.IsSpace(r))
	}
	// Output:
	// true
	// true
	// true
	// false
}

146
src/unicode/graphic.go Normal file
View File

@@ -0,0 +1,146 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode
// Bit masks for each code point under U+0100, for fast lookup.
// The first eight names take successive single bits through iota; pg, pLo
// and pLmask are combinations of those bits.
const (
pC = 1 << iota // a control character.
pP // a punctuation character.
pN // a numeral.
pS // a symbolic character.
pZ // a spacing character.
pLu // an upper-case letter.
pLl // a lower-case letter.
pp // a printable character according to Go's definition.
pg = pp | pZ // a graphical character according to the Unicode definition.
pLo = pLl | pLu // a letter that is neither upper nor lower case.
pLmask = pLo // both letter bits; masked with this to test for a letter or its case.
)
// GraphicRanges defines the set of graphic characters according to Unicode.
// IsGraphic consults these tables for runes above MaxLatin1.
var GraphicRanges = []*RangeTable{
L, M, N, P, S, Zs,
}
// PrintRanges defines the set of printable characters according to Go.
// ASCII space, U+0020, is handled separately.
// IsPrint consults these tables for runes above MaxLatin1.
var PrintRanges = []*RangeTable{
L, M, N, P, S,
}
// IsGraphic reports whether the rune is defined as a Graphic by Unicode.
// Such characters include letters, marks, numbers, punctuation, symbols, and
// spaces, from categories [L], [M], [N], [P], [S], [Zs].
func IsGraphic(r rune) bool {
	// Comparing as uint32 folds the test for negative runes into the upper
	// bound test, and indexing with uint8 avoids the range check.
	if uint32(r) > MaxLatin1 {
		return In(r, GraphicRanges...)
	}
	return properties[uint8(r)]&pg != 0
}
// IsPrint reports whether the rune is defined as printable by Go. Such
// characters include letters, marks, numbers, punctuation, symbols, and the
// ASCII space character, from categories [L], [M], [N], [P], [S] and the ASCII
// space character. This categorization is the same as [IsGraphic] except that
// the only spacing character is ASCII space, U+0020.
func IsPrint(r rune) bool {
	if uint32(r) > MaxLatin1 {
		return In(r, PrintRanges...)
	}
	// Latin-1 runes are answered from the property table.
	return properties[uint8(r)]&pp != 0
}
// IsOneOf reports whether the rune is a member of one of the ranges.
// The function "In" provides a nicer signature and should be used in preference to IsOneOf.
func IsOneOf(ranges []*RangeTable, r rune) bool {
	// In performs the same table-by-table scan; only the argument order differs.
	return In(r, ranges...)
}
// In reports whether the rune is a member of one of the ranges.
// The tables are tried in the order given and the scan stops at the first
// one that contains r.
func In(r rune, ranges ...*RangeTable) bool {
	for i := range ranges {
		if Is(ranges[i], r) {
			return true
		}
	}
	return false
}
// IsControl reports whether the rune is a control character.
// The [C] ([Other]) Unicode category includes more code points
// such as surrogates; use [Is](C, r) to test for them.
func IsControl(r rune) bool {
	// All control characters are < MaxLatin1, so anything above is not one.
	if uint32(r) > MaxLatin1 {
		return false
	}
	return properties[uint8(r)]&pC != 0
}
// IsLetter reports whether the rune is a letter (category [L]).
func IsLetter(r rune) bool {
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(Letter, r)
	}
	// Any letter bit set in the property table marks a Latin-1 letter.
	return properties[uint8(r)]&pLmask != 0
}
// IsMark reports whether the rune is a mark character (category [M]).
func IsMark(r rune) bool {
// There are no mark characters in Latin-1, so there is no property-table
// fast path here; the Mark table is consulted directly.
return isExcludingLatin(Mark, r)
}
// IsNumber reports whether the rune is a number (category [N]).
func IsNumber(r rune) bool {
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(Number, r)
	}
	// Latin-1 runes are answered from the property table.
	return properties[uint8(r)]&pN != 0
}
// IsPunct reports whether the rune is a Unicode punctuation character
// (category [P]).
func IsPunct(r rune) bool {
	if uint32(r) > MaxLatin1 {
		return Is(Punct, r)
	}
	// Latin-1 runes are answered from the property table.
	return properties[uint8(r)]&pP != 0
}
// IsSpace reports whether the rune is a space character as defined
// by Unicode's White Space property; in the Latin-1 space
// this is
//
//	'\t', '\n', '\v', '\f', '\r', ' ', U+0085 (NEL), U+00A0 (NBSP).
//
// Other definitions of spacing characters are set by category
// Z and property [Pattern_White_Space].
func IsSpace(r rune) bool {
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(White_Space, r)
	}
	// This property isn't the same as Z, so the Latin-1 members are
	// listed explicitly.
	switch r {
	case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
		return true
	default:
		return false
	}
}
// IsSymbol reports whether the rune is a symbolic character.
func IsSymbol(r rune) bool {
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(Symbol, r)
	}
	// Latin-1 runes are answered from the property table.
	return properties[uint8(r)]&pS != 0
}

122
src/unicode/graphic_test.go Normal file
View File

@@ -0,0 +1,122 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode_test
import (
"testing"
. "unicode"
)
// Independently check that the special "Is" functions work
// in the Latin-1 range through the property table.
func TestIsControlLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		// The test expects control characters in exactly U+0000-U+001F
		// and U+007F-U+009F.
		want := (0x00 <= r && r <= 0x1F) || (0x7F <= r && r <= 0x9F)
		if got := IsControl(r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
// TestIsLetterLatin1 compares IsLetter with the Letter table over Latin-1.
func TestIsLetterLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsLetter(r), Is(Letter, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
// TestIsUpperLatin1 compares IsUpper with the Upper table over Latin-1.
func TestIsUpperLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsUpper(r), Is(Upper, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
// TestIsLowerLatin1 compares IsLower with the Lower table over Latin-1.
func TestIsLowerLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsLower(r), Is(Lower, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
// TestNumberLatin1 compares IsNumber with the Number table over Latin-1.
func TestNumberLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsNumber(r), Is(Number, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
// TestIsPrintLatin1 compares IsPrint with PrintRanges over Latin-1, counting
// ASCII space as printable regardless of the tables.
func TestIsPrintLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		want := r == ' ' || In(r, PrintRanges...)
		if got := IsPrint(r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
// TestIsGraphicLatin1 compares IsGraphic with GraphicRanges over Latin-1.
func TestIsGraphicLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsGraphic(r), In(r, GraphicRanges...); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
// TestIsPunctLatin1 compares IsPunct with the Punct table over Latin-1.
func TestIsPunctLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsPunct(r), Is(Punct, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
// TestIsSpaceLatin1 compares IsSpace with the White_Space table over Latin-1.
func TestIsSpaceLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsSpace(r), Is(White_Space, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
// TestIsSymbolLatin1 compares IsSymbol with the Symbol table over Latin-1.
func TestIsSymbolLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsSymbol(r), Is(Symbol, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}

371
src/unicode/letter.go Normal file
View File

@@ -0,0 +1,371 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package unicode provides data and functions to test some properties of
// Unicode code points.
package unicode
// Rune limits and the replacement rune used throughout the package.
const (
MaxRune = '\U0010FFFF' // Maximum valid Unicode code point.
ReplacementChar = '\uFFFD' // Represents invalid code points.
MaxASCII = '\u007F' // Maximum ASCII value.
MaxLatin1 = '\u00FF' // Maximum Latin-1 value.
)
// RangeTable defines a set of Unicode code points by listing the ranges of
// code points within the set. The ranges are listed in two slices
// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
// The two slices must be in sorted order and non-overlapping.
// Also, R32 should contain only values >= 0x10000 (1<<16).
type RangeTable struct {
R16 []Range16 // ranges whose bounds fit in 16 bits
R32 []Range32 // ranges with bounds >= 1<<16
LatinOffset int // number of entries in R16 with Hi <= MaxLatin1
}
// Range16 represents a range of 16-bit Unicode code points. The range runs
// from Lo to Hi inclusive and has the specified stride.
type Range16 struct {
Lo uint16 // first code point in the range
Hi uint16 // last code point in the range
Stride uint16 // distance between consecutive members, counted from Lo
}
// Range32 represents a range of Unicode code points and is used when one or
// more of the values will not fit in 16 bits. The range runs from Lo to Hi
// inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
type Range32 struct {
Lo uint32 // first code point in the range
Hi uint32 // last code point in the range
Stride uint32 // distance between consecutive members, counted from Lo
}
// CaseRange represents a range of Unicode code points for simple (one
// code point to one code point) case conversion.
// The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
// are the number to add to the code point to reach the code point for a
// different case for that character. They may be negative. If zero, it
// means the character is in the corresponding case. There is a special
// case representing sequences of alternating corresponding Upper and Lower
// pairs. It appears with a fixed Delta of
//
//	{UpperLower, UpperLower, UpperLower}
//
// The constant UpperLower has an otherwise impossible delta value.
type CaseRange struct {
Lo uint32 // first code point in the range
Hi uint32 // last code point in the range
Delta d // per-case deltas, indexed by UpperCase, LowerCase and TitleCase
}
// SpecialCase represents language-specific case mappings such as Turkish.
// Methods of SpecialCase customize (by overriding) the standard mappings:
// a rune with no entry in the SpecialCase falls back to the package-level
// ToUpper, ToTitle or ToLower.
type SpecialCase []CaseRange
// BUG(r): There is no mechanism for full case folding, that is, for
// characters that involve multiple runes in the input or output.
// Indices into the Delta arrays inside CaseRanges for case mapping.
const (
UpperCase = iota // index of the delta that yields the upper case form
LowerCase // index of the delta that yields the lower case form
TitleCase // index of the delta that yields the title case form
MaxCase // number of cases; also the length of a Delta array
)
// d holds one delta per case, indexed by UpperCase, LowerCase and TitleCase.
type d [MaxCase]rune // to make the CaseRanges text shorter
// If the Delta field of a [CaseRange] is UpperLower, it means
// this CaseRange represents a sequence of the form (say)
// [Upper] [Lower] [Upper] [Lower].
// The case mapping code recognizes the marker by testing delta > MaxRune.
const (
UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
)
// linearMax is the maximum size table for linear search for non-Latin1 rune.
// Tables with more entries than this are binary searched instead.
// Derived by running 'go test -calibrate'.
const linearMax = 18
// is16 reports whether r is in the sorted slice of 16-bit ranges.
func is16(ranges []Range16, r uint16) bool {
	// Short tables, and every Latin-1 rune, are scanned front to back.
	if len(ranges) <= linearMax || r <= MaxLatin1 {
		for i := range ranges {
			cur := &ranges[i]
			switch {
			case r < cur.Lo:
				// The slice is sorted, so r lies before every remaining range.
				return false
			case r <= cur.Hi:
				return cur.Stride == 1 || (r-cur.Lo)%cur.Stride == 0
			}
		}
		return false
	}

	// Otherwise binary search over the half-open window [low, high).
	low, high := 0, len(ranges)
	for low < high {
		mid := int(uint(low+high) >> 1)
		cur := &ranges[mid]
		switch {
		case r < cur.Lo:
			high = mid
		case r > cur.Hi:
			low = mid + 1
		default:
			// Inside [Lo, Hi]: a member only if it sits on the stride.
			return cur.Stride == 1 || (r-cur.Lo)%cur.Stride == 0
		}
	}
	return false
}
// is32 reports whether r is in the sorted slice of 32-bit ranges.
func is32(ranges []Range32, r uint32) bool {
	// Short tables are scanned front to back.
	if len(ranges) <= linearMax {
		for i := range ranges {
			cur := &ranges[i]
			switch {
			case r < cur.Lo:
				// The slice is sorted, so r lies before every remaining range.
				return false
			case r <= cur.Hi:
				return cur.Stride == 1 || (r-cur.Lo)%cur.Stride == 0
			}
		}
		return false
	}

	// Otherwise binary search over the half-open window [low, high).
	low, high := 0, len(ranges)
	for low < high {
		mid := int(uint(low+high) >> 1)
		cur := ranges[mid]
		switch {
		case r < cur.Lo:
			high = mid
		case r > cur.Hi:
			low = mid + 1
		default:
			// Inside [Lo, Hi]: a member only if it sits on the stride.
			return cur.Stride == 1 || (r-cur.Lo)%cur.Stride == 0
		}
	}
	return false
}
// Is reports whether the rune is in the specified table of ranges.
func Is(rangeTab *RangeTable, r rune) bool {
	// Compare as uint32 to correctly handle negative runes: they become
	// larger than any 16-bit Hi and so skip the 16-bit table.
	if r16 := rangeTab.R16; len(r16) > 0 && uint32(r) <= uint32(r16[len(r16)-1].Hi) {
		return is16(r16, uint16(r))
	}
	if r32 := rangeTab.R32; len(r32) > 0 && r >= rune(r32[0].Lo) {
		return is32(r32, uint32(r))
	}
	return false
}
// isExcludingLatin reports whether r is in rangeTab, like Is, except that
// the first LatinOffset entries of R16 are left out of the 16-bit search.
func isExcludingLatin(rangeTab *RangeTable, r rune) bool {
	r16, skip := rangeTab.R16, rangeTab.LatinOffset
	// Compare as uint32 to correctly handle negative runes.
	if len(r16) > skip && uint32(r) <= uint32(r16[len(r16)-1].Hi) {
		return is16(r16[skip:], uint16(r))
	}
	if r32 := rangeTab.R32; len(r32) > 0 && r >= rune(r32[0].Lo) {
		return is32(r32, uint32(r))
	}
	return false
}
// IsUpper reports whether the rune is an upper case letter.
func IsUpper(r rune) bool {
	// See comment in IsGraphic.
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(Upper, r)
	}
	// Exactly the upper-case bit must be set among the letter bits.
	return properties[uint8(r)]&pLmask == pLu
}
// IsLower reports whether the rune is a lower case letter.
func IsLower(r rune) bool {
	// See comment in IsGraphic.
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(Lower, r)
	}
	// Exactly the lower-case bit must be set among the letter bits.
	return properties[uint8(r)]&pLmask == pLl
}
// IsTitle reports whether the rune is a title case letter.
func IsTitle(r rune) bool {
	// Runes up to MaxLatin1 are never reported as title case.
	if r > MaxLatin1 {
		return isExcludingLatin(Title, r)
	}
	return false
}
// to maps the rune using the specified case mapping.
// It additionally reports whether caseRange contained a mapping for r.
// An out-of-range _case yields ReplacementChar and false; a rune found in
// no range is returned unchanged with false.
func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) {
	if _case < 0 || MaxCase <= _case {
		return ReplacementChar, false // as reasonable an error as any
	}

	// Binary search over the half-open window [low, high).
	low, high := 0, len(caseRange)
	for low < high {
		mid := int(uint(low+high) >> 1)
		cr := caseRange[mid]
		first, last := rune(cr.Lo), rune(cr.Hi)
		if r < first {
			high = mid
			continue
		}
		if r > last {
			low = mid + 1
			continue
		}

		// r lies inside this range.
		delta := cr.Delta[_case]
		if delta <= MaxRune {
			return r + delta, true
		}
		// In an Upper-Lower sequence, which always starts with
		// an UpperCase letter, the real deltas always look like:
		//	{0, 1, 0}    UpperCase (Lower is next)
		//	{-1, 0, -1}  LowerCase (Upper, Title are previous)
		// The characters at even offsets from the beginning of the
		// sequence are upper case; the ones at odd offsets are lower.
		// The correct mapping can be done by clearing or setting the low
		// bit in the sequence offset.
		// The constants UpperCase and TitleCase are even while LowerCase
		// is odd so we take the low bit from _case.
		return first + ((r-first)&^1 | rune(_case&1)), true
	}
	return r, false
}
// To maps the rune to the specified case: [UpperCase], [LowerCase], or [TitleCase].
func To(_case int, r rune) rune {
	// Whether a mapping was found does not matter here; to already returns
	// r unchanged when there is none.
	mapped, _ := to(_case, r, CaseRanges)
	return mapped
}
// ToUpper maps the rune to upper case.
func ToUpper(r rune) rune {
	if r > MaxASCII {
		return To(UpperCase, r)
	}
	// ASCII fast path: only 'a'..'z' change.
	if r >= 'a' && r <= 'z' {
		return r - ('a' - 'A')
	}
	return r
}
// ToLower maps the rune to lower case.
func ToLower(r rune) rune {
	if r > MaxASCII {
		return To(LowerCase, r)
	}
	// ASCII fast path: only 'A'..'Z' change.
	if r >= 'A' && r <= 'Z' {
		return r + ('a' - 'A')
	}
	return r
}
// ToTitle maps the rune to title case.
func ToTitle(r rune) rune {
	if r > MaxASCII {
		return To(TitleCase, r)
	}
	// ASCII fast path: title case is upper case for ASCII.
	if r >= 'a' && r <= 'z' {
		return r - ('a' - 'A')
	}
	return r
}
// ToUpper maps the rune to upper case giving priority to the special mapping.
func (special SpecialCase) ToUpper(r rune) rune {
	mapped, found := to(UpperCase, r, []CaseRange(special))
	if found || mapped != r {
		return mapped
	}
	// No special entry for r: use the standard mapping.
	return ToUpper(r)
}
// ToTitle maps the rune to title case giving priority to the special mapping.
func (special SpecialCase) ToTitle(r rune) rune {
	mapped, found := to(TitleCase, r, []CaseRange(special))
	if found || mapped != r {
		return mapped
	}
	// No special entry for r: use the standard mapping.
	return ToTitle(r)
}
// ToLower maps the rune to lower case giving priority to the special mapping.
func (special SpecialCase) ToLower(r rune) rune {
	mapped, found := to(LowerCase, r, []CaseRange(special))
	if found || mapped != r {
		return mapped
	}
	// No special entry for r: use the standard mapping.
	return ToLower(r)
}
// caseOrbit is defined in tables.go as []foldPair. Right now all the
// entries fit in uint16, so use uint16. If that changes, compilation
// will fail (the constants in the composite literal will not fit in uint16)
// and the types here can change to uint32.
type foldPair struct {
From uint16 // rune that SimpleFold looks up
To uint16 // rune that SimpleFold returns for From
}
// SimpleFold iterates over Unicode code points equivalent under
// the Unicode-defined simple case folding. Among the code points
// equivalent to rune (including rune itself), SimpleFold returns the
// smallest rune > r if one exists, or else the smallest rune >= 0.
// If r is not a valid Unicode code point, SimpleFold(r) returns r.
//
// For example:
//
//	SimpleFold('A') = 'a'
//	SimpleFold('a') = 'A'
//
//	SimpleFold('K') = 'k'
//	SimpleFold('k') = '\u212A' (Kelvin symbol, K)
//	SimpleFold('\u212A') = 'K'
//
//	SimpleFold('1') = '1'
//
//	SimpleFold(-2) = -2
func SimpleFold(r rune) rune {
	if r < 0 || r > MaxRune {
		return r
	}

	// Runes covered by asciiFold are answered by direct lookup.
	if int(r) < len(asciiFold) {
		return rune(asciiFold[r])
	}

	// Consult caseOrbit table for special cases: find the first entry
	// whose From is not below r.
	low, high := 0, len(caseOrbit)
	for low < high {
		mid := int(uint(low+high) >> 1)
		if rune(caseOrbit[mid].From) >= r {
			high = mid
		} else {
			low = mid + 1
		}
	}
	if low < len(caseOrbit) && rune(caseOrbit[low].From) == r {
		return rune(caseOrbit[low].To)
	}

	// No folding specified. This is a one- or two-element
	// equivalence class containing rune and ToLower(rune)
	// and ToUpper(rune) if they are different from rune.
	lower := ToLower(r)
	if lower == r {
		return ToUpper(r)
	}
	return lower
}

644
src/unicode/letter_test.go Normal file
View File

@@ -0,0 +1,644 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode_test
import (
"flag"
"fmt"
"runtime"
"sort"
"strings"
"testing"
. "unicode"
)
// upperTest lists runes that TestIsUpper expects IsUpper to accept and that
// TestIsLetter expects IsLetter to accept.
var upperTest = []rune{
0x41,
0xc0,
0xd8,
0x100,
0x139,
0x14a,
0x178,
0x181,
0x376,
0x3cf,
0x13bd,
0x1f2a,
0x2102,
0x2c00,
0x2c10,
0x2c20,
0xa650,
0xa722,
0xff3a,
0x10400,
0x1d400,
0x1d7ca,
}
// notupperTest lists runes that TestIsUpper expects IsUpper to reject.
var notupperTest = []rune{
0x40,
0x5b,
0x61,
0x185,
0x1b0,
0x377,
0x387,
0x2150,
0xab7d,
0xffff,
0x10000,
}
// letterTest lists runes that TestIsLetter expects IsLetter to accept and
// that TestIsSpace expects IsSpace to reject.
var letterTest = []rune{
0x41,
0x61,
0xaa,
0xba,
0xc8,
0xdb,
0xf9,
0x2ec,
0x535,
0x620,
0x6e6,
0x93d,
0xa15,
0xb99,
0xdc0,
0xedd,
0x1000,
0x1200,
0x1312,
0x1401,
0x2c00,
0xa800,
0xf900,
0xfa30,
0xffda,
0xffdc,
0x10000,
0x10300,
0x10400,
0x20000,
0x2f800,
0x2fa1d,
}
// notletterTest lists runes that TestIsLetter expects IsLetter to reject and
// that TestIsUpper expects IsUpper to reject.
var notletterTest = []rune{
0x20,
0x35,
0x375,
0x619,
0x700,
0x1885,
0xfffe,
0x1ffff,
0x10ffff,
}
// spaceTest lists runes that TestIsSpace expects IsSpace to accept.
// Contains all the special cased Latin-1 chars.
var spaceTest = []rune{
0x09,
0x0a,
0x0b,
0x0c,
0x0d,
0x20,
0x85,
0xA0,
0x2000,
0x3000,
}
// caseT is one case-mapping expectation: mapping in to case cas must give out.
type caseT struct {
cas int // case index handed to To (UpperCase, LowerCase, TitleCase, or an invalid value)
in, out rune // input rune and the expected mapped rune
}
// caseTest is the expectation table shared by TestTo, TestToUpperCase,
// TestToLowerCase and TestToTitleCase. The commented entries quote the
// UnicodeData.txt record for the rune under test.
var caseTest = []caseT{
// errors
{-1, '\n', 0xFFFD},
{UpperCase, -1, -1},
{UpperCase, 1 << 30, 1 << 30},
// ASCII (special-cased so test carefully)
{UpperCase, '\n', '\n'},
{UpperCase, 'a', 'A'},
{UpperCase, 'A', 'A'},
{UpperCase, '7', '7'},
{LowerCase, '\n', '\n'},
{LowerCase, 'a', 'a'},
{LowerCase, 'A', 'a'},
{LowerCase, '7', '7'},
{TitleCase, '\n', '\n'},
{TitleCase, 'a', 'A'},
{TitleCase, 'A', 'A'},
{TitleCase, '7', '7'},
// Latin-1: easy to read the tests!
{UpperCase, 0x80, 0x80},
{UpperCase, 'Å', 'Å'},
{UpperCase, 'å', 'Å'},
{LowerCase, 0x80, 0x80},
{LowerCase, 'Å', 'å'},
{LowerCase, 'å', 'å'},
{TitleCase, 0x80, 0x80},
{TitleCase, 'Å', 'Å'},
{TitleCase, 'å', 'Å'},
// 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049
{UpperCase, 0x0131, 'I'},
{LowerCase, 0x0131, 0x0131},
{TitleCase, 0x0131, 'I'},
// 0133;LATIN SMALL LIGATURE IJ;Ll;0;L;<compat> 0069 006A;;;;N;LATIN SMALL LETTER I J;;0132;;0132
{UpperCase, 0x0133, 0x0132},
{LowerCase, 0x0133, 0x0133},
{TitleCase, 0x0133, 0x0132},
// 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B;
{UpperCase, 0x212A, 0x212A},
{LowerCase, 0x212A, 'k'},
{TitleCase, 0x212A, 0x212A},
// From an UpperLower sequence
// A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641;
{UpperCase, 0xA640, 0xA640},
{LowerCase, 0xA640, 0xA641},
{TitleCase, 0xA640, 0xA640},
// A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640
{UpperCase, 0xA641, 0xA640},
{LowerCase, 0xA641, 0xA641},
{TitleCase, 0xA641, 0xA640},
// A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F;
{UpperCase, 0xA64E, 0xA64E},
{LowerCase, 0xA64E, 0xA64F},
{TitleCase, 0xA64E, 0xA64E},
// A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E
{UpperCase, 0xA65F, 0xA65E},
{LowerCase, 0xA65F, 0xA65F},
{TitleCase, 0xA65F, 0xA65E},
// From another UpperLower sequence
// 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A;
{UpperCase, 0x0139, 0x0139},
{LowerCase, 0x0139, 0x013A},
{TitleCase, 0x0139, 0x0139},
// 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L;<compat> 004C 00B7;;;;N;;;;0140;
{UpperCase, 0x013f, 0x013f},
{LowerCase, 0x013f, 0x0140},
{TitleCase, 0x013f, 0x013f},
// 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N HACEK;;0147;;0147
{UpperCase, 0x0148, 0x0147},
{LowerCase, 0x0148, 0x0148},
{TitleCase, 0x0148, 0x0147},
// Lowercase lower than uppercase.
// AB78;CHEROKEE SMALL LETTER GE;Ll;0;L;;;;;N;;;13A8;;13A8
{UpperCase, 0xab78, 0x13a8},
{LowerCase, 0xab78, 0xab78},
{TitleCase, 0xab78, 0x13a8},
{UpperCase, 0x13a8, 0x13a8},
{LowerCase, 0x13a8, 0xab78},
{TitleCase, 0x13a8, 0x13a8},
// Last block in the 5.1.0 table
// 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428;
{UpperCase, 0x10400, 0x10400},
{LowerCase, 0x10400, 0x10428},
{TitleCase, 0x10400, 0x10400},
// 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F;
{UpperCase, 0x10427, 0x10427},
{LowerCase, 0x10427, 0x1044F},
{TitleCase, 0x10427, 0x10427},
// 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400
{UpperCase, 0x10428, 0x10400},
{LowerCase, 0x10428, 0x10428},
{TitleCase, 0x10428, 0x10400},
// 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427
{UpperCase, 0x1044F, 0x10427},
{LowerCase, 0x1044F, 0x1044F},
{TitleCase, 0x1044F, 0x10427},
// First one not in the 5.1.0 table
// 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;;
{UpperCase, 0x10450, 0x10450},
{LowerCase, 0x10450, 0x10450},
{TitleCase, 0x10450, 0x10450},
// Non-letters with case.
{LowerCase, 0x2161, 0x2171},
{UpperCase, 0x0345, 0x0399},
}
// TestIsLetter checks that IsLetter accepts every rune in upperTest and
// letterTest and rejects every rune in notletterTest.
func TestIsLetter(t *testing.T) {
	for _, letters := range [][]rune{upperTest, letterTest} {
		for _, r := range letters {
			if IsLetter(r) {
				continue
			}
			t.Errorf("IsLetter(U+%04X) = false, want true", r)
		}
	}
	for _, r := range notletterTest {
		if !IsLetter(r) {
			continue
		}
		t.Errorf("IsLetter(U+%04X) = true, want false", r)
	}
}
// TestIsUpper checks that IsUpper accepts every rune in upperTest and
// rejects every rune in notupperTest and notletterTest.
func TestIsUpper(t *testing.T) {
	for _, r := range upperTest {
		if IsUpper(r) {
			continue
		}
		t.Errorf("IsUpper(U+%04X) = false, want true", r)
	}
	for _, others := range [][]rune{notupperTest, notletterTest} {
		for _, r := range others {
			if !IsUpper(r) {
				continue
			}
			t.Errorf("IsUpper(U+%04X) = true, want false", r)
		}
	}
}
// caseString returns the name of the case constant c for use in test
// messages, or "ErrorCase" when c is not one of the three case indices.
func caseString(c int) string {
	names := [...]string{
		UpperCase: "UpperCase",
		LowerCase: "LowerCase",
		TitleCase: "TitleCase",
	}
	if c < 0 || c >= len(names) {
		return "ErrorCase"
	}
	return names[c]
}
// TestTo runs every caseTest entry through To.
func TestTo(t *testing.T) {
	for _, tc := range caseTest {
		if got := To(tc.cas, tc.in); got != tc.out {
			t.Errorf("To(U+%04X, %s) = U+%04X want U+%04X", tc.in, caseString(tc.cas), got, tc.out)
		}
	}
}
// TestToUpperCase runs the UpperCase entries of caseTest through ToUpper.
func TestToUpperCase(t *testing.T) {
	for _, tc := range caseTest {
		if tc.cas == UpperCase {
			if got := ToUpper(tc.in); got != tc.out {
				t.Errorf("ToUpper(U+%04X) = U+%04X want U+%04X", tc.in, got, tc.out)
			}
		}
	}
}
// TestToLowerCase runs the LowerCase entries of caseTest through ToLower.
func TestToLowerCase(t *testing.T) {
	for _, tc := range caseTest {
		if tc.cas == LowerCase {
			if got := ToLower(tc.in); got != tc.out {
				t.Errorf("ToLower(U+%04X) = U+%04X want U+%04X", tc.in, got, tc.out)
			}
		}
	}
}
// TestToTitleCase runs the TitleCase entries of caseTest through ToTitle.
func TestToTitleCase(t *testing.T) {
	for _, tc := range caseTest {
		if tc.cas == TitleCase {
			if got := ToTitle(tc.in); got != tc.out {
				t.Errorf("ToTitle(U+%04X) = U+%04X want U+%04X", tc.in, got, tc.out)
			}
		}
	}
}
// TestIsSpace checks that IsSpace accepts every rune in spaceTest and
// rejects every rune in letterTest.
func TestIsSpace(t *testing.T) {
	for _, space := range spaceTest {
		if IsSpace(space) {
			continue
		}
		t.Errorf("IsSpace(U+%04X) = false; want true", space)
	}
	for _, letter := range letterTest {
		if !IsSpace(letter) {
			continue
		}
		t.Errorf("IsSpace(U+%04X) = true; want false", letter)
	}
}
// Check that the optimizations for IsLetter etc. agree with the tables.
// We only need to check the Latin-1 range.
func TestLetterOptimizations(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		// Predicates: specialized function versus plain table lookup.
		if fast, table := IsLetter(r), Is(Letter, r); fast != table {
			t.Errorf("IsLetter(U+%04X) disagrees with Is(Letter)", r)
		}
		if fast, table := IsUpper(r), Is(Upper, r); fast != table {
			t.Errorf("IsUpper(U+%04X) disagrees with Is(Upper)", r)
		}
		if fast, table := IsLower(r), Is(Lower, r); fast != table {
			t.Errorf("IsLower(U+%04X) disagrees with Is(Lower)", r)
		}
		if fast, table := IsTitle(r), Is(Title, r); fast != table {
			t.Errorf("IsTitle(U+%04X) disagrees with Is(Title)", r)
		}
		if fast, table := IsSpace(r), Is(White_Space, r); fast != table {
			t.Errorf("IsSpace(U+%04X) disagrees with Is(White_Space)", r)
		}

		// Mappings: specialized function versus the general To.
		if fast, general := ToUpper(r), To(UpperCase, r); fast != general {
			t.Errorf("ToUpper(U+%04X) disagrees with To(Upper)", r)
		}
		if fast, general := ToLower(r), To(LowerCase, r); fast != general {
			t.Errorf("ToLower(U+%04X) disagrees with To(Lower)", r)
		}
		if fast, general := ToTitle(r), To(TitleCase, r); fast != general {
			t.Errorf("ToTitle(U+%04X) disagrees with To(Title)", r)
		}
	}
}
// TestTurkishCase checks the TurkishCase special-casing rules over the
// Turkish alphabet. lower and upper are index-aligned: upper[i] is the
// expected upper (and title) case form of lower[i], including the dotted and
// dotless i pairs.
func TestTurkishCase(t *testing.T) {
	lower := []rune("abcçdefgğhıijklmnoöprsştuüvyz")
	upper := []rune("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ")
	for i, l := range lower {
		u := upper[i]
		// Mapping a letter to the case it already has must be a no-op.
		if got := TurkishCase.ToLower(l); got != l {
			t.Errorf("lower(U+%04X) is U+%04X not U+%04X", l, got, l)
		}
		if got := TurkishCase.ToUpper(u); got != u {
			t.Errorf("upper(U+%04X) is U+%04X not U+%04X", u, got, u)
		}
		// Cross-case mappings must follow the aligned tables.
		if got := TurkishCase.ToUpper(l); got != u {
			t.Errorf("upper(U+%04X) is U+%04X not U+%04X", l, got, u)
		}
		// Report the value that was actually computed from u; the message
		// previously printed ToLower(l) here.
		if got := TurkishCase.ToLower(u); got != l {
			t.Errorf("lower(U+%04X) is U+%04X not U+%04X", u, got, l)
		}
		if got := TurkishCase.ToTitle(u); got != u {
			t.Errorf("title(U+%04X) is U+%04X not U+%04X", u, got, u)
		}
		if got := TurkishCase.ToTitle(l); got != u {
			t.Errorf("title(U+%04X) is U+%04X not U+%04X", l, got, u)
		}
	}
}
// simpleFoldTests lists SimpleFold orbits. Each string is one complete cycle:
// applying SimpleFold to a rune yields the next rune of the string, and the
// last rune folds back to the first.
//
// Orbit members that look identical to other letters are written as escapes
// so that they survive copy/paste and Unicode normalization.
var simpleFoldTests = []string{
	// SimpleFold(x) returns the next equivalent rune > x or wraps
	// around to smaller values.
	// Easy cases.
	"Aa",
	"δΔ",
	// ASCII special cases.
	"Kk\u212A", // K, k, KELVIN SIGN
	"Ssſ",
	// Non-ASCII special cases.
	"ρϱΡ",
	// COMBINING GREEK YPOGEGRAMMENI, CAPITAL IOTA, SMALL IOTA, GREEK PROSGEGRAMMENI.
	"\u0345\u0399\u03B9\u1FBE",
	// Extra special cases: has lower/upper but no case fold.
	"İ",
	"ı",
	// Upper comes before lower (Cherokee).
	"\u13b0\uab80",
}
// TestSimpleFold walks every orbit in simpleFoldTests and checks that
// SimpleFold steps from each rune to the next, starting from the last rune
// so that the wrap-around step is covered too. It also checks that an
// invalid (negative) rune is returned unchanged.
func TestSimpleFold(t *testing.T) {
	for _, tt := range simpleFoldTests {
		cycle := []rune(tt)
		r := cycle[len(cycle)-1]
		for _, out := range cycle {
			// Use a distinct name for the result: shadowing r here made the
			// failure message print the result in place of the input.
			if got := SimpleFold(r); got != out {
				t.Errorf("SimpleFold(%#U) = %#U, want %#U", r, got, out)
			}
			r = out
		}
	}
	if r := SimpleFold(-42); r != -42 {
		t.Errorf("SimpleFold(-42) = %v, want -42", r)
	}
}
// Running 'go test -calibrate' runs the calibration to find a plausible
// cutoff point for linear search of a range list vs. binary search.
// We create a fake table and then time how long it takes to do a
// sequence of searches within that table, for all possible inputs
// relative to the ranges (something before all, in each, between each, after all).
// This assumes that all possible runes are equally likely.
// In practice most runes are ASCII so this is a conservative estimate
// of an effective cutoff value. In practice we could probably set it higher
// than what this function recommends.
var calibrate = flag.Bool("calibrate", false, "compute crossover for linear vs. binary search")
// TestCalibrate does nothing unless -calibrate is set. When enabled it
// prints, for each table size probed, the ns/op of the linear and binary
// searches and finally the smallest size at which binary search wins by
// more than 10%. It never fails the test; the output is informational.
func TestCalibrate(t *testing.T) {
if !*calibrate {
return
}
// NOTE(review): the warning is printed only when GOARCH is amd64; the
// reason for singling out that architecture is not visible here.
if runtime.GOARCH == "amd64" {
fmt.Printf("warning: running calibration on %s\n", runtime.GOARCH)
}
// Find the point where binary search wins by more than 10%.
// The 10% bias gives linear search an edge when they're close,
// because on predominantly ASCII inputs linear search is even
// better than our benchmarks measure.
// sort.Search probes table sizes in [0, 64) and treats the callback as a
// monotone predicate: false for small tables, true from the cutoff on.
n := sort.Search(64, func(n int) bool {
tab := fakeTable(n)
blinear := func(b *testing.B) {
// Local copy of the captured table for the timed loop.
tab := tab
// fakeTable(n) covers values from 10 up to n*5+7, so probing
// 0..n*5+20 hits values before, inside, between and after the ranges.
max := n*5 + 20
for i := 0; i < b.N; i++ {
for j := 0; j <= max; j++ {
linear(tab, uint16(j))
}
}
}
bbinary := func(b *testing.B) {
// Same probe set as blinear so the two timings are comparable.
tab := tab
max := n*5 + 20
for i := 0; i < b.N; i++ {
for j := 0; j <= max; j++ {
binary(tab, uint16(j))
}
}
}
bmlinear := testing.Benchmark(blinear)
bmbinary := testing.Benchmark(bbinary)
fmt.Printf("n=%d: linear=%d binary=%d\n", n, bmlinear.NsPerOp(), bmbinary.NsPerOp())
// True when linear takes more than 110% of binary's time.
return bmlinear.NsPerOp()*100 > bmbinary.NsPerOp()*110
})
fmt.Printf("calibration: linear cutoff = %d\n", n)
}
// fakeTable builds n synthetic ranges for the calibration run. Range i
// covers the three values i*5+10 through i*5+12 with stride 1, which leaves
// a two-value gap between consecutive ranges. It returns nil when n <= 0.
func fakeTable(n int) []Range16 {
	var table []Range16
	for lo := 10; len(table) < n; lo += 5 {
		table = append(table, Range16{Lo: uint16(lo), Hi: uint16(lo + 2), Stride: 1})
	}
	return table
}
// linear reports whether r is a member of one of ranges, scanning them front
// to back. The scan gives up at the first range whose Lo exceeds r, so it
// assumes the ranges are sorted in increasing order. Within a range, r is a
// member only if it lies on the range's stride.
func linear(ranges []Range16, r uint16) bool {
	for i := 0; i < len(ranges); i++ {
		cur := &ranges[i]
		switch {
		case r < cur.Lo:
			return false
		case r <= cur.Hi:
			return (r-cur.Lo)%cur.Stride == 0
		}
	}
	return false
}
// binary reports whether r is a member of one of ranges, using binary search
// over the half-open window [lo, hi). Within the range that contains r,
// membership additionally requires r to lie on the range's stride.
func binary(ranges []Range16, r uint16) bool {
	lo, hi := 0, len(ranges)
	for lo < hi {
		// Unsigned arithmetic keeps the midpoint correct even if lo+hi overflows int.
		mid := int(uint(lo+hi) >> 1)
		cur := &ranges[mid]
		switch {
		case r < cur.Lo:
			hi = mid
		case r > cur.Hi:
			lo = mid + 1
		default:
			return (r-cur.Lo)%cur.Stride == 0
		}
	}
	return false
}
// TestLatinOffset checks, for every table in the exported table maps, that
// LatinOffset equals the number of leading R16 entries whose Hi is at most
// MaxLatin1.
func TestLatinOffset(t *testing.T) {
	tableSets := []map[string]*RangeTable{
		Categories,
		FoldCategory,
		FoldScript,
		Properties,
		Scripts,
	}
	for _, set := range tableSets {
		for name, tab := range set {
			want := 0
			for _, r16 := range tab.R16 {
				if r16.Hi > MaxLatin1 {
					break
				}
				want++
			}
			if tab.LatinOffset != want {
				t.Errorf("%s: LatinOffset=%d, want %d", name, tab.LatinOffset, want)
			}
		}
	}
}
// TestSpecialCaseNoMapping covers issue 25636: a SpecialCase entry whose
// deltas are all zero means "leave this rune alone", so lowering "ABC" with
// such an entry for 'A' must keep the 'A' and lower only the rest.
func TestSpecialCaseNoMapping(t *testing.T) {
	// Zero delta for upper, lower and title case: no change for rune 'A'.
	keepCapitalA := CaseRange{Lo: 'A', Hi: 'A', Delta: [MaxCase]rune{}}
	special := SpecialCase{keepCapitalA}

	const input, want = "ABC", "Abc"
	if got := strings.ToLowerSpecial(special, input); got != want {
		t.Errorf("got %q; want %q", got, want)
	}
}
// TestNegativeRune covers issue 43254: negative runes must never be
// classified as belonging to any character category.
//
// Each tested rune is base - 1<<31 for some valid code point base. The
// result is negative, yet its low 8 and 16 bits equal those of base, so a
// lookup that truncated the rune would mistake it for base. Because the
// package has Latin-1-specific fast paths, every Latin-1 code point
// (0 through MaxLatin1 inclusive) is used as a base, followed by one
// representative non-Latin-1 code point per category.
func TestNegativeRune(t *testing.T) {
	nonLatin1 := []uint32{
		// Lu: LATIN CAPITAL LETTER A WITH MACRON
		0x0100,
		// Ll: LATIN SMALL LETTER A WITH MACRON
		0x0101,
		// Lt: LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
		0x01C5,
		// M: COMBINING GRAVE ACCENT
		0x0300,
		// Nd: ARABIC-INDIC DIGIT ZERO
		0x0660,
		// P: GREEK QUESTION MARK
		0x037E,
		// S: MODIFIER LETTER LEFT ARROWHEAD
		0x02C2,
		// Z: OGHAM SPACE MARK
		0x1680,
	}
	predicates := []struct {
		name string
		fn   func(rune) bool
	}{
		{"IsControl", IsControl},
		{"IsDigit", IsDigit},
		{"IsGraphic", IsGraphic},
		{"IsLetter", IsLetter},
		{"IsLower", IsLower},
		{"IsMark", IsMark},
		{"IsNumber", IsNumber},
		{"IsPrint", IsPrint},
		{"IsPunct", IsPunct},
		{"IsSpace", IsSpace},
		{"IsSymbol", IsSymbol},
		{"IsTitle", IsTitle},
		{"IsUpper", IsUpper},
	}

	// The inclusive bound matters: the earlier index arithmetic stopped one
	// short and never exercised U+00FF.
	bases := make([]uint32, 0, MaxLatin1+1+len(nonLatin1))
	for b := uint32(0); b <= MaxLatin1; b++ {
		bases = append(bases, b)
	}
	bases = append(bases, nonLatin1...)

	for _, base := range bases {
		// Note r is negative, but uint8(r) == uint8(base) and
		// uint16(r) == uint16(base).
		r := rune(base - 1<<31)
		if Is(Letter, r) {
			t.Errorf("Is(Letter, 0x%x - 1<<31) = true, want false", base)
		}
		for _, p := range predicates {
			if p.fn(r) {
				t.Errorf("%s(0x%x - 1<<31) = true, want false", p.name, base)
			}
		}
	}
}

131
src/unicode/script_test.go Normal file
View File

@@ -0,0 +1,131 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode_test
import (
"testing"
. "unicode"
)
// T is one table-membership expectation: rune is expected to be a member of
// the table registered under the name script. The tests in this file use the
// script field for category and property names as well.
type T struct {
rune rune // code point under test
script string // key into the table map being tested
}
// inCategoryTest gives, for each key of Categories, one rune that must be a
// member of that category. TestCategories reports any category without an
// entry here.
var inCategoryTest = []T{
{0x0081, "Cc"},
{0x200B, "Cf"},
{0xf0000, "Co"},
{0xdb80, "Cs"},
{0x0236, "Ll"},
{0x1d9d, "Lm"},
{0x07cf, "Lo"},
{0x1f8a, "Lt"},
{0x03ff, "Lu"},
{0x0bc1, "Mc"},
{0x20df, "Me"},
{0x07f0, "Mn"},
{0x1bb2, "Nd"},
{0x10147, "Nl"},
{0x2478, "No"},
{0xfe33, "Pc"},
{0x2011, "Pd"},
{0x301e, "Pe"},
{0x2e03, "Pf"},
{0x2e02, "Pi"},
{0x0022, "Po"},
{0x2770, "Ps"},
{0x00a4, "Sc"},
{0xa711, "Sk"},
{0x25f9, "Sm"},
{0x2108, "So"},
{0x2028, "Zl"},
{0x2029, "Zp"},
{0x202f, "Zs"},
// Unifieds.
// One-letter names: each is looked up in Categories like the two-letter ones.
{0x04aa, "L"},
{0x0009, "C"},
{0x1712, "M"},
{0x0031, "N"},
{0x00bb, "P"},
{0x00a2, "S"},
{0x00a0, "Z"},
}
// inPropTest gives, for each key of Properties, one rune that must be a
// member of that property's table. TestProperties reports any property
// without an entry here.
var inPropTest = []T{
{0x0046, "ASCII_Hex_Digit"},
{0x200F, "Bidi_Control"},
{0x2212, "Dash"},
{0xE0001, "Deprecated"},
{0x00B7, "Diacritic"},
{0x30FE, "Extender"},
{0xFF46, "Hex_Digit"},
{0x2E17, "Hyphen"},
{0x2FFB, "IDS_Binary_Operator"},
{0x2FF3, "IDS_Trinary_Operator"},
{0xFA6A, "Ideographic"},
{0x200D, "Join_Control"},
{0x0EC4, "Logical_Order_Exception"},
{0x2FFFF, "Noncharacter_Code_Point"},
{0x065E, "Other_Alphabetic"},
{0x2065, "Other_Default_Ignorable_Code_Point"},
{0x0BD7, "Other_Grapheme_Extend"},
{0x0387, "Other_ID_Continue"},
{0x212E, "Other_ID_Start"},
{0x2094, "Other_Lowercase"},
{0x2040, "Other_Math"},
{0x216F, "Other_Uppercase"},
{0x0027, "Pattern_Syntax"},
{0x0020, "Pattern_White_Space"},
{0x06DD, "Prepended_Concatenation_Mark"},
{0x300D, "Quotation_Mark"},
{0x2EF3, "Radical"},
{0x1f1ff, "Regional_Indicator"},
{0x061F, "STerm"}, // Deprecated alias of Sentence_Terminal
{0x061F, "Sentence_Terminal"},
{0x2071, "Soft_Dotted"},
{0x003A, "Terminal_Punctuation"},
{0x9FC3, "Unified_Ideograph"},
{0xFE0F, "Variation_Selector"},
{0x0020, "White_Space"},
}
// TestCategories checks every expectation in inCategoryTest against the
// Categories map and then reports each category that no expectation
// covered. An expectation naming an unknown category aborts the test.
func TestCategories(t *testing.T) {
	untested := make(map[string]bool, len(Categories))
	for name := range Categories {
		untested[name] = true
	}
	for _, tc := range inCategoryTest {
		table, known := Categories[tc.script]
		if !known {
			t.Fatal(tc.script, "not a known category")
		}
		if !Is(table, tc.rune) {
			t.Errorf("IsCategory(%U, %s) = false, want true", tc.rune, tc.script)
		}
		delete(untested, tc.script)
	}
	for name := range untested {
		t.Error("category not tested:", name)
	}
}
// TestProperties checks every expectation in inPropTest against the
// Properties map and then reports each property that no expectation
// covered. An expectation naming an unknown property aborts the test.
func TestProperties(t *testing.T) {
	notTested := make(map[string]bool)
	for k := range Properties {
		notTested[k] = true
	}
	for _, test := range inPropTest {
		if _, ok := Properties[test.script]; !ok {
			t.Fatal(test.script, "not a known prop")
		}
		if !Is(Properties[test.script], test.rune) {
			// Name the property lookup in the message; it used to say
			// "IsCategory", copied from TestCategories.
			t.Errorf("IsProperty(%U, %s) = false, want true", test.rune, test.script)
		}
		delete(notTested, test.script)
	}
	for k := range notTested {
		t.Error("property not tested:", k)
	}
}

8378
src/unicode/tables.go Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,14 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package utf16
// Extra names for constants so we can validate them during testing.
// Each exported name is an alias of the package-internal constant with the
// same name in lower camel case.
const (
Surr1 = surr1
Surr3 = surr3
SurrSelf = surrSelf
MaxRune = maxRune
ReplacementChar = replacementChar
)

144
src/unicode/utf16/utf16.go Normal file
View File

@@ -0,0 +1,144 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package utf16 implements encoding and decoding of UTF-16 sequences.
package utf16
// The conditions replacementChar==unicode.ReplacementChar and
// maxRune==unicode.MaxRune are verified in the tests.
// Defining them locally avoids this package depending on package unicode.
//
// replacementChar is the value this package's encoders and decoders
// substitute for input they cannot represent.
const (
replacementChar = '\uFFFD' // Unicode replacement character
maxRune = '\U0010FFFF' // Maximum valid Unicode code point.
)
const (
// 0xd800-0xdc00 encodes the high 10 bits of a pair.
// 0xdc00-0xe000 encodes the low 10 bits of a pair.
// the value is those 20 bits plus 0x10000.
// Both intervals are half-open: the upper bound is excluded.
surr1 = 0xd800 // first value carrying the high 10 bits
surr2 = 0xdc00 // first value carrying the low 10 bits
surr3 = 0xe000 // first value past the surrogate range
surrSelf = 0x10000 // first code point that is encoded as a pair
)
// IsSurrogate reports whether the specified Unicode code point
// can appear in a surrogate pair, that is, whether it lies in the
// half-open interval [surr1, surr3).
func IsSurrogate(r rune) bool {
	if r < surr1 {
		return false
	}
	return r < surr3
}
// DecodeRune returns the UTF-16 decoding of a surrogate pair.
// r1 must carry the high 10 bits and r2 the low 10 bits. If the pair is not
// a valid UTF-16 surrogate pair, DecodeRune returns the Unicode replacement
// code point U+FFFD.
func DecodeRune(r1, r2 rune) rune {
	isHigh := surr1 <= r1 && r1 < surr2
	isLow := surr2 <= r2 && r2 < surr3
	if !isHigh || !isLow {
		return replacementChar
	}
	// Reassemble the 20 payload bits, then add back the 0x10000 offset.
	hi, lo := r1-surr1, r2-surr2
	return (hi<<10 | lo) + surrSelf
}
// EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune.
// If the rune is not a valid Unicode code point or does not need encoding
// (it is below 0x10000), EncodeRune returns U+FFFD, U+FFFD.
func EncodeRune(r rune) (r1, r2 rune) {
	if surrSelf <= r && r <= maxRune {
		// Split the 20-bit offset into two 10-bit halves.
		offset := r - surrSelf
		r1 = surr1 + (offset>>10)&0x3ff
		r2 = surr2 + offset&0x3ff
		return r1, r2
	}
	return replacementChar, replacementChar
}
// RuneLen returns the number of 16-bit words in the UTF-16 encoding of the rune.
// It returns -1 if the rune is not a valid value to encode in UTF-16.
func RuneLen(r rune) int {
switch {
case 0 <= r && r < surr1, surr3 <= r && r < surrSelf:
return 1
case surrSelf <= r && r <= maxRune:
return 2
default:
return -1
}
}
// Encode returns the UTF-16 encoding of the Unicode code point sequence s.
// Runes that RuneLen rejects are encoded as U+FFFD.
func Encode(s []rune) []uint16 {
	// Reserve one word per rune plus one extra for every rune at or above
	// surrSelf. Values above maxRune are counted too, so the buffer may be
	// slightly larger than needed; the final slice trims it.
	size := len(s)
	for _, r := range s {
		if r >= surrSelf {
			size++
		}
	}

	out := make([]uint16, size)
	pos := 0
	for _, r := range s {
		width := RuneLen(r)
		if width == 2 {
			// needs surrogate sequence
			hi, lo := EncodeRune(r)
			out[pos], out[pos+1] = uint16(hi), uint16(lo)
			pos += 2
			continue
		}
		unit := uint16(replacementChar)
		if width == 1 {
			// normal rune
			unit = uint16(r)
		}
		out[pos] = unit
		pos++
	}
	return out[:pos]
}
// AppendRune appends the UTF-16 encoding of the Unicode code point r
// to the end of a and returns the extended buffer. If the rune is not
// a valid Unicode code point, it appends the encoding of U+FFFD.
func AppendRune(a []uint16, r rune) []uint16 {
// This function is inlineable for fast handling of ASCII.
switch {
case 0 <= r && r < surr1, surr3 <= r && r < surrSelf:
// normal rune
return append(a, uint16(r))
case surrSelf <= r && r <= maxRune:
// needs surrogate sequence
r1, r2 := EncodeRune(r)
return append(a, uint16(r1), uint16(r2))
}
// Everything else lands here: negative values, values in [surr1, surr3)
// and values above maxRune.
return append(a, replacementChar)
}
// Decode returns the Unicode code point sequence represented
// by the UTF-16 encoding s.
// The actual work is done by decode; this wrapper only supplies the
// initial buffer.
func Decode(s []uint16) []rune {
// Preallocate capacity to hold up to 64 runes.
// Decode inlines, so the allocation can live on the stack.
buf := make([]rune, 0, 64)
return decode(s, buf)
}
// decode appends to buf the Unicode code point sequence represented
// by the UTF-16 encoding s and returns the extended buffer.
// A surrogate that is not part of a valid high/low pair is decoded as
// U+FFFD and consumes a single input word.
func decode(s []uint16, buf []rune) []rune {
	for i := 0; i < len(s); i++ {
		unit := s[i]
		if unit < surr1 || surr3 <= unit {
			// normal rune
			buf = append(buf, rune(unit))
			continue
		}
		// unit is a surrogate from here on.
		if unit < surr2 && i+1 < len(s) && surr2 <= s[i+1] && s[i+1] < surr3 {
			// valid surrogate sequence: consume both words
			buf = append(buf, DecodeRune(rune(unit), rune(s[i+1])))
			i++
			continue
		}
		// invalid surrogate sequence
		buf = append(buf, replacementChar)
	}
	return buf
}

View File

@@ -0,0 +1,273 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package utf16_test
import (
"internal/testenv"
"reflect"
"testing"
"unicode"
. "unicode/utf16"
)
// TestConstants verifies that the constants utf16 redefines locally still
// match their originals in package unicode.
func TestConstants(t *testing.T) {
	const (
		gotMax, wantMax   = MaxRune, unicode.MaxRune
		gotRepl, wantRepl = ReplacementChar, unicode.ReplacementChar
	)
	if gotMax != wantMax {
		t.Errorf("utf16.maxRune is wrong: %x should be %x", gotMax, wantMax)
	}
	if gotRepl != wantRepl {
		t.Errorf("utf16.replacementChar is wrong: %x should be %x", gotRepl, wantRepl)
	}
}
// TestRuneLen checks RuneLen at the edges of each interval it
// distinguishes: one word, two words, and not encodable.
func TestRuneLen(t *testing.T) {
	type runeLenCase struct {
		r      rune
		length int
	}
	cases := []runeLenCase{
		{0, 1},
		{Surr1 - 1, 1},
		{Surr3, 1},
		{SurrSelf - 1, 1},
		{SurrSelf, 2},
		{MaxRune, 2},
		{MaxRune + 1, -1},
		{-1, -1},
	}
	for _, tc := range cases {
		got := RuneLen(tc.r)
		if got == tc.length {
			continue
		}
		t.Errorf("RuneLen(%#U) = %d, want %d", tc.r, got, tc.length)
	}
}
// encodeTest is one encoding expectation: encoding in must produce out.
type encodeTest struct {
in []rune
out []uint16
}
// encodeTests is shared by TestEncode, TestAppendRune and TestEncodeRune.
var encodeTests = []encodeTest{
// Values that fit in one word each.
{[]rune{1, 2, 3, 4}, []uint16{1, 2, 3, 4}},
// The last one-word value followed by values that need surrogate pairs.
{[]rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff},
[]uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff}},
// Surrogates, a value above the maximum and a negative value all become 0xfffd.
{[]rune{'a', 'b', 0xd7ff, 0xd800, 0xdfff, 0xe000, 0x110000, -1},
[]uint16{'a', 'b', 0xd7ff, 0xfffd, 0xfffd, 0xe000, 0xfffd, 0xfffd}},
}
// TestEncode checks Encode against every entry of encodeTests.
func TestEncode(t *testing.T) {
	for _, tc := range encodeTests {
		got := Encode(tc.in)
		if reflect.DeepEqual(got, tc.out) {
			continue
		}
		t.Errorf("Encode(%x) = %x; want %x", tc.in, got, tc.out)
	}
}
// TestAppendRune builds each expected encoding of encodeTests one rune at a
// time with AppendRune, starting from a nil slice.
func TestAppendRune(t *testing.T) {
	for _, tc := range encodeTests {
		var got []uint16
		for _, r := range tc.in {
			got = AppendRune(got, r)
		}
		if reflect.DeepEqual(got, tc.out) {
			continue
		}
		t.Errorf("AppendRune(%x) = %x; want %x", tc.in, got, tc.out)
	}
}
// TestEncodeRune replays encodeTests through EncodeRune, tracking how many
// words of the expected output each input rune accounts for. Runes that do
// not need a pair account for one word and must yield two replacement
// characters. The others account for two words, must match them, and must
// round-trip through DecodeRune.
func TestEncodeRune(t *testing.T) {
	for idx, tc := range encodeTests {
		pos := 0
		for _, r := range tc.in {
			hi, lo := EncodeRune(r)
			if 0x10000 <= r && r <= unicode.MaxRune {
				if pos+1 >= len(tc.out) {
					t.Errorf("#%d: ran out of tt.out", idx)
					break
				}
				if hi != rune(tc.out[pos]) || lo != rune(tc.out[pos+1]) {
					t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, hi, lo, tc.out[pos], tc.out[pos+1])
				}
				pos += 2
				if back := DecodeRune(hi, lo); back != r {
					t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", hi, lo, back, r)
				}
				continue
			}
			if pos >= len(tc.out) {
				t.Errorf("#%d: ran out of tt.out", idx)
				break
			}
			if hi != unicode.ReplacementChar || lo != unicode.ReplacementChar {
				t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, hi, lo)
			}
			pos++
		}
		if pos != len(tc.out) {
			t.Errorf("#%d: EncodeRune didn't generate enough output", idx)
		}
	}
}
// decodeTest is one decoding expectation: decoding in must produce out.
type decodeTest struct {
in []uint16
out []rune
}
// decodeTests is shared by TestDecode and TestAllocationsDecode.
var decodeTests = []decodeTest{
{[]uint16{1, 2, 3, 4}, []rune{1, 2, 3, 4}},
{[]uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff},
[]rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff}},
// A high surrogate not followed by a low one decodes to 0xfffd.
{[]uint16{0xd800, 'a'}, []rune{0xfffd, 'a'}},
// A lone low surrogate decodes to 0xfffd.
{[]uint16{0xdfff}, []rune{0xfffd}},
}
// TestAllocationsDecode checks that Decode performs no heap allocation for
// any input in decodeTests. It is skipped when compiler optimizations are
// off.
func TestAllocationsDecode(t *testing.T) {
testenv.SkipIfOptimizationOff(t)
for _, tt := range decodeTests {
// Average allocation count over 10 runs of the closure.
allocs := testing.AllocsPerRun(10, func() {
out := Decode(tt.in)
if out == nil {
t.Errorf("Decode(%x) = nil", tt.in)
}
})
if allocs > 0 {
t.Errorf("Decode allocated %v times", allocs)
}
}
}
// TestDecode checks Decode against every entry of decodeTests.
func TestDecode(t *testing.T) {
	for _, tc := range decodeTests {
		got := Decode(tc.in)
		if reflect.DeepEqual(got, tc.out) {
			continue
		}
		t.Errorf("Decode(%x) = %x; want %x", tc.in, got, tc.out)
	}
}
// decodeRuneTests lists surrogate pairs (r1, r2) and the rune DecodeRune
// must return for each.
var decodeRuneTests = []struct {
r1, r2 rune
want rune
}{
{0xd800, 0xdc00, 0x10000},
{0xd800, 0xdc01, 0x10001},
{0xd808, 0xdf45, 0x12345},
{0xdbff, 0xdfff, 0x10ffff},
{0xd800, 'a', 0xfffd}, // illegal, replacement rune substituted
}
// TestDecodeRune checks DecodeRune against every entry of decodeRuneTests.
func TestDecodeRune(t *testing.T) {
	for idx, tc := range decodeRuneTests {
		if got := DecodeRune(tc.r1, tc.r2); got != tc.want {
			t.Errorf("%d: DecodeRune(%q, %q) = %v; want %v", idx, tc.r1, tc.r2, got, tc.want)
		}
	}
}
// surrogateTests lists runes and the answer IsSurrogate must give for each.
// The last five entries sit on the boundaries of the surrogate range.
var surrogateTests = []struct {
r rune
want bool
}{
// from https://en.wikipedia.org/wiki/UTF-16
{'\u007A', false}, // LATIN SMALL LETTER Z
{'\u6C34', false}, // CJK UNIFIED IDEOGRAPH-6C34 (water)
{'\uFEFF', false}, // Byte Order Mark
{'\U00010000', false}, // LINEAR B SYLLABLE B008 A (first non-BMP code point)
{'\U0001D11E', false}, // MUSICAL SYMBOL G CLEF
{'\U0010FFFD', false}, // PRIVATE USE CHARACTER-10FFFD (last Unicode code point)
{rune(0xd7ff), false}, // surr1-1
{rune(0xd800), true}, // surr1
{rune(0xdc00), true}, // surr2
{rune(0xe000), false}, // surr3
{rune(0xdfff), true}, // surr3-1
}
// TestIsSurrogate checks IsSurrogate against every entry of surrogateTests.
func TestIsSurrogate(t *testing.T) {
	for idx, tc := range surrogateTests {
		if got := IsSurrogate(tc.r); got != tc.want {
			t.Errorf("%d: IsSurrogate(%q) = %v; want %v", idx, tc.r, got, tc.want)
		}
	}
}
// BenchmarkDecodeValidASCII measures Decode on input in which every word is
// an ASCII character.
func BenchmarkDecodeValidASCII(b *testing.B) {
	// "hello world"
	input := []uint16{'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'}
	for n := 0; n < b.N; n++ {
		Decode(input)
	}
}
// BenchmarkDecodeValidJapaneseChars measures Decode on non-ASCII input in
// which every character still fits in a single word.
func BenchmarkDecodeValidJapaneseChars(b *testing.B) {
	// "日本語日本語日本語"
	input := []uint16{'日', '本', '語', '日', '本', '語', '日', '本', '語'}
	for n := 0; n < b.N; n++ {
		Decode(input)
	}
}
// BenchmarkDecodeRune measures DecodeRune on five valid surrogate pairs.
// The pairs are produced with EncodeRune before the timer is reset.
func BenchmarkDecodeRune(b *testing.B) {
	// U+1D4D0 to U+1D4D4: MATHEMATICAL BOLD SCRIPT CAPITAL LETTERS
	letters := []rune{'𝓐', '𝓑', '𝓒', '𝓓', '𝓔'}
	pairs := make([]rune, 0, 10)
	for _, u := range letters {
		hi, lo := EncodeRune(u)
		pairs = append(pairs, hi, lo)
	}
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		for k := 0; k < 5; k++ {
			DecodeRune(pairs[2*k], pairs[2*k+1])
		}
	}
}
// BenchmarkEncodeValidASCII measures Encode on ASCII-only input.
func BenchmarkEncodeValidASCII(b *testing.B) {
	input := []rune("hello")
	for n := 0; n < b.N; n++ {
		Encode(input)
	}
}
// BenchmarkEncodeValidJapaneseChars measures Encode on non-ASCII input in
// which every rune still fits in a single word.
func BenchmarkEncodeValidJapaneseChars(b *testing.B) {
	input := []rune("日本語")
	for n := 0; n < b.N; n++ {
		Encode(input)
	}
}
// BenchmarkAppendRuneValidASCII measures AppendRune on ASCII runes. The
// output buffer is allocated once and truncated after every pass so that
// appends never need to grow it.
func BenchmarkAppendRuneValidASCII(b *testing.B) {
	input := []rune("hello")
	out := make([]uint16, 0, 2*len(input))
	for n := 0; n < b.N; n++ {
		for _, r := range input {
			out = AppendRune(out, r)
		}
		out = out[:0]
	}
}
// BenchmarkAppendRuneValidJapaneseChars measures AppendRune on non-ASCII
// runes that each fit in a single word. The output buffer is allocated once
// and truncated after every pass.
func BenchmarkAppendRuneValidJapaneseChars(b *testing.B) {
	input := []rune("日本語")
	out := make([]uint16, 0, 2*len(input))
	for n := 0; n < b.N; n++ {
		for _, r := range input {
			out = AppendRune(out, r)
		}
		out = out[:0]
	}
}
// BenchmarkEncodeRune measures EncodeRune on five runes that each need a
// surrogate pair.
func BenchmarkEncodeRune(b *testing.B) {
	for n := 0; n < b.N; n++ {
		for _, letter := range []rune{'𝓐', '𝓑', '𝓒', '𝓓', '𝓔'} {
			EncodeRune(letter)
		}
	}
}

View File

@@ -0,0 +1,226 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package utf8_test
import (
"fmt"
"unicode/utf8"
)
// ExampleDecodeLastRune walks a byte slice from its end to its start,
// printing each rune together with its encoded width.
func ExampleDecodeLastRune() {
	text := []byte("Hello, 世界")
	for end := len(text); end > 0; {
		r, width := utf8.DecodeLastRune(text[:end])
		fmt.Printf("%c %v\n", r, width)
		end -= width
	}
	// Output:
	// 界 3
	// 世 3
	//   1
	// , 1
	// o 1
	// l 1
	// l 1
	// e 1
	// H 1
}
// ExampleDecodeLastRuneInString walks a string from its end to its start,
// printing each rune together with its encoded width.
func ExampleDecodeLastRuneInString() {
	const text = "Hello, 世界"
	for end := len(text); end > 0; {
		r, width := utf8.DecodeLastRuneInString(text[:end])
		fmt.Printf("%c %v\n", r, width)
		end -= width
	}
	// Output:
	// 界 3
	// 世 3
	//   1
	// , 1
	// o 1
	// l 1
	// l 1
	// e 1
	// H 1
}
// ExampleDecodeRune walks a byte slice from start to end, printing each
// rune together with its encoded width.
func ExampleDecodeRune() {
	text := []byte("Hello, 世界")
	for pos := 0; pos < len(text); {
		r, width := utf8.DecodeRune(text[pos:])
		fmt.Printf("%c %v\n", r, width)
		pos += width
	}
	// Output:
	// H 1
	// e 1
	// l 1
	// l 1
	// o 1
	// , 1
	//   1
	// 世 3
	// 界 3
}
// ExampleDecodeRuneInString walks a string from start to end, printing each
// rune together with its encoded width.
func ExampleDecodeRuneInString() {
	const text = "Hello, 世界"
	for pos := 0; pos < len(text); {
		r, width := utf8.DecodeRuneInString(text[pos:])
		fmt.Printf("%c %v\n", r, width)
		pos += width
	}
	// Output:
	// H 1
	// e 1
	// l 1
	// l 1
	// o 1
	// , 1
	//   1
	// 世 3
	// 界 3
}
// ExampleEncodeRune encodes a single three-byte rune into a buffer and
// prints the bytes and the number of bytes written.
func ExampleEncodeRune() {
	encoded := make([]byte, 3)
	written := utf8.EncodeRune(encoded, '世')
	fmt.Println(encoded)
	fmt.Println(written)
	// Output:
	// [228 184 150]
	// 3
}
// ExampleEncodeRune_outOfRange shows that out-of-range runes are encoded as
// utf8.RuneError: all three inputs below produce the same three bytes.
func ExampleEncodeRune_outOfRange() {
runes := []rune{
// Less than 0, out of range.
-1,
// Greater than 0x10FFFF, out of range.
0x110000,
// The Unicode replacement character.
utf8.RuneError,
}
for i, c := range runes {
buf := make([]byte, 3)
size := utf8.EncodeRune(buf, c)
fmt.Printf("%d: %d %[2]s %d\n", i, buf, size)
}
// Output:
// 0: [239 191 189] � 3
// 1: [239 191 189] � 3
// 2: [239 191 189] � 3
}
// ExampleFullRune contrasts a complete three-byte encoding with the same
// encoding cut off after two bytes.
func ExampleFullRune() {
	complete := []byte{228, 184, 150} // 世
	truncated := complete[:2]
	fmt.Println(utf8.FullRune(complete))
	fmt.Println(utf8.FullRune(truncated))
	// Output:
	// true
	// false
}
// ExampleFullRuneInString contrasts a complete three-byte encoding with the
// same encoding cut off after two bytes.
func ExampleFullRuneInString() {
	const complete = "世"
	truncated := complete[:2]
	fmt.Println(utf8.FullRuneInString(complete))
	fmt.Println(utf8.FullRuneInString(truncated))
	// Output:
	// true
	// false
}
// ExampleRuneCount shows that the byte length and the rune count of a slice
// differ once it contains multi-byte characters.
func ExampleRuneCount() {
	text := []byte("Hello, 世界")
	byteCount, runeCount := len(text), utf8.RuneCount(text)
	fmt.Println("bytes =", byteCount)
	fmt.Println("runes =", runeCount)
	// Output:
	// bytes = 13
	// runes = 9
}
// ExampleRuneCountInString shows that the byte length and the rune count of
// a string differ once it contains multi-byte characters.
func ExampleRuneCountInString() {
	text := "Hello, 世界"
	byteCount, runeCount := len(text), utf8.RuneCountInString(text)
	fmt.Println("bytes =", byteCount)
	fmt.Println("runes =", runeCount)
	// Output:
	// bytes = 13
	// runes = 9
}
// ExampleRuneLen prints the encoded length of a one-byte and of a
// three-byte rune.
func ExampleRuneLen() {
	for _, r := range []rune{'a', '界'} {
		fmt.Println(utf8.RuneLen(r))
	}
	// Output:
	// 1
	// 3
}
// ExampleRuneStart inspects the first three bytes of "a界": the ASCII byte,
// the first byte of the multi-byte rune, and the byte that follows it.
func ExampleRuneStart() {
	text := []byte("a界")
	for _, c := range text[:3] {
		fmt.Println(utf8.RuneStart(c))
	}
	// Output:
	// true
	// true
	// false
}
// ExampleValid validates one well-formed and one malformed byte slice.
func ExampleValid() {
	wellFormed := []byte("Hello, 世界")
	malformed := []byte{0xff, 0xfe, 0xfd}
	for _, input := range [][]byte{wellFormed, malformed} {
		fmt.Println(utf8.Valid(input))
	}
	// Output:
	// true
	// false
}
// ExampleValidRune validates an ordinary rune and a value that is too large
// to be a code point.
func ExampleValidRune() {
	ordinary := 'a'
	tooLarge := rune(0xfffffff)
	for _, r := range []rune{ordinary, tooLarge} {
		fmt.Println(utf8.ValidRune(r))
	}
	// Output:
	// true
	// false
}
// ExampleValidString validates one well-formed and one malformed string.
func ExampleValidString() {
	wellFormed := "Hello, 世界"
	malformed := string([]byte{0xff, 0xfe, 0xfd})
	for _, input := range []string{wellFormed, malformed} {
		fmt.Println(utf8.ValidString(input))
	}
	// Output:
	// true
	// false
}
// ExampleAppendRune appends the same rune to a nil slice and to a slice
// that already holds data.
func ExampleAppendRune() {
	const r = 0x10000
	fromNil := utf8.AppendRune(nil, r)
	fromPrefix := utf8.AppendRune([]byte("init"), r)
	fmt.Println(string(fromNil))
	fmt.Println(string(fromPrefix))
	// Output:
	// 𐀀
	// init𐀀
}

583
src/unicode/utf8/utf8.go Normal file
View File

@@ -0,0 +1,583 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package utf8 implements functions and constants to support text encoded in
// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
// See https://en.wikipedia.org/wiki/UTF-8
package utf8
// The conditions RuneError==unicode.ReplacementChar and
// MaxRune==unicode.MaxRune are verified in the tests.
// Defining them locally avoids this package depending on package unicode.
// Numbers fundamental to the encoding.
// RuneError is also the rune the decoding functions in this package return
// for empty or invalid input.
const (
RuneError = '\uFFFD' // the "error" Rune or "Unicode replacement character"
RuneSelf = 0x80 // characters below RuneSelf are represented as themselves in a single byte.
MaxRune = '\U0010FFFF' // Maximum valid Unicode code point.
UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character.
)
// Code points in the surrogate range are not valid for UTF-8.
// Both bounds are inclusive; RuneLen returns -1 for any rune in between.
const (
surrogateMin = 0xD800
surrogateMax = 0xDFFF
)
const (
// NOTE(review): t1..t5 and tx look like the high-bit markers of lead and
// continuation bytes; their uses are outside this view, so confirm there.
t1 = 0b00000000
tx = 0b10000000
t2 = 0b11000000
t3 = 0b11100000
t4 = 0b11110000
t5 = 0b11111000
// maskx extracts the payload bits of a byte after the first; maskN
// extracts the payload bits of the first byte of an N-byte sequence.
maskx = 0b00111111
mask2 = 0b00011111
mask3 = 0b00001111
mask4 = 0b00000111
// runeNMax is the largest rune that RuneLen reports as N bytes long.
rune1Max = 1<<7 - 1
rune2Max = 1<<11 - 1
rune3Max = 1<<16 - 1
// The default lowest and highest continuation byte.
locb = 0b10000000
hicb = 0b10111111
// The names of these constants are chosen to give nice alignment in the
// table below. The first nibble is an index into acceptRanges or F for
// special one-byte cases. The second nibble is the Rune length or the
// Status for the special one-byte case.
xx = 0xF1 // invalid: size 1
as = 0xF0 // ASCII: size 1
s1 = 0x02 // accept 0, size 2
s2 = 0x13 // accept 1, size 3
s3 = 0x03 // accept 0, size 3
s4 = 0x23 // accept 2, size 3
s5 = 0x34 // accept 3, size 4
s6 = 0x04 // accept 0, size 4
s7 = 0x44 // accept 4, size 4
)
// first is information about the first byte in a UTF-8 sequence.
// It is indexed by the byte's value. For each entry x, the decoders read
// x&7 as the sequence length and x>>4 as the index into acceptRanges that
// bounds the second byte; entries >= as mark the one-byte cases (ASCII or
// invalid).
var first = [256]uint8{
// 1 2 3 4 5 6 7 8 9 A B C D E F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
// 1 2 3 4 5 6 7 8 9 A B C D E F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
// Both bounds are inclusive.
type acceptRange struct {
lo uint8 // lowest value for second byte.
hi uint8 // highest value for second byte.
}
// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
// It is indexed by the high nibble of an entry of first; only indexes 0
// through 4 are populated, the rest are zero values.
var acceptRanges = [16]acceptRange{
0: {locb, hicb},
1: {0xA0, hicb},
2: {locb, 0x9F},
3: {0x90, hicb},
4: {locb, 0x8F},
}
// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
// An empty p is never a full rune.
func FullRune(p []byte) bool {
	if len(p) == 0 {
		return false
	}
	info := first[p[0]]
	if len(p) >= int(info&7) {
		return true // ASCII, invalid or valid.
	}
	// Fewer bytes than the first byte announces: the prefix counts as
	// full only if one of the bytes present already makes it invalid.
	second := acceptRanges[info>>4]
	secondInvalid := len(p) > 1 && (p[1] < second.lo || second.hi < p[1])
	thirdInvalid := len(p) > 2 && (p[2] < locb || hicb < p[2])
	return secondInvalid || thirdInvalid
}
// FullRuneInString is like FullRune but its input is a string.
// An empty s is never a full rune.
func FullRuneInString(s string) bool {
	if len(s) == 0 {
		return false
	}
	info := first[s[0]]
	if len(s) >= int(info&7) {
		return true // ASCII, invalid, or valid.
	}
	// Fewer bytes than the first byte announces: the prefix counts as
	// full only if one of the bytes present already makes it invalid.
	second := acceptRanges[info>>4]
	secondInvalid := len(s) > 1 && (s[1] < second.lo || second.hi < s[1])
	thirdInvalid := len(s) > 2 && (s[2] < locb || hicb < s[2])
	return secondInvalid || thirdInvalid
}
// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
// its width in bytes. If p is empty it returns ([RuneError], 0). Otherwise, if
// the encoding is invalid, it returns (RuneError, 1). Both are impossible
// results for correct, non-empty UTF-8.
//
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
// out of range, or is not the shortest possible UTF-8 encoding for the
// value. No other validation is performed.
func DecodeRune(p []byte) (r rune, size int) {
n := len(p)
if n < 1 {
return RuneError, 0
}
p0 := p[0]
x := first[p0]
// Only the two one-byte markers, as (0xF0) and xx (0xF1), are >= as.
if x >= as {
// The following code simulates an additional check for x == xx and
// handling the ASCII and invalid cases accordingly. This mask-and-or
// approach prevents an additional branch.
// The low bit of x is 0 for as and 1 for xx; shifting it up to the sign
// bit and back down yields all zero bits (ASCII) or all one bits (invalid).
mask := rune(x) << 31 >> 31
return rune(p[0])&^mask | RuneError&mask, 1
}
// Multi-byte sequence: sz is the announced length, accept bounds byte two.
sz := int(x & 7)
accept := acceptRanges[x>>4]
if n < sz {
// Truncated input.
return RuneError, 1
}
b1 := p[1]
if b1 < accept.lo || accept.hi < b1 {
return RuneError, 1
}
if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
return rune(p0&mask2)<<6 | rune(b1&maskx), 2
}
// The third and fourth bytes only need to be in the default continuation range.
b2 := p[2]
if b2 < locb || hicb < b2 {
return RuneError, 1
}
if sz <= 3 {
return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
}
b3 := p[3]
if b3 < locb || hicb < b3 {
return RuneError, 1
}
return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
}
// DecodeRuneInString is like [DecodeRune] but its input is a string. If s is
// empty it returns ([RuneError], 0). Otherwise, if the encoding is invalid, it
// returns (RuneError, 1). Both are impossible results for correct, non-empty
// UTF-8.
//
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
// out of range, or is not the shortest possible UTF-8 encoding for the
// value. No other validation is performed.
func DecodeRuneInString(s string) (r rune, size int) {
n := len(s)
if n < 1 {
return RuneError, 0
}
s0 := s[0]
x := first[s0]
// Only the two one-byte markers, as (0xF0) and xx (0xF1), are >= as.
if x >= as {
// The following code simulates an additional check for x == xx and
// handling the ASCII and invalid cases accordingly. This mask-and-or
// approach prevents an additional branch.
// The low bit of x is 0 for as and 1 for xx; shifting it up to the sign
// bit and back down yields all zero bits (ASCII) or all one bits (invalid).
mask := rune(x) << 31 >> 31
return rune(s[0])&^mask | RuneError&mask, 1
}
// Multi-byte sequence: sz is the announced length, accept bounds byte two.
sz := int(x & 7)
accept := acceptRanges[x>>4]
if n < sz {
// Truncated input.
return RuneError, 1
}
s1 := s[1]
if s1 < accept.lo || accept.hi < s1 {
return RuneError, 1
}
if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
return rune(s0&mask2)<<6 | rune(s1&maskx), 2
}
// The third and fourth bytes only need to be in the default continuation range.
s2 := s[2]
if s2 < locb || hicb < s2 {
return RuneError, 1
}
if sz <= 3 {
return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
}
s3 := s[3]
if s3 < locb || hicb < s3 {
return RuneError, 1
}
return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
}
// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
// its width in bytes. If p is empty it returns ([RuneError], 0). Otherwise, if
// the encoding is invalid, it returns (RuneError, 1). Both are impossible
// results for correct, non-empty UTF-8.
//
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
// out of range, or is not the shortest possible UTF-8 encoding for the
// value. No other validation is performed.
func DecodeLastRune(p []byte) (r rune, size int) {
	end := len(p)
	if end == 0 {
		return RuneError, 0
	}
	if last := p[end-1]; last < RuneSelf {
		return rune(last), 1
	}
	// Look back at most UTFMax bytes for the start of the rune. The bound
	// guards against O(n^2) behavior when traversing backwards through
	// strings with long sequences of invalid UTF-8.
	lim := 0
	if end > UTFMax {
		lim = end - UTFMax
	}
	start := end - 2
	for start >= lim && !RuneStart(p[start]) {
		start--
	}
	if start < 0 {
		start = 0
	}
	// The candidate is accepted only if decoding it consumes exactly the
	// bytes up to end.
	r, size = DecodeRune(p[start:end])
	if start+size == end {
		return r, size
	}
	return RuneError, 1
}
// DecodeLastRuneInString is like [DecodeLastRune] but operates on a string.
// An empty s yields ([RuneError], 0); an invalid trailing encoding yields
// (RuneError, 1). Neither result can occur for correct, non-empty UTF-8.
//
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
// out of range, or is not the shortest possible UTF-8 encoding for the
// value. No other validation is performed.
func DecodeLastRuneInString(s string) (r rune, size int) {
	n := len(s)
	if n == 0 {
		return RuneError, 0
	}
	// A trailing ASCII byte is a complete rune on its own.
	if last := s[n-1]; last < RuneSelf {
		return rune(last), 1
	}
	// Look back at most UTFMax bytes for a start byte, so that long runs
	// of invalid UTF-8 do not make backwards traversal quadratic.
	floor := n - UTFMax
	if floor < 0 {
		floor = 0
	}
	begin := n - 2
	for begin >= floor && !RuneStart(s[begin]) {
		begin--
	}
	if begin < 0 {
		begin = 0
	}
	r, size = DecodeRuneInString(s[begin:n])
	// The decoded rune must end exactly at the end of s.
	if begin+size != n {
		return RuneError, 1
	}
	return r, size
}
// RuneLen reports how many bytes the UTF-8 encoding of r occupies.
// The result is -1 when r has no valid UTF-8 encoding: negative values,
// surrogate halves, and values above MaxRune.
func RuneLen(r rune) int {
	if r < 0 {
		return -1
	}
	if r <= rune1Max {
		return 1
	}
	if r <= rune2Max {
		return 2
	}
	// Surrogates sit inside the three-byte range, so reject them first.
	if surrogateMin <= r && r <= surrogateMax {
		return -1
	}
	if r <= rune3Max {
		return 3
	}
	if r <= MaxRune {
		return 4
	}
	return -1
}
// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
// If the rune is out of range, it writes the encoding of [RuneError].
// It returns the number of bytes written.
//
// Surrogate halves are treated the same way as out-of-range values.
// Indexing past the end of a too-short p panics.
func EncodeRune(p []byte, r rune) int {
// Negative values are erroneous. Making it unsigned addresses the problem.
switch i := uint32(r); {
case i <= rune1Max:
// Single byte.
p[0] = byte(r)
return 1
case i <= rune2Max:
_ = p[1] // eliminate bounds checks
p[0] = t2 | byte(r>>6)
p[1] = tx | byte(r)&maskx
return 2
// This case must precede the three-byte case: surrogates and values above
// MaxRune are replaced by RuneError and then encoded by falling through.
case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
r = RuneError
fallthrough
case i <= rune3Max:
_ = p[2] // eliminate bounds checks
p[0] = t3 | byte(r>>12)
p[1] = tx | byte(r>>6)&maskx
p[2] = tx | byte(r)&maskx
return 3
default:
// Everything left is above rune3Max and at most MaxRune: four bytes.
_ = p[3] // eliminate bounds checks
p[0] = t4 | byte(r>>18)
p[1] = tx | byte(r>>12)&maskx
p[2] = tx | byte(r>>6)&maskx
p[3] = tx | byte(r)&maskx
return 4
}
}
// AppendRune appends the UTF-8 encoding of r to p and returns the
// extended slice. A rune that is out of range is appended as the
// encoding of [RuneError].
func AppendRune(p []byte, r rune) []byte {
	// Keep this function small enough to inline: only the single-byte case
	// is handled here, everything else is delegated.
	if uint32(r) > rune1Max {
		return appendRuneNonASCII(p, r)
	}
	return append(p, byte(r))
}
// appendRuneNonASCII appends the multi-byte UTF-8 encoding of r to p.
// AppendRune calls it for every rune it does not encode as a single byte.
// Surrogate halves and values outside [0, MaxRune] are appended as the
// encoding of RuneError.
func appendRuneNonASCII(p []byte, r rune) []byte {
	// Viewing r as unsigned makes negative values compare greater than
	// MaxRune, so they take the RuneError path below.
	u := uint32(r)
	if u <= rune2Max {
		return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
	}
	invalid := u > MaxRune || (surrogateMin <= u && u <= surrogateMax)
	if !invalid && u > rune3Max {
		return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
	}
	if invalid {
		r = RuneError
	}
	return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
}
// RuneCount reports how many runes p contains. Each erroneous or
// truncated encoding counts as one rune that is one byte wide.
func RuneCount(p []byte) int {
	total := len(p)
	count := 0
	// Every pass through the loop consumes exactly one rune, so the
	// counter is advanced in the post statement.
	for pos := 0; pos < total; count++ {
		lead := p[pos]
		if lead < RuneSelf {
			// ASCII fast path
			pos++
			continue
		}
		info := first[lead]
		if info == xx {
			pos++ // invalid.
			continue
		}
		width := int(info & 7)
		if pos+width > total {
			pos++ // Short or invalid.
			continue
		}
		rng := acceptRanges[info>>4]
		// Cases are tried in order, so a later byte is only inspected when
		// the sequence is long enough to contain it.
		switch {
		case p[pos+1] < rng.lo || rng.hi < p[pos+1]:
			width = 1
		case width == 2:
			// Two-byte sequence fully checked.
		case p[pos+2] < locb || hicb < p[pos+2]:
			width = 1
		case width == 3:
			// Three-byte sequence fully checked.
		case p[pos+3] < locb || hicb < p[pos+3]:
			width = 1
		}
		pos += width
	}
	return count
}
// RuneCountInString is like [RuneCount] but operates on a string.
func RuneCountInString(s string) (n int) {
	total := len(s)
	// Every pass through the loop consumes exactly one rune, so the
	// counter is advanced in the post statement.
	for pos := 0; pos < total; n++ {
		lead := s[pos]
		if lead < RuneSelf {
			// ASCII fast path
			pos++
			continue
		}
		info := first[lead]
		if info == xx {
			pos++ // invalid.
			continue
		}
		width := int(info & 7)
		if pos+width > total {
			pos++ // Short or invalid.
			continue
		}
		rng := acceptRanges[info>>4]
		// Cases are tried in order, so a later byte is only inspected when
		// the sequence is long enough to contain it.
		switch {
		case s[pos+1] < rng.lo || rng.hi < s[pos+1]:
			width = 1
		case width == 2:
			// Two-byte sequence fully checked.
		case s[pos+2] < locb || hicb < s[pos+2]:
			width = 1
		case width == 3:
			// Three-byte sequence fully checked.
		case s[pos+3] < locb || hicb < s[pos+3]:
			width = 1
		}
		pos += width
	}
	return n
}
// RuneStart reports whether b could be the first byte of an encoded,
// possibly invalid rune. Continuation bytes (second and later bytes of an
// encoding) always have 10 as their top two bits; every other byte is a
// potential start.
func RuneStart(b byte) bool {
	const continuationPrefix = 0x2 // binary 10
	return b>>6 != continuationPrefix
}
// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
// An empty p is reported as valid.
func Valid(p []byte) bool {
// This optimization avoids the need to recompute the capacity
// when generating code for p[8:], bringing it to parity with
// ValidString, which was 20% faster on long ASCII strings.
p = p[:len(p):len(p)]
// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
for len(p) >= 8 {
// Combining two 32 bit loads allows the same code to be used
// for 32 and 64 bit platforms.
// The compiler can generate a 32bit load for first32 and second32
// on many platforms. See test/codegen/memcombine.go.
first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
if (first32|second32)&0x80808080 != 0 {
// Found a non ASCII byte (>= RuneSelf).
break
}
p = p[8:]
}
// Slow path. Validate whatever remains one rune at a time.
n := len(p)
for i := 0; i < n; {
pi := p[i]
if pi < RuneSelf {
i++
continue
}
x := first[pi]
if x == xx {
return false // Illegal starter byte.
}
// size is the encoded length announced by the lead byte.
size := int(x & 7)
if i+size > n {
return false // Short or invalid.
}
// The second byte is checked against a lead-byte-specific range;
// any further bytes must lie in [locb, hicb].
accept := acceptRanges[x>>4]
if c := p[i+1]; c < accept.lo || accept.hi < c {
return false
} else if size == 2 {
} else if c := p[i+2]; c < locb || hicb < c {
return false
} else if size == 3 {
} else if c := p[i+3]; c < locb || hicb < c {
return false
}
i += size
}
return true
}
// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
// An empty s is reported as valid.
func ValidString(s string) bool {
// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
for len(s) >= 8 {
// Combining two 32 bit loads allows the same code to be used
// for 32 and 64 bit platforms.
// The compiler can generate a 32bit load for first32 and second32
// on many platforms. See test/codegen/memcombine.go.
first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
if (first32|second32)&0x80808080 != 0 {
// Found a non ASCII byte (>= RuneSelf).
break
}
s = s[8:]
}
// Slow path. Validate whatever remains one rune at a time.
n := len(s)
for i := 0; i < n; {
si := s[i]
if si < RuneSelf {
i++
continue
}
x := first[si]
if x == xx {
return false // Illegal starter byte.
}
// size is the encoded length announced by the lead byte.
size := int(x & 7)
if i+size > n {
return false // Short or invalid.
}
// The second byte is checked against a lead-byte-specific range;
// any further bytes must lie in [locb, hicb].
accept := acceptRanges[x>>4]
if c := s[i+1]; c < accept.lo || accept.hi < c {
return false
} else if size == 2 {
} else if c := s[i+2]; c < locb || hicb < c {
return false
} else if size == 3 {
} else if c := s[i+3]; c < locb || hicb < c {
return false
}
i += size
}
return true
}
// ValidRune reports whether r can be legally encoded as UTF-8.
// Code points that are out of range or a surrogate half are illegal.
func ValidRune(r rune) bool {
switch {
case 0 <= r && r < surrogateMin:
return true
case surrogateMax < r && r <= MaxRune:
return true
}
return false
}

View File

@@ -0,0 +1,703 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package utf8_test
import (
"bytes"
"strings"
"testing"
"unicode"
. "unicode/utf8"
)
// init stops the test binary at start-up if the constants that utf8
// redefines have drifted from their originals in package unicode.
func init() {
	if unicode.MaxRune != MaxRune {
		panic("utf8.MaxRune is wrong")
	}
	if unicode.ReplacementChar != RuneError {
		panic("utf8.RuneError is wrong")
	}
}
// TestConstants verifies that the constants utf8 redefines match their
// originals in package unicode.
func TestConstants(t *testing.T) {
	if got, want := MaxRune, unicode.MaxRune; got != want {
		t.Errorf("utf8.MaxRune is wrong: %x should be %x", got, want)
	}
	if got, want := RuneError, unicode.ReplacementChar; got != want {
		t.Errorf("utf8.RuneError is wrong: %x should be %x", got, want)
	}
}
// Utf8Map pairs a rune with the UTF-8 byte sequence that encodes it.
type Utf8Map struct {
r rune
str string
}
// utf8map lists valid rune/encoding pairs shared by the encode and decode
// tests. It covers one- to four-byte encodings, including the values on
// either side of the surrogate range and the largest code point.
var utf8map = []Utf8Map{
{0x0000, "\x00"},
{0x0001, "\x01"},
{0x007e, "\x7e"},
{0x007f, "\x7f"},
{0x0080, "\xc2\x80"},
{0x0081, "\xc2\x81"},
{0x00bf, "\xc2\xbf"},
{0x00c0, "\xc3\x80"},
{0x00c1, "\xc3\x81"},
{0x00c8, "\xc3\x88"},
{0x00d0, "\xc3\x90"},
{0x00e0, "\xc3\xa0"},
{0x00f0, "\xc3\xb0"},
{0x00f8, "\xc3\xb8"},
{0x00ff, "\xc3\xbf"},
{0x0100, "\xc4\x80"},
{0x07ff, "\xdf\xbf"},
{0x0400, "\xd0\x80"},
{0x0800, "\xe0\xa0\x80"},
{0x0801, "\xe0\xa0\x81"},
{0x1000, "\xe1\x80\x80"},
{0xd000, "\xed\x80\x80"},
{0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half.
{0xe000, "\xee\x80\x80"}, // first code point after surrogate half.
{0xfffe, "\xef\xbf\xbe"},
{0xffff, "\xef\xbf\xbf"},
{0x10000, "\xf0\x90\x80\x80"},
{0x10001, "\xf0\x90\x80\x81"},
{0x40000, "\xf1\x80\x80\x80"},
{0x10fffe, "\xf4\x8f\xbf\xbe"},
{0x10ffff, "\xf4\x8f\xbf\xbf"},
{0xFFFD, "\xef\xbf\xbd"},
}
// surrogateMap holds the three-byte sequences that would encode the first
// and last surrogate halves. The decode tests expect both to be rejected.
var surrogateMap = []Utf8Map{
{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
{0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
}
// testStrings are the surrounding strings used by the sequencing and runtime
// conversion tests: the empty string, ASCII, multi-byte text of varying
// length, and a run of four 0x80 bytes (continuation bytes with no start byte).
var testStrings = []string{
"",
"abcd",
"☺☻☹",
"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
"\x80\x80\x80\x80",
}
// TestFullRune checks FullRune and FullRuneInString against every entry of
// utf8map: the complete encoding must be reported as a full rune, and the
// same encoding with its last byte removed must not.
func TestFullRune(t *testing.T) {
	for _, m := range utf8map {
		b := []byte(m.str)
		if !FullRune(b) {
			t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
		}
		s := m.str
		if !FullRuneInString(s) {
			t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r)
		}
		// Dropping the final byte leaves an incomplete (or empty) encoding.
		b1 := b[0 : len(b)-1]
		if FullRune(b1) {
			t.Errorf("FullRune(%q) = true, want false", b1)
		}
		s1 := string(b1)
		if FullRuneInString(s1) {
			// Name the function that was actually called so a failure
			// points at the string variant rather than the byte variant.
			t.Errorf("FullRuneInString(%q) = true, want false", s1)
		}
	}
	// The single bytes 0xC0 and 0xC1 are expected to be reported as full
	// runes by both variants.
	for _, s := range []string{"\xc0", "\xc1"} {
		b := []byte(s)
		if !FullRune(b) {
			t.Errorf("FullRune(%q) = false, want true", s)
		}
		if !FullRuneInString(s) {
			t.Errorf("FullRuneInString(%q) = false, want true", s)
		}
	}
}
// TestEncodeRune checks that EncodeRune produces the expected bytes for
// every entry of utf8map.
func TestEncodeRune(t *testing.T) {
	for _, tc := range utf8map {
		want := []byte(tc.str)
		var scratch [10]byte
		written := EncodeRune(scratch[:], tc.r)
		got := scratch[:written]
		if !bytes.Equal(want, got) {
			t.Errorf("EncodeRune(%#04x) = %q want %q", tc.r, got, want)
		}
	}
}
// TestAppendRune checks AppendRune for every entry of utf8map, appending
// both to a nil slice and to a slice that already holds data.
func TestAppendRune(t *testing.T) {
	for _, tc := range utf8map {
		fromNil := AppendRune(nil, tc.r)
		if string(fromNil) != tc.str {
			t.Errorf("AppendRune(nil, %#04x) = %s, want %s", tc.r, fromNil, tc.str)
		}
		withPrefix := AppendRune([]byte("init"), tc.r)
		if string(withPrefix) != "init"+tc.str {
			t.Errorf("AppendRune(init, %#04x) = %s, want %s", tc.r, withPrefix, "init"+tc.str)
		}
	}
}
// TestDecodeRune exercises DecodeRune and DecodeRuneInString on every entry
// of utf8map: the exact encoding, the encoding followed by extra data, the
// encoding with its last byte missing, and the encoding with a corrupted byte.
func TestDecodeRune(t *testing.T) {
	for _, tc := range utf8map {
		enc := []byte(tc.str)
		encLen := len(enc)

		// Exact encoding.
		got, n := DecodeRune(enc)
		if got != tc.r || n != encLen {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", enc, got, n, tc.r, encLen)
		}
		text := tc.str
		got, n = DecodeRuneInString(text)
		if got != tc.r || n != encLen {
			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", text, got, n, tc.r, encLen)
		}

		// Data after the encoding must be ignored. The byte slice is
		// extended to its capacity to expose any spare bytes left by the
		// conversion; the string gets an explicit trailing NUL.
		got, n = DecodeRune(enc[0:cap(enc)])
		if got != tc.r || n != encLen {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", enc, got, n, tc.r, encLen)
		}
		text = tc.str + "\x00"
		got, n = DecodeRuneInString(text)
		if got != tc.r || n != encLen {
			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", text, got, n, tc.r, encLen)
		}

		// A missing final byte must fail. Truncating a one-byte encoding
		// leaves empty input, for which the reported size is 0, not 1.
		truncSize := 0
		if encLen > 1 {
			truncSize = 1
		}
		got, n = DecodeRune(enc[0 : encLen-1])
		if got != RuneError || n != truncSize {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", enc[0:encLen-1], got, n, RuneError, truncSize)
		}
		text = tc.str[0 : len(tc.str)-1]
		got, n = DecodeRuneInString(text)
		if got != RuneError || n != truncSize {
			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", text, got, n, RuneError, truncSize)
		}

		// A corrupted byte must fail: a lone continuation byte for
		// one-byte encodings, otherwise an ASCII byte in the final slot.
		if encLen == 1 {
			enc[0] = 0x80
		} else {
			enc[encLen-1] = 0x7F
		}
		got, n = DecodeRune(enc)
		if got != RuneError || n != 1 {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", enc, got, n, RuneError, 1)
		}
		text = string(enc)
		got, n = DecodeRuneInString(text)
		if got != RuneError || n != 1 {
			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", text, got, n, RuneError, 1)
		}
	}
}
// TestDecodeSurrogateRune checks that the encodings in surrogateMap are
// rejected by both DecodeRune and DecodeRuneInString with (RuneError, 1).
func TestDecodeSurrogateRune(t *testing.T) {
	for _, m := range surrogateMap {
		b := []byte(m.str)
		r, size := DecodeRune(b)
		if r != RuneError || size != 1 {
			t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
		}
		s := m.str
		r, size = DecodeRuneInString(s)
		if r != RuneError || size != 1 {
			// Report the string that was actually decoded, not the byte slice.
			t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", s, r, size, RuneError, 1)
		}
	}
}
// TestSequencing checks that DecodeRune and DecodeLastRune agree with the
// equivalent range loop. Each utf8map encoding is placed after, before, and
// in the middle of each test string.
func TestSequencing(t *testing.T) {
	for _, surround := range testStrings {
		for _, tc := range utf8map {
			testSequence(t, surround+tc.str)
			testSequence(t, tc.str+surround)
			testSequence(t, surround+tc.str+surround)
		}
	}
}
// runtimeRuneCount counts the runes in s with the len([]rune(s)) idiom.
// The expression is kept in exactly this form because it is the pattern
// the compiler rewrites, as noted below.
func runtimeRuneCount(s string) int {
return len([]rune(s)) // Replaced by gc with call to runtime.countrunes(s).
}
// TestRuntimeConversion checks that a range loop, the len([]rune(string))
// optimization, and a []rune conversion all visit the same runes.
// This is not really a test of this package, but the other tests here rely
// on the assumption, so it is worth verifying.
func TestRuntimeConversion(t *testing.T) {
	for _, ts := range testStrings {
		want := RuneCountInString(ts)
		if got := runtimeRuneCount(ts); got != want {
			t.Errorf("%q: len([]rune()) counted %d runes; got %d from RuneCountInString", ts, got, want)
			break
		}

		converted := []rune(ts)
		if got := len(converted); got != want {
			t.Errorf("%q: []rune() has length %d; got %d from RuneCountInString", ts, got, want)
			break
		}

		// Track the rune index by hand: range reports byte offsets.
		pos := 0
		for _, r := range ts {
			if expected := converted[pos]; r != expected {
				t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, pos, expected, expected, r, r)
			}
			pos++
		}
	}
}
// invalidSequenceTests lists four-byte strings whose first rune is expected
// to decode to RuneError. The group labels (xx, s1 ... s7) presumably name
// the lead-byte classes used by the utf8 package's tables; confirm against
// utf8.go.
var invalidSequenceTests = []string{
"\xed\xa0\x80\x80", // surrogate min
"\xed\xbf\xbf\x80", // surrogate max
// xx
"\x91\x80\x80\x80",
// s1
"\xC2\x7F\x80\x80",
"\xC2\xC0\x80\x80",
"\xDF\x7F\x80\x80",
"\xDF\xC0\x80\x80",
// s2
"\xE0\x9F\xBF\x80",
"\xE0\xA0\x7F\x80",
"\xE0\xBF\xC0\x80",
"\xE0\xC0\x80\x80",
// s3
"\xE1\x7F\xBF\x80",
"\xE1\x80\x7F\x80",
"\xE1\xBF\xC0\x80",
"\xE1\xC0\x80\x80",
// s4
"\xED\x7F\xBF\x80",
"\xED\x80\x7F\x80",
"\xED\x9F\xC0\x80",
"\xED\xA0\x80\x80",
// s5
"\xF0\x8F\xBF\xBF",
"\xF0\x90\x7F\xBF",
"\xF0\x90\x80\x7F",
"\xF0\xBF\xBF\xC0",
"\xF0\xBF\xC0\x80",
"\xF0\xC0\x80\x80",
// s6
"\xF1\x7F\xBF\xBF",
"\xF1\x80\x7F\xBF",
"\xF1\x80\x80\x7F",
"\xF1\xBF\xBF\xC0",
"\xF1\xBF\xC0\x80",
"\xF1\xC0\x80\x80",
// s7
"\xF4\x7F\xBF\xBF",
"\xF4\x80\x7F\xBF",
"\xF4\x80\x80\x7F",
"\xF4\x8F\xBF\xC0",
"\xF4\x8F\xC0\x80",
"\xF4\x90\x80\x80",
}
// runtimeDecodeRune returns the first rune produced by ranging over s, or
// -1 when s is empty. It deliberately uses a range loop so that the
// language's own string decoding, not this package, produces the rune.
func runtimeDecodeRune(s string) rune {
for _, r := range s {
return r
}
return -1
}
// TestDecodeInvalidSequence checks that every entry of invalidSequenceTests
// decodes to RuneError through DecodeRune, DecodeRuneInString, and a range
// loop alike. It stops at the first failure.
func TestDecodeInvalidSequence(t *testing.T) {
	for _, seq := range invalidSequenceTests {
		fromBytes, _ := DecodeRune([]byte(seq))
		if fromBytes != RuneError {
			t.Errorf("DecodeRune(%#x) = %#04x, want %#04x", seq, fromBytes, RuneError)
			return
		}
		fromString, _ := DecodeRuneInString(seq)
		if fromString != RuneError {
			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", seq, fromString, RuneError)
			return
		}
		if fromBytes != fromString {
			t.Errorf("DecodeRune(%#x) = %#04x mismatch with DecodeRuneInString(%q) = %#04x", seq, fromBytes, seq, fromString)
			return
		}
		if fromRange := runtimeDecodeRune(seq); fromString != fromRange {
			t.Errorf("DecodeRuneInString(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", seq, fromString, seq, fromRange)
			return
		}
	}
}
// testSequence walks s forwards with a range loop, checking that DecodeRune
// and DecodeRuneInString return the same rune and advance by the same
// amount, then walks it backwards with DecodeLastRune and
// DecodeLastRuneInString, checking that the same runes are found at the
// same offsets in reverse order. It returns at the first mismatch.
func testSequence(t *testing.T, s string) {
	type info struct {
		index int
		r     rune
	}
	visited := make([]info, len(s))
	raw := []byte(s)

	// Forward pass: record every rune and its byte offset.
	offset := 0
	count := 0
	for i, r := range s {
		if offset != i {
			t.Errorf("Sequence(%q) mismatched index %d, want %d", s, offset, i)
			return
		}
		visited[count] = info{i, r}
		count++

		rb, nb := DecodeRune(raw[i:])
		if r != rb {
			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], rb, r)
			return
		}
		rs, ns := DecodeRuneInString(s[i:])
		if r != rs {
			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], rs, r)
			return
		}
		if nb != ns {
			t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], nb, ns)
			return
		}
		offset += nb
	}

	// Backward pass: replay the recorded runes from last to first.
	cursor := count - 1
	for offset = len(s); offset > 0; cursor-- {
		rb, nb := DecodeLastRune(raw[0:offset])
		rs, ns := DecodeLastRuneInString(s[0:offset])
		if nb != ns {
			t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, offset, nb, ns)
			return
		}
		expect := visited[cursor]
		if rb != expect.r {
			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, offset, rb, expect.r)
			return
		}
		if rs != expect.r {
			t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, offset, rs, expect.r)
			return
		}
		offset -= nb
		if offset != expect.index {
			t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, offset, expect.index)
			return
		}
	}
	if offset != 0 {
		t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, offset)
	}
}
// TestNegativeRune checks that EncodeRune encodes a negative rune exactly
// as it encodes RuneError (U+FFFD).
func TestNegativeRune(t *testing.T) {
	encode := func(r rune) []byte {
		out := make([]byte, UTFMax)
		return out[:EncodeRune(out, r)]
	}
	want := encode(RuneError)
	got := encode(-1)
	if !bytes.Equal(got, want) {
		t.Errorf("incorrect encoding [% x] for -1; expected [% x]", got, want)
	}
}
// RuneCountTest pairs an input with the number of runes it should contain.
type RuneCountTest struct {
	in  string
	out int
}

// runecounttests covers valid text as well as truncated and malformed
// encodings, where each bad byte counts as one rune.
var runecounttests = []RuneCountTest{
	{in: "abcd", out: 4},
	{in: "☺☻☹", out: 3},
	{in: "1,2,3,4", out: 7},
	{in: "\xe2\x00", out: 2},
	{in: "\xe2\x80", out: 2},
	{in: "a\xe2\x80", out: 3},
}

// TestRuneCount runs runecounttests through both RuneCountInString and
// RuneCount.
func TestRuneCount(t *testing.T) {
	for _, tc := range runecounttests {
		if got := RuneCountInString(tc.in); got != tc.out {
			t.Errorf("RuneCountInString(%q) = %d, want %d", tc.in, got, tc.out)
		}
		if got := RuneCount([]byte(tc.in)); got != tc.out {
			t.Errorf("RuneCount(%q) = %d, want %d", tc.in, got, tc.out)
		}
	}
}
// RuneLenTest pairs a rune with the result expected from RuneLen.
type RuneLenTest struct {
	r    rune
	size int
}

// runelentests covers each encoded length plus the values RuneLen rejects
// with -1: surrogate halves, values above MaxRune, and negative values.
var runelentests = []RuneLenTest{
	{r: 0, size: 1},
	{r: 'e', size: 1},
	{r: 'é', size: 2},
	{r: '☺', size: 3},
	{r: RuneError, size: 3},
	{r: MaxRune, size: 4},
	{r: 0xD800, size: -1},
	{r: 0xDFFF, size: -1},
	{r: MaxRune + 1, size: -1},
	{r: -1, size: -1},
}

// TestRuneLen runs runelentests through RuneLen.
func TestRuneLen(t *testing.T) {
	for _, tc := range runelentests {
		got := RuneLen(tc.r)
		if got != tc.size {
			t.Errorf("RuneLen(%#U) = %d, want %d", tc.r, got, tc.size)
		}
	}
}
// ValidTest pairs an input with the result expected from Valid and
// ValidString.
type ValidTest struct {
	in  string
	out bool
}

// validTests covers well-formed text and the main ways an encoding can be
// malformed: truncation, illegal lead bytes, out-of-range values, overlong
// forms, and surrogate halves.
var validTests = []ValidTest{
	{"", true},
	{"a", true},
	{"abc", true},
	{"Ж", true},
	{"ЖЖ", true},
	{"брэд-ЛГТМ", true},
	{"☺☻☹", true},
	{"aa\xe2", false},
	{string([]byte{66, 250}), false},
	{string([]byte{66, 250, 67}), false},
	{"a\uFFFDb", true},
	{"\xF4\x8F\xBF\xBF", true},      // U+10FFFF
	{"\xF4\x90\x80\x80", false},     // U+10FFFF+1; out of range
	{"\xF7\xBF\xBF\xBF", false},     // 0x1FFFFF; out of range
	{"\xFB\xBF\xBF\xBF\xBF", false}, // 0x3FFFFFF; out of range
	{"\xc0\x80", false},             // U+0000 encoded in two bytes: incorrect
	{"\xed\xa0\x80", false},         // U+D800 high surrogate (sic)
	{"\xed\xbf\xbf", false},         // U+DFFF low surrogate (sic)
}

// TestValid runs validTests through both Valid and ValidString.
func TestValid(t *testing.T) {
	for _, tc := range validTests {
		if got := Valid([]byte(tc.in)); got != tc.out {
			t.Errorf("Valid(%q) = %v; want %v", tc.in, got, tc.out)
		}
		if got := ValidString(tc.in); got != tc.out {
			t.Errorf("ValidString(%q) = %v; want %v", tc.in, got, tc.out)
		}
	}
}
// ValidRuneTest pairs a rune with the result expected from ValidRune.
type ValidRuneTest struct {
	r  rune
	ok bool
}

// validrunetests covers ordinary runes, both edges of the surrogate range,
// and values outside [0, MaxRune].
var validrunetests = []ValidRuneTest{
	{r: 0, ok: true},
	{r: 'e', ok: true},
	{r: 'é', ok: true},
	{r: '☺', ok: true},
	{r: RuneError, ok: true},
	{r: MaxRune, ok: true},
	{r: 0xD7FF, ok: true},
	{r: 0xD800, ok: false},
	{r: 0xDFFF, ok: false},
	{r: 0xE000, ok: true},
	{r: MaxRune + 1, ok: false},
	{r: -1, ok: false},
}

// TestValidRune runs validrunetests through ValidRune.
func TestValidRune(t *testing.T) {
	for _, tc := range validrunetests {
		got := ValidRune(tc.r)
		if got != tc.ok {
			t.Errorf("ValidRune(%#U) = %t, want %t", tc.r, got, tc.ok)
		}
	}
}
// BenchmarkRuneCountTenASCIIChars measures RuneCount on ten ASCII bytes.
func BenchmarkRuneCountTenASCIIChars(b *testing.B) {
	input := []byte("0123456789")
	for iter := 0; iter < b.N; iter++ {
		RuneCount(input)
	}
}
// BenchmarkRuneCountTenJapaneseChars measures RuneCount on ten three-byte
// runes.
func BenchmarkRuneCountTenJapaneseChars(b *testing.B) {
	input := []byte("日本語日本語日本語日")
	for iter := 0; iter < b.N; iter++ {
		RuneCount(input)
	}
}
// BenchmarkRuneCountInStringTenASCIIChars measures RuneCountInString on a
// ten-byte ASCII string.
func BenchmarkRuneCountInStringTenASCIIChars(b *testing.B) {
	const input = "0123456789"
	for iter := 0; iter < b.N; iter++ {
		RuneCountInString(input)
	}
}
// BenchmarkRuneCountInStringTenJapaneseChars measures RuneCountInString on
// a string of ten three-byte runes.
func BenchmarkRuneCountInStringTenJapaneseChars(b *testing.B) {
	const input = "日本語日本語日本語日"
	for iter := 0; iter < b.N; iter++ {
		RuneCountInString(input)
	}
}
// ascii100000 is a 100,000-byte all-ASCII string (the ten digits repeated
// 10,000 times) used by the long-input validation benchmarks.
var ascii100000 = strings.Repeat("0123456789", 10000)
// BenchmarkValidTenASCIIChars measures Valid on ten ASCII bytes.
func BenchmarkValidTenASCIIChars(b *testing.B) {
	input := []byte("0123456789")
	for iter := 0; iter < b.N; iter++ {
		Valid(input)
	}
}
// BenchmarkValid100KASCIIChars measures Valid on the bytes of ascii100000.
func BenchmarkValid100KASCIIChars(b *testing.B) {
	input := []byte(ascii100000)
	for iter := 0; iter < b.N; iter++ {
		Valid(input)
	}
}
// BenchmarkValidTenJapaneseChars measures Valid on ten three-byte runes.
func BenchmarkValidTenJapaneseChars(b *testing.B) {
	input := []byte("日本語日本語日本語日")
	for iter := 0; iter < b.N; iter++ {
		Valid(input)
	}
}
// BenchmarkValidLongMostlyASCII measures Valid on the bytes of
// longStringMostlyASCII.
func BenchmarkValidLongMostlyASCII(b *testing.B) {
	input := []byte(longStringMostlyASCII)
	for iter := 0; iter < b.N; iter++ {
		Valid(input)
	}
}
// BenchmarkValidLongJapanese measures Valid on the bytes of
// longStringJapanese.
func BenchmarkValidLongJapanese(b *testing.B) {
	input := []byte(longStringJapanese)
	for iter := 0; iter < b.N; iter++ {
		Valid(input)
	}
}
// BenchmarkValidStringTenASCIIChars measures ValidString on a ten-byte
// ASCII string.
func BenchmarkValidStringTenASCIIChars(b *testing.B) {
	const input = "0123456789"
	for iter := 0; iter < b.N; iter++ {
		ValidString(input)
	}
}
// BenchmarkValidString100KASCIIChars measures ValidString on ascii100000.
func BenchmarkValidString100KASCIIChars(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		ValidString(ascii100000)
	}
}
// BenchmarkValidStringTenJapaneseChars measures ValidString on a string of
// ten three-byte runes.
func BenchmarkValidStringTenJapaneseChars(b *testing.B) {
	const input = "日本語日本語日本語日"
	for iter := 0; iter < b.N; iter++ {
		ValidString(input)
	}
}
// BenchmarkValidStringLongMostlyASCII measures ValidString on
// longStringMostlyASCII.
func BenchmarkValidStringLongMostlyASCII(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		ValidString(longStringMostlyASCII)
	}
}
// BenchmarkValidStringLongJapanese measures ValidString on
// longStringJapanese.
func BenchmarkValidStringLongJapanese(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		ValidString(longStringJapanese)
	}
}
// Long benchmark inputs, filled in by init below.
var (
	longStringMostlyASCII string // ~100KB, ~97% ASCII
	longStringJapanese    string // ~100KB, non-ASCII
)

// init builds the long benchmark inputs. The mostly-ASCII string is made of
// ten-digit chunks with a Japanese chunk in place of every hundredth one,
// and grows until it holds at least 100,000 bytes. The Japanese string
// repeats the Japanese chunk as many whole times as fit in 100,000 bytes.
func init() {
	const (
		japanese = "日本語日本語日本語日"
		digits   = "0123456789"
		target   = 100_000
	)
	var sb strings.Builder
	for i := 0; sb.Len() < target; i++ {
		chunk := digits
		if i%100 == 0 {
			chunk = japanese
		}
		sb.WriteString(chunk)
	}
	longStringMostlyASCII = sb.String()
	longStringJapanese = strings.Repeat(japanese, target/len(japanese))
}
// BenchmarkEncodeASCIIRune measures EncodeRune for a single-byte rune.
func BenchmarkEncodeASCIIRune(b *testing.B) {
	dst := make([]byte, UTFMax)
	for iter := 0; iter < b.N; iter++ {
		EncodeRune(dst, 'a')
	}
}
// BenchmarkEncodeJapaneseRune measures EncodeRune for a three-byte rune.
func BenchmarkEncodeJapaneseRune(b *testing.B) {
	dst := make([]byte, UTFMax)
	for iter := 0; iter < b.N; iter++ {
		EncodeRune(dst, '本')
	}
}
// BenchmarkAppendASCIIRune measures AppendRune for a single-byte rune,
// appending into an emptied slice that already has capacity.
func BenchmarkAppendASCIIRune(b *testing.B) {
	backing := make([]byte, UTFMax)
	for iter := 0; iter < b.N; iter++ {
		AppendRune(backing[:0], 'a')
	}
}
// BenchmarkAppendJapaneseRune measures AppendRune for a three-byte rune,
// appending into an emptied slice that already has capacity.
func BenchmarkAppendJapaneseRune(b *testing.B) {
	backing := make([]byte, UTFMax)
	for iter := 0; iter < b.N; iter++ {
		AppendRune(backing[:0], '本')
	}
}
// BenchmarkDecodeASCIIRune measures DecodeRune on a single ASCII byte.
func BenchmarkDecodeASCIIRune(b *testing.B) {
	input := []byte{'a'}
	for iter := 0; iter < b.N; iter++ {
		DecodeRune(input)
	}
}
// BenchmarkDecodeJapaneseRune measures DecodeRune on one three-byte rune.
func BenchmarkDecodeJapaneseRune(b *testing.B) {
	input := []byte("本")
	for iter := 0; iter < b.N; iter++ {
		DecodeRune(input)
	}
}
// boolSink receives the return value of benchmarked functions. Storing the
// result in a package-level variable keeps the call from being removed as
// dead code.
var boolSink bool
// BenchmarkFullRune measures FullRune as sub-benchmarks over an ASCII byte,
// a truncated four-byte encoding, and a complete three-byte rune. Each
// result is stored in boolSink.
func BenchmarkFullRune(b *testing.B) {
	type fixture struct {
		name string
		data []byte
	}
	fixtures := []fixture{
		{name: "ASCII", data: []byte("a")},
		{name: "Incomplete", data: []byte("\xf0\x90\x80")},
		{name: "Japanese", data: []byte("本")},
	}
	for _, fx := range fixtures {
		b.Run(fx.name, func(b *testing.B) {
			for iter := 0; iter < b.N; iter++ {
				boolSink = FullRune(fx.data)
			}
		})
	}
}