Initial commit: Go 1.23 release state
This commit is contained in:
20
src/unicode/casetables.go
Normal file
20
src/unicode/casetables.go
Normal file
@@ -0,0 +1,20 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// TODO: This file contains the special casing rules for Turkish and Azeri only.
|
||||
// It should encompass all the languages with special casing rules
|
||||
// and be generated automatically, but that requires some API
|
||||
// development first.
|
||||
|
||||
package unicode
|
||||
|
||||
var TurkishCase SpecialCase = _TurkishCase
|
||||
var _TurkishCase = SpecialCase{
|
||||
CaseRange{0x0049, 0x0049, d{0, 0x131 - 0x49, 0}},
|
||||
CaseRange{0x0069, 0x0069, d{0x130 - 0x69, 0, 0x130 - 0x69}},
|
||||
CaseRange{0x0130, 0x0130, d{0, 0x69 - 0x130, 0}},
|
||||
CaseRange{0x0131, 0x0131, d{0x49 - 0x131, 0, 0x49 - 0x131}},
|
||||
}
|
||||
|
||||
var AzeriCase SpecialCase = _TurkishCase
|
||||
13
src/unicode/digit.go
Normal file
13
src/unicode/digit.go
Normal file
@@ -0,0 +1,13 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package unicode
|
||||
|
||||
// IsDigit reports whether the rune is a decimal digit.
|
||||
func IsDigit(r rune) bool {
|
||||
if r <= MaxLatin1 {
|
||||
return '0' <= r && r <= '9'
|
||||
}
|
||||
return isExcludingLatin(Digit, r)
|
||||
}
|
||||
126
src/unicode/digit_test.go
Normal file
126
src/unicode/digit_test.go
Normal file
@@ -0,0 +1,126 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package unicode_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
. "unicode"
|
||||
)
|
||||
|
||||
// testDigit lists runes for which TestDigit expects IsDigit to return true.
var testDigit = []rune{
	0x0030, 0x0039, 0x0661, 0x06F1, 0x07C9, 0x0966, 0x09EF, 0x0A66,
	0x0AEF, 0x0B66, 0x0B6F, 0x0BE6, 0x0BEF, 0x0C66, 0x0CEF, 0x0D66,
	0x0D6F, 0x0E50, 0x0E59, 0x0ED0, 0x0ED9, 0x0F20, 0x0F29, 0x1040,
	0x1049, 0x1090, 0x1091, 0x1099, 0x17E0, 0x17E9, 0x1810, 0x1819,
	0x1946, 0x194F, 0x19D0, 0x19D9, 0x1B50, 0x1B59, 0x1BB0, 0x1BB9,
	0x1C40, 0x1C49, 0x1C50, 0x1C59, 0xA620, 0xA629, 0xA8D0, 0xA8D9,
	0xA900, 0xA909, 0xAA50, 0xAA59, 0xFF10, 0xFF19, 0x104A1, 0x1D7CE,
}

// testLetter lists runes for which TestDigit expects IsDigit to return false.
var testLetter = []rune{
	0x0041, 0x0061, 0x00AA, 0x00BA, 0x00C8, 0x00DB, 0x00F9, 0x02EC,
	0x0535, 0x06E6, 0x093D, 0x0A15, 0x0B99, 0x0DC0, 0x0EDD, 0x1000,
	0x1200, 0x1312, 0x1401, 0x1885, 0x2C00, 0xA800, 0xF900, 0xFA30,
	0xFFDA, 0xFFDC, 0x10000, 0x10300, 0x10400, 0x20000, 0x2F800, 0x2FA1D,
}
|
||||
|
||||
func TestDigit(t *testing.T) {
|
||||
for _, r := range testDigit {
|
||||
if !IsDigit(r) {
|
||||
t.Errorf("IsDigit(U+%04X) = false, want true", r)
|
||||
}
|
||||
}
|
||||
for _, r := range testLetter {
|
||||
if IsDigit(r) {
|
||||
t.Errorf("IsDigit(U+%04X) = true, want false", r)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestDigitOptimization checks, for every Latin-1 code point, that the
// special case in IsDigit agrees with a lookup in the Digit table.
func TestDigitOptimization(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		fromTable, fromFastPath := Is(Digit, r), IsDigit(r)
		if fromTable != fromFastPath {
			t.Errorf("IsDigit(U+%04X) disagrees with Is(Digit)", r)
		}
	}
}
|
||||
256
src/unicode/example_test.go
Normal file
256
src/unicode/example_test.go
Normal file
@@ -0,0 +1,256 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package unicode_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// Functions whose names start with "Is" report which range tables a rune
// belongs to. Note that a rune may belong to more than one table.
func Example_is() {
	// Each entry pairs a predicate with the line printed when it holds.
	// The entries are applied in order, which fixes the order of the output.
	checks := []struct {
		holds func(rune) bool
		line  string
	}{
		{unicode.IsControl, "\tis control rune"},
		{unicode.IsDigit, "\tis digit rune"},
		{unicode.IsGraphic, "\tis graphic rune"},
		{unicode.IsLetter, "\tis letter rune"},
		{unicode.IsLower, "\tis lower case rune"},
		{unicode.IsMark, "\tis mark rune"},
		{unicode.IsNumber, "\tis number rune"},
		{unicode.IsPrint, "\tis printable rune"},
		{func(r rune) bool { return !unicode.IsPrint(r) }, "\tis not printable rune"},
		{unicode.IsPunct, "\tis punct rune"},
		{unicode.IsSpace, "\tis space rune"},
		{unicode.IsSymbol, "\tis symbol rune"},
		{unicode.IsTitle, "\tis title case rune"},
		{unicode.IsUpper, "\tis upper case rune"},
	}

	// constant with mixed type runes
	const mixed = "\b5Ὂg̀9! ℃ᾭG"
	for _, c := range mixed {
		fmt.Printf("For %q:\n", c)
		for _, check := range checks {
			if check.holds(c) {
				fmt.Println(check.line)
			}
		}
	}

	// Output:
	// For '\b':
	// 	is control rune
	// 	is not printable rune
	// For '5':
	// 	is digit rune
	// 	is graphic rune
	// 	is number rune
	// 	is printable rune
	// For 'Ὂ':
	// 	is graphic rune
	// 	is letter rune
	// 	is printable rune
	// 	is upper case rune
	// For 'g':
	// 	is graphic rune
	// 	is letter rune
	// 	is lower case rune
	// 	is printable rune
	// For '̀':
	// 	is graphic rune
	// 	is mark rune
	// 	is printable rune
	// For '9':
	// 	is digit rune
	// 	is graphic rune
	// 	is number rune
	// 	is printable rune
	// For '!':
	// 	is graphic rune
	// 	is printable rune
	// 	is punct rune
	// For ' ':
	// 	is graphic rune
	// 	is printable rune
	// 	is space rune
	// For '℃':
	// 	is graphic rune
	// 	is printable rune
	// 	is symbol rune
	// For 'ᾭ':
	// 	is graphic rune
	// 	is letter rune
	// 	is printable rune
	// 	is title case rune
	// For 'G':
	// 	is graphic rune
	// 	is letter rune
	// 	is printable rune
	// 	is upper case rune
}
|
||||
|
||||
func ExampleSimpleFold() {
	// Each input is printed as the rune SimpleFold maps it to. 'K', 'k' and
	// the Kelvin sign U+212A fold to one another in turn; '1' has no other
	// case and folds to itself.
	for _, r := range []rune{'A', 'a', 'K', 'k', '\u212A', '1'} {
		fmt.Printf("%#U\n", unicode.SimpleFold(r))
	}

	// Output:
	// U+0061 'a'
	// U+0041 'A'
	// U+006B 'k'
	// U+212A 'K'
	// U+004B 'K'
	// U+0031 '1'
}
|
||||
|
||||
func ExampleTo() {
	// Map a lower-case rune, then an upper-case rune, to each of the three
	// cases in the order upper, lower, title.
	targets := []int{unicode.UpperCase, unicode.LowerCase, unicode.TitleCase}
	for _, r := range []rune{'g', 'G'} {
		for _, target := range targets {
			fmt.Printf("%#U\n", unicode.To(target, r))
		}
	}

	// Output:
	// U+0047 'G'
	// U+0067 'g'
	// U+0047 'G'
	// U+0047 'G'
	// U+0067 'g'
	// U+0047 'G'
}
|
||||
|
||||
func ExampleToLower() {
	lower := unicode.ToLower('G')
	fmt.Printf("%#U\n", lower)

	// Output:
	// U+0067 'g'
}
|
||||
func ExampleToTitle() {
	title := unicode.ToTitle('g')
	fmt.Printf("%#U\n", title)

	// Output:
	// U+0047 'G'
}
|
||||
|
||||
func ExampleToUpper() {
	upper := unicode.ToUpper('g')
	fmt.Printf("%#U\n", upper)

	// Output:
	// U+0047 'G'
}
|
||||
|
||||
func ExampleSpecialCase() {
	t := unicode.TurkishCase

	// Under the Turkish rules, lower-case i and upper-case İ (U+0130) form a
	// case pair; print the lower, title and upper mapping of each.
	for _, r := range []rune{'i', 'İ'} {
		fmt.Printf("%#U\n", t.ToLower(r))
		fmt.Printf("%#U\n", t.ToTitle(r))
		fmt.Printf("%#U\n", t.ToUpper(r))
	}

	// Output:
	// U+0069 'i'
	// U+0130 'İ'
	// U+0130 'İ'
	// U+0069 'i'
	// U+0130 'İ'
	// U+0130 'İ'
}
|
||||
|
||||
func ExampleIsDigit() {
	// A Bengali digit followed by a Latin letter.
	for _, r := range []rune{'৩', 'A'} {
		fmt.Printf("%t\n", unicode.IsDigit(r))
	}
	// Output:
	// true
	// false
}
|
||||
|
||||
func ExampleIsNumber() {
	// A Roman numeral followed by a Latin letter.
	for _, r := range []rune{'Ⅷ', 'A'} {
		fmt.Printf("%t\n", unicode.IsNumber(r))
	}
	// Output:
	// true
	// false
}
|
||||
|
||||
func ExampleIsLetter() {
	// A letter followed by a digit.
	for _, r := range []rune{'A', '7'} {
		fmt.Printf("%t\n", unicode.IsLetter(r))
	}
	// Output:
	// true
	// false
}
|
||||
|
||||
func ExampleIsLower() {
	// A lower-case letter followed by its upper-case form.
	for _, r := range []rune{'a', 'A'} {
		fmt.Printf("%t\n", unicode.IsLower(r))
	}
	// Output:
	// true
	// false
}
|
||||
|
||||
func ExampleIsUpper() {
	// An upper-case letter followed by its lower-case form.
	for _, r := range []rune{'A', 'a'} {
		fmt.Printf("%t\n", unicode.IsUpper(r))
	}
	// Output:
	// true
	// false
}
|
||||
|
||||
func ExampleIsTitle() {
	// U+01C5 is written as an escape so that the literal stays a single
	// code point; 'a' is an ordinary lower-case letter.
	for _, r := range []rune{'\u01C5', 'a'} {
		fmt.Printf("%t\n", unicode.IsTitle(r))
	}
	// Output:
	// true
	// false
}
|
||||
|
||||
func ExampleIsSpace() {
	// Three white-space runes followed by a letter.
	for _, r := range []rune{' ', '\n', '\t', 'a'} {
		fmt.Printf("%t\n", unicode.IsSpace(r))
	}
	// Output:
	// true
	// true
	// true
	// false
}
|
||||
146
src/unicode/graphic.go
Normal file
146
src/unicode/graphic.go
Normal file
@@ -0,0 +1,146 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package unicode
|
||||
|
||||
// Bit masks for each code point under U+0100, for fast lookup.
// pC through pp are consecutive single-bit flags produced by iota; pg, pLo
// and pLmask are built by OR-ing those flags together.
const (
	pC     = 1 << iota // a control character.
	pP                 // a punctuation character.
	pN                 // a numeral.
	pS                 // a symbolic character.
	pZ                 // a spacing character.
	pLu                // an upper-case letter.
	pLl                // a lower-case letter.
	pp                 // a printable character according to Go's definition.
	pg     = pp | pZ   // a graphical character according to the Unicode definition.
	pLo    = pLl | pLu // a letter that is neither upper nor lower case.
	pLmask = pLo       // mask covering both letter-case bits.
)
|
||||
|
||||
// GraphicRanges defines the set of graphic characters according to Unicode.
// It is the list of tables that IsGraphic consults for runes above MaxLatin1.
var GraphicRanges = []*RangeTable{
	L, M, N, P, S, Zs,
}

// PrintRanges defines the set of printable characters according to Go.
// ASCII space, U+0020, is handled separately.
// It is the list of tables that IsPrint consults for runes above MaxLatin1.
var PrintRanges = []*RangeTable{
	L, M, N, P, S,
}
|
||||
|
||||
// IsGraphic reports whether the rune is defined as a Graphic by Unicode.
|
||||
// Such characters include letters, marks, numbers, punctuation, symbols, and
|
||||
// spaces, from categories [L], [M], [N], [P], [S], [Zs].
|
||||
func IsGraphic(r rune) bool {
|
||||
// We convert to uint32 to avoid the extra test for negative,
|
||||
// and in the index we convert to uint8 to avoid the range check.
|
||||
if uint32(r) <= MaxLatin1 {
|
||||
return properties[uint8(r)]&pg != 0
|
||||
}
|
||||
return In(r, GraphicRanges...)
|
||||
}
|
||||
|
||||
// IsPrint reports whether the rune is defined as printable by Go. Such
|
||||
// characters include letters, marks, numbers, punctuation, symbols, and the
|
||||
// ASCII space character, from categories [L], [M], [N], [P], [S] and the ASCII space
|
||||
// character. This categorization is the same as [IsGraphic] except that the
|
||||
// only spacing character is ASCII space, U+0020.
|
||||
func IsPrint(r rune) bool {
|
||||
if uint32(r) <= MaxLatin1 {
|
||||
return properties[uint8(r)]&pp != 0
|
||||
}
|
||||
return In(r, PrintRanges...)
|
||||
}
|
||||
|
||||
// IsOneOf reports whether the rune is a member of one of the ranges.
|
||||
// The function "In" provides a nicer signature and should be used in preference to IsOneOf.
|
||||
func IsOneOf(ranges []*RangeTable, r rune) bool {
|
||||
for _, inside := range ranges {
|
||||
if Is(inside, r) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// In reports whether the rune is a member of one of the ranges.
|
||||
func In(r rune, ranges ...*RangeTable) bool {
|
||||
for _, inside := range ranges {
|
||||
if Is(inside, r) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// IsControl reports whether the rune is a control character.
|
||||
// The [C] ([Other]) Unicode category includes more code points
|
||||
// such as surrogates; use [Is](C, r) to test for them.
|
||||
func IsControl(r rune) bool {
|
||||
if uint32(r) <= MaxLatin1 {
|
||||
return properties[uint8(r)]&pC != 0
|
||||
}
|
||||
// All control characters are < MaxLatin1.
|
||||
return false
|
||||
}
|
||||
|
||||
// IsLetter reports whether the rune is a letter (category [L]).
|
||||
func IsLetter(r rune) bool {
|
||||
if uint32(r) <= MaxLatin1 {
|
||||
return properties[uint8(r)]&(pLmask) != 0
|
||||
}
|
||||
return isExcludingLatin(Letter, r)
|
||||
}
|
||||
|
||||
// IsMark reports whether the rune is a mark character (category [M]).
|
||||
func IsMark(r rune) bool {
|
||||
// There are no mark characters in Latin-1.
|
||||
return isExcludingLatin(Mark, r)
|
||||
}
|
||||
|
||||
// IsNumber reports whether the rune is a number (category [N]).
|
||||
func IsNumber(r rune) bool {
|
||||
if uint32(r) <= MaxLatin1 {
|
||||
return properties[uint8(r)]&pN != 0
|
||||
}
|
||||
return isExcludingLatin(Number, r)
|
||||
}
|
||||
|
||||
// IsPunct reports whether the rune is a Unicode punctuation character
|
||||
// (category [P]).
|
||||
func IsPunct(r rune) bool {
|
||||
if uint32(r) <= MaxLatin1 {
|
||||
return properties[uint8(r)]&pP != 0
|
||||
}
|
||||
return Is(Punct, r)
|
||||
}
|
||||
|
||||
// IsSpace reports whether the rune is a space character as defined
|
||||
// by Unicode's White Space property; in the Latin-1 space
|
||||
// this is
|
||||
//
|
||||
// '\t', '\n', '\v', '\f', '\r', ' ', U+0085 (NEL), U+00A0 (NBSP).
|
||||
//
|
||||
// Other definitions of spacing characters are set by category
|
||||
// Z and property [Pattern_White_Space].
|
||||
func IsSpace(r rune) bool {
|
||||
// This property isn't the same as Z; special-case it.
|
||||
if uint32(r) <= MaxLatin1 {
|
||||
switch r {
|
||||
case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
return isExcludingLatin(White_Space, r)
|
||||
}
|
||||
|
||||
// IsSymbol reports whether the rune is a symbolic character.
|
||||
func IsSymbol(r rune) bool {
|
||||
if uint32(r) <= MaxLatin1 {
|
||||
return properties[uint8(r)]&pS != 0
|
||||
}
|
||||
return isExcludingLatin(Symbol, r)
|
||||
}
|
||||
122
src/unicode/graphic_test.go
Normal file
122
src/unicode/graphic_test.go
Normal file
@@ -0,0 +1,122 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package unicode_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
. "unicode"
|
||||
)
|
||||
|
||||
// Independently check that the special "Is" functions work
|
||||
// in the Latin-1 range through the property table.
|
||||
|
||||
// TestIsControlLatin1 compares IsControl, for every Latin-1 code point,
// with the two ranges U+0000-U+001F and U+007F-U+009F.
func TestIsControlLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		want := (0x00 <= r && r <= 0x1F) || (0x7F <= r && r <= 0x9F)
		if got := IsControl(r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
|
||||
|
||||
// TestIsLetterLatin1 checks that IsLetter agrees with the Letter table for
// every Latin-1 code point.
func TestIsLetterLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsLetter(r), Is(Letter, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
|
||||
|
||||
// TestIsUpperLatin1 checks that IsUpper agrees with the Upper table for
// every Latin-1 code point.
func TestIsUpperLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsUpper(r), Is(Upper, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
|
||||
|
||||
// TestIsLowerLatin1 checks that IsLower agrees with the Lower table for
// every Latin-1 code point.
func TestIsLowerLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsLower(r), Is(Lower, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
|
||||
|
||||
// TestNumberLatin1 checks that IsNumber agrees with the Number table for
// every Latin-1 code point.
func TestNumberLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsNumber(r), Is(Number, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
|
||||
|
||||
// TestIsPrintLatin1 checks that IsPrint agrees with PrintRanges for every
// Latin-1 code point. ASCII space is expected to be printable even though
// PrintRanges does not cover it.
func TestIsPrintLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		want := r == ' ' || In(r, PrintRanges...)
		if got := IsPrint(r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
|
||||
|
||||
// TestIsGraphicLatin1 checks that IsGraphic agrees with GraphicRanges for
// every Latin-1 code point.
func TestIsGraphicLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsGraphic(r), In(r, GraphicRanges...); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
|
||||
|
||||
// TestIsPunctLatin1 checks that IsPunct agrees with the Punct table for
// every Latin-1 code point.
func TestIsPunctLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsPunct(r), Is(Punct, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
|
||||
|
||||
// TestIsSpaceLatin1 checks that IsSpace agrees with the White_Space table
// for every Latin-1 code point.
func TestIsSpaceLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsSpace(r), Is(White_Space, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
|
||||
|
||||
// TestIsSymbolLatin1 checks that IsSymbol agrees with the Symbol table for
// every Latin-1 code point.
func TestIsSymbolLatin1(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if got, want := IsSymbol(r), Is(Symbol, r); got != want {
			t.Errorf("%U incorrect: got %t; want %t", r, got, want)
		}
	}
}
|
||||
371
src/unicode/letter.go
Normal file
371
src/unicode/letter.go
Normal file
@@ -0,0 +1,371 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package unicode provides data and functions to test some properties of
|
||||
// Unicode code points.
|
||||
package unicode
|
||||
|
||||
// Boundary and sentinel code points used throughout the package.
const (
	MaxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
	ReplacementChar = '\uFFFD'     // Represents invalid code points.
	MaxASCII        = '\u007F'     // Maximum ASCII value.
	MaxLatin1       = '\u00FF'     // Maximum Latin-1 value.
)
|
||||
|
||||
// RangeTable defines a set of Unicode code points by listing the ranges of
// code points within the set. The ranges are listed in two slices
// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
// The two slices must be in sorted order and non-overlapping.
// Also, R32 should contain only values >= 0x10000 (1<<16).
type RangeTable struct {
	R16         []Range16
	R32         []Range32
	LatinOffset int // number of entries in R16 with Hi <= MaxLatin1
}
|
||||
|
||||
// Range16 represents a range of 16-bit Unicode code points. The range runs
// from Lo to Hi inclusive and has the specified stride.
type Range16 struct {
	Lo, Hi, Stride uint16
}
|
||||
|
||||
// Range32 represents a range of Unicode code points and is used when one or
// more of the values will not fit in 16 bits. The range runs from Lo to Hi
// inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
type Range32 struct {
	Lo, Hi, Stride uint32
}
|
||||
|
||||
// CaseRange represents a range of Unicode code points for simple (one
|
||||
// code point to one code point) case conversion.
|
||||
// The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
|
||||
// are the number to add to the code point to reach the code point for a
|
||||
// different case for that character. They may be negative. If zero, it
|
||||
// means the character is in the corresponding case. There is a special
|
||||
// case representing sequences of alternating corresponding Upper and Lower
|
||||
// pairs. It appears with a fixed Delta of
|
||||
//
|
||||
// {UpperLower, UpperLower, UpperLower}
|
||||
//
|
||||
// The constant UpperLower has an otherwise impossible delta value.
|
||||
type CaseRange struct {
|
||||
Lo uint32
|
||||
Hi uint32
|
||||
Delta d
|
||||
}
|
||||
|
||||
// SpecialCase represents language-specific case mappings such as Turkish.
// Methods of SpecialCase customize (by overriding) the standard mappings:
// a rune covered by the table uses the table's mapping, and any other rune
// falls back to the standard one.
type SpecialCase []CaseRange
|
||||
|
||||
// BUG(r): There is no mechanism for full case folding, that is, for
|
||||
// characters that involve multiple runes in the input or output.
|
||||
|
||||
// Indices into the Delta arrays inside CaseRanges for case mapping.
// MaxCase is the number of cases and therefore the length of a Delta array.
const (
	UpperCase = iota
	LowerCase
	TitleCase
	MaxCase
)

type d [MaxCase]rune // to make the CaseRanges text shorter

// If the Delta field of a [CaseRange] is UpperLower, it means
// this CaseRange represents a sequence of the form (say)
// [Upper] [Lower] [Upper] [Lower].
const (
	UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
)

// linearMax is the maximum size table for linear search for non-Latin1 rune.
// Derived by running 'go test -calibrate'.
const linearMax = 18
|
||||
|
||||
// is16 reports whether r is in the sorted slice of 16-bit ranges.
|
||||
func is16(ranges []Range16, r uint16) bool {
|
||||
if len(ranges) <= linearMax || r <= MaxLatin1 {
|
||||
for i := range ranges {
|
||||
range_ := &ranges[i]
|
||||
if r < range_.Lo {
|
||||
return false
|
||||
}
|
||||
if r <= range_.Hi {
|
||||
return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// binary search over ranges
|
||||
lo := 0
|
||||
hi := len(ranges)
|
||||
for lo < hi {
|
||||
m := int(uint(lo+hi) >> 1)
|
||||
range_ := &ranges[m]
|
||||
if range_.Lo <= r && r <= range_.Hi {
|
||||
return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
|
||||
}
|
||||
if r < range_.Lo {
|
||||
hi = m
|
||||
} else {
|
||||
lo = m + 1
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// is32 reports whether r is in the sorted slice of 32-bit ranges.
|
||||
func is32(ranges []Range32, r uint32) bool {
|
||||
if len(ranges) <= linearMax {
|
||||
for i := range ranges {
|
||||
range_ := &ranges[i]
|
||||
if r < range_.Lo {
|
||||
return false
|
||||
}
|
||||
if r <= range_.Hi {
|
||||
return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// binary search over ranges
|
||||
lo := 0
|
||||
hi := len(ranges)
|
||||
for lo < hi {
|
||||
m := int(uint(lo+hi) >> 1)
|
||||
range_ := ranges[m]
|
||||
if range_.Lo <= r && r <= range_.Hi {
|
||||
return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
|
||||
}
|
||||
if r < range_.Lo {
|
||||
hi = m
|
||||
} else {
|
||||
lo = m + 1
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Is reports whether the rune is in the specified table of ranges.
|
||||
func Is(rangeTab *RangeTable, r rune) bool {
|
||||
r16 := rangeTab.R16
|
||||
// Compare as uint32 to correctly handle negative runes.
|
||||
if len(r16) > 0 && uint32(r) <= uint32(r16[len(r16)-1].Hi) {
|
||||
return is16(r16, uint16(r))
|
||||
}
|
||||
r32 := rangeTab.R32
|
||||
if len(r32) > 0 && r >= rune(r32[0].Lo) {
|
||||
return is32(r32, uint32(r))
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func isExcludingLatin(rangeTab *RangeTable, r rune) bool {
|
||||
r16 := rangeTab.R16
|
||||
// Compare as uint32 to correctly handle negative runes.
|
||||
if off := rangeTab.LatinOffset; len(r16) > off && uint32(r) <= uint32(r16[len(r16)-1].Hi) {
|
||||
return is16(r16[off:], uint16(r))
|
||||
}
|
||||
r32 := rangeTab.R32
|
||||
if len(r32) > 0 && r >= rune(r32[0].Lo) {
|
||||
return is32(r32, uint32(r))
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// IsUpper reports whether the rune is an upper case letter.
|
||||
func IsUpper(r rune) bool {
|
||||
// See comment in IsGraphic.
|
||||
if uint32(r) <= MaxLatin1 {
|
||||
return properties[uint8(r)]&pLmask == pLu
|
||||
}
|
||||
return isExcludingLatin(Upper, r)
|
||||
}
|
||||
|
||||
// IsLower reports whether the rune is a lower case letter.
|
||||
func IsLower(r rune) bool {
|
||||
// See comment in IsGraphic.
|
||||
if uint32(r) <= MaxLatin1 {
|
||||
return properties[uint8(r)]&pLmask == pLl
|
||||
}
|
||||
return isExcludingLatin(Lower, r)
|
||||
}
|
||||
|
||||
// IsTitle reports whether the rune is a title case letter.
|
||||
func IsTitle(r rune) bool {
|
||||
if r <= MaxLatin1 {
|
||||
return false
|
||||
}
|
||||
return isExcludingLatin(Title, r)
|
||||
}
|
||||
|
||||
// to maps the rune using the specified case mapping.
|
||||
// It additionally reports whether caseRange contained a mapping for r.
|
||||
func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) {
|
||||
if _case < 0 || MaxCase <= _case {
|
||||
return ReplacementChar, false // as reasonable an error as any
|
||||
}
|
||||
// binary search over ranges
|
||||
lo := 0
|
||||
hi := len(caseRange)
|
||||
for lo < hi {
|
||||
m := int(uint(lo+hi) >> 1)
|
||||
cr := caseRange[m]
|
||||
if rune(cr.Lo) <= r && r <= rune(cr.Hi) {
|
||||
delta := cr.Delta[_case]
|
||||
if delta > MaxRune {
|
||||
// In an Upper-Lower sequence, which always starts with
|
||||
// an UpperCase letter, the real deltas always look like:
|
||||
// {0, 1, 0} UpperCase (Lower is next)
|
||||
// {-1, 0, -1} LowerCase (Upper, Title are previous)
|
||||
// The characters at even offsets from the beginning of the
|
||||
// sequence are upper case; the ones at odd offsets are lower.
|
||||
// The correct mapping can be done by clearing or setting the low
|
||||
// bit in the sequence offset.
|
||||
// The constants UpperCase and TitleCase are even while LowerCase
|
||||
// is odd so we take the low bit from _case.
|
||||
return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1)), true
|
||||
}
|
||||
return r + delta, true
|
||||
}
|
||||
if r < rune(cr.Lo) {
|
||||
hi = m
|
||||
} else {
|
||||
lo = m + 1
|
||||
}
|
||||
}
|
||||
return r, false
|
||||
}
|
||||
|
||||
// To maps the rune to the specified case: [UpperCase], [LowerCase], or [TitleCase].
|
||||
func To(_case int, r rune) rune {
|
||||
r, _ = to(_case, r, CaseRanges)
|
||||
return r
|
||||
}
|
||||
|
||||
// ToUpper maps the rune to upper case.
|
||||
func ToUpper(r rune) rune {
|
||||
if r <= MaxASCII {
|
||||
if 'a' <= r && r <= 'z' {
|
||||
r -= 'a' - 'A'
|
||||
}
|
||||
return r
|
||||
}
|
||||
return To(UpperCase, r)
|
||||
}
|
||||
|
||||
// ToLower maps the rune to lower case.
|
||||
func ToLower(r rune) rune {
|
||||
if r <= MaxASCII {
|
||||
if 'A' <= r && r <= 'Z' {
|
||||
r += 'a' - 'A'
|
||||
}
|
||||
return r
|
||||
}
|
||||
return To(LowerCase, r)
|
||||
}
|
||||
|
||||
// ToTitle maps the rune to title case.
|
||||
func ToTitle(r rune) rune {
|
||||
if r <= MaxASCII {
|
||||
if 'a' <= r && r <= 'z' { // title case is upper case for ASCII
|
||||
r -= 'a' - 'A'
|
||||
}
|
||||
return r
|
||||
}
|
||||
return To(TitleCase, r)
|
||||
}
|
||||
|
||||
// ToUpper maps the rune to upper case giving priority to the special mapping.
|
||||
func (special SpecialCase) ToUpper(r rune) rune {
|
||||
r1, hadMapping := to(UpperCase, r, []CaseRange(special))
|
||||
if r1 == r && !hadMapping {
|
||||
r1 = ToUpper(r)
|
||||
}
|
||||
return r1
|
||||
}
|
||||
|
||||
// ToTitle maps the rune to title case giving priority to the special mapping.
|
||||
func (special SpecialCase) ToTitle(r rune) rune {
|
||||
r1, hadMapping := to(TitleCase, r, []CaseRange(special))
|
||||
if r1 == r && !hadMapping {
|
||||
r1 = ToTitle(r)
|
||||
}
|
||||
return r1
|
||||
}
|
||||
|
||||
// ToLower maps the rune to lower case giving priority to the special mapping.
|
||||
func (special SpecialCase) ToLower(r rune) rune {
|
||||
r1, hadMapping := to(LowerCase, r, []CaseRange(special))
|
||||
if r1 == r && !hadMapping {
|
||||
r1 = ToLower(r)
|
||||
}
|
||||
return r1
|
||||
}
|
||||
|
||||
// caseOrbit is defined in tables.go as []foldPair. Right now all the
// entries fit in uint16, so use uint16. If that changes, compilation
// will fail (the constants in the composite literal will not fit in uint16)
// and the types here can change to uint32.
type foldPair struct {
	From, To uint16
}
|
||||
|
||||
// SimpleFold iterates over Unicode code points equivalent under
|
||||
// the Unicode-defined simple case folding. Among the code points
|
||||
// equivalent to rune (including rune itself), SimpleFold returns the
|
||||
// smallest rune > r if one exists, or else the smallest rune >= 0.
|
||||
// If r is not a valid Unicode code point, SimpleFold(r) returns r.
|
||||
//
|
||||
// For example:
|
||||
//
|
||||
// SimpleFold('A') = 'a'
|
||||
// SimpleFold('a') = 'A'
|
||||
//
|
||||
// SimpleFold('K') = 'k'
|
||||
// SimpleFold('k') = '\u212A' (Kelvin symbol, K)
|
||||
// SimpleFold('\u212A') = 'K'
|
||||
//
|
||||
// SimpleFold('1') = '1'
|
||||
//
|
||||
// SimpleFold(-2) = -2
|
||||
func SimpleFold(r rune) rune {
|
||||
if r < 0 || r > MaxRune {
|
||||
return r
|
||||
}
|
||||
|
||||
if int(r) < len(asciiFold) {
|
||||
return rune(asciiFold[r])
|
||||
}
|
||||
|
||||
// Consult caseOrbit table for special cases.
|
||||
lo := 0
|
||||
hi := len(caseOrbit)
|
||||
for lo < hi {
|
||||
m := int(uint(lo+hi) >> 1)
|
||||
if rune(caseOrbit[m].From) < r {
|
||||
lo = m + 1
|
||||
} else {
|
||||
hi = m
|
||||
}
|
||||
}
|
||||
if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r {
|
||||
return rune(caseOrbit[lo].To)
|
||||
}
|
||||
|
||||
// No folding specified. This is a one- or two-element
|
||||
// equivalence class containing rune and ToLower(rune)
|
||||
// and ToUpper(rune) if they are different from rune.
|
||||
if l := ToLower(r); l != r {
|
||||
return l
|
||||
}
|
||||
return ToUpper(r)
|
||||
}
|
||||
644
src/unicode/letter_test.go
Normal file
644
src/unicode/letter_test.go
Normal file
@@ -0,0 +1,644 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package unicode_test
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strings"
|
||||
"testing"
|
||||
. "unicode"
|
||||
)
|
||||
|
||||
// upperTest lists runes that TestIsUpper and TestIsLetter expect to be
// reported as upper case letters.
var upperTest = []rune{
	0x41, 0xc0, 0xd8, 0x100, 0x139, 0x14a, 0x178, 0x181,
	0x376, 0x3cf, 0x13bd, 0x1f2a, 0x2102, 0x2c00, 0x2c10, 0x2c20,
	0xa650, 0xa722, 0xff3a, 0x10400, 0x1d400, 0x1d7ca,
}
|
||||
|
||||
// notupperTest lists runes that TestIsUpper expects IsUpper to reject.
var notupperTest = []rune{
	0x40, 0x5b, 0x61, 0x185, 0x1b0, 0x377,
	0x387, 0x2150, 0xab7d, 0xffff, 0x10000,
}
|
||||
|
||||
// letterTest lists runes that TestIsLetter expects IsLetter to accept;
// TestIsSpace reuses it as a set of non-space runes.
var letterTest = []rune{
	0x41, 0x61, 0xaa, 0xba, 0xc8, 0xdb, 0xf9, 0x2ec,
	0x535, 0x620, 0x6e6, 0x93d, 0xa15, 0xb99, 0xdc0, 0xedd,
	0x1000, 0x1200, 0x1312, 0x1401, 0x2c00, 0xa800, 0xf900, 0xfa30,
	0xffda, 0xffdc, 0x10000, 0x10300, 0x10400, 0x20000, 0x2f800, 0x2fa1d,
}
|
||||
|
||||
// notletterTest lists runes that TestIsLetter and TestIsUpper expect to be
// rejected as letters.
var notletterTest = []rune{
	0x20, 0x35, 0x375, 0x619, 0x700,
	0x1885, 0xfffe, 0x1ffff, 0x10ffff,
}
|
||||
|
||||
// spaceTest lists runes that TestIsSpace expects IsSpace to accept.
// It contains all the special cased Latin-1 chars.
var spaceTest = []rune{
	0x09, 0x0a, 0x0b, 0x0c, 0x0d,
	0x20, 0x85, 0xA0, 0x2000, 0x3000,
}
|
||||
|
||||
// caseT is one case-mapping expectation: mapping in under case selector
// cas must yield out.
type caseT struct {
	cas int
	in  rune
	out rune
}
|
||||
|
||||
var caseTest = []caseT{
|
||||
// errors
|
||||
{-1, '\n', 0xFFFD},
|
||||
{UpperCase, -1, -1},
|
||||
{UpperCase, 1 << 30, 1 << 30},
|
||||
|
||||
// ASCII (special-cased so test carefully)
|
||||
{UpperCase, '\n', '\n'},
|
||||
{UpperCase, 'a', 'A'},
|
||||
{UpperCase, 'A', 'A'},
|
||||
{UpperCase, '7', '7'},
|
||||
{LowerCase, '\n', '\n'},
|
||||
{LowerCase, 'a', 'a'},
|
||||
{LowerCase, 'A', 'a'},
|
||||
{LowerCase, '7', '7'},
|
||||
{TitleCase, '\n', '\n'},
|
||||
{TitleCase, 'a', 'A'},
|
||||
{TitleCase, 'A', 'A'},
|
||||
{TitleCase, '7', '7'},
|
||||
|
||||
// Latin-1: easy to read the tests!
|
||||
{UpperCase, 0x80, 0x80},
|
||||
{UpperCase, 'Å', 'Å'},
|
||||
{UpperCase, 'å', 'Å'},
|
||||
{LowerCase, 0x80, 0x80},
|
||||
{LowerCase, 'Å', 'å'},
|
||||
{LowerCase, 'å', 'å'},
|
||||
{TitleCase, 0x80, 0x80},
|
||||
{TitleCase, 'Å', 'Å'},
|
||||
{TitleCase, 'å', 'Å'},
|
||||
|
||||
// 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049
|
||||
{UpperCase, 0x0131, 'I'},
|
||||
{LowerCase, 0x0131, 0x0131},
|
||||
{TitleCase, 0x0131, 'I'},
|
||||
|
||||
// 0133;LATIN SMALL LIGATURE IJ;Ll;0;L;<compat> 0069 006A;;;;N;LATIN SMALL LETTER I J;;0132;;0132
|
||||
{UpperCase, 0x0133, 0x0132},
|
||||
{LowerCase, 0x0133, 0x0133},
|
||||
{TitleCase, 0x0133, 0x0132},
|
||||
|
||||
// 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B;
|
||||
{UpperCase, 0x212A, 0x212A},
|
||||
{LowerCase, 0x212A, 'k'},
|
||||
{TitleCase, 0x212A, 0x212A},
|
||||
|
||||
// From an UpperLower sequence
|
||||
// A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641;
|
||||
{UpperCase, 0xA640, 0xA640},
|
||||
{LowerCase, 0xA640, 0xA641},
|
||||
{TitleCase, 0xA640, 0xA640},
|
||||
// A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640
|
||||
{UpperCase, 0xA641, 0xA640},
|
||||
{LowerCase, 0xA641, 0xA641},
|
||||
{TitleCase, 0xA641, 0xA640},
|
||||
// A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F;
|
||||
{UpperCase, 0xA64E, 0xA64E},
|
||||
{LowerCase, 0xA64E, 0xA64F},
|
||||
{TitleCase, 0xA64E, 0xA64E},
|
||||
// A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E
|
||||
{UpperCase, 0xA65F, 0xA65E},
|
||||
{LowerCase, 0xA65F, 0xA65F},
|
||||
{TitleCase, 0xA65F, 0xA65E},
|
||||
|
||||
// From another UpperLower sequence
|
||||
// 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A;
|
||||
{UpperCase, 0x0139, 0x0139},
|
||||
{LowerCase, 0x0139, 0x013A},
|
||||
{TitleCase, 0x0139, 0x0139},
|
||||
// 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L;<compat> 004C 00B7;;;;N;;;;0140;
|
||||
{UpperCase, 0x013f, 0x013f},
|
||||
{LowerCase, 0x013f, 0x0140},
|
||||
{TitleCase, 0x013f, 0x013f},
|
||||
// 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N HACEK;;0147;;0147
|
||||
{UpperCase, 0x0148, 0x0147},
|
||||
{LowerCase, 0x0148, 0x0148},
|
||||
{TitleCase, 0x0148, 0x0147},
|
||||
|
||||
// Lowercase lower than uppercase.
|
||||
// AB78;CHEROKEE SMALL LETTER GE;Ll;0;L;;;;;N;;;13A8;;13A8
|
||||
{UpperCase, 0xab78, 0x13a8},
|
||||
{LowerCase, 0xab78, 0xab78},
|
||||
{TitleCase, 0xab78, 0x13a8},
|
||||
{UpperCase, 0x13a8, 0x13a8},
|
||||
{LowerCase, 0x13a8, 0xab78},
|
||||
{TitleCase, 0x13a8, 0x13a8},
|
||||
|
||||
// Last block in the 5.1.0 table
|
||||
// 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428;
|
||||
{UpperCase, 0x10400, 0x10400},
|
||||
{LowerCase, 0x10400, 0x10428},
|
||||
{TitleCase, 0x10400, 0x10400},
|
||||
// 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F;
|
||||
{UpperCase, 0x10427, 0x10427},
|
||||
{LowerCase, 0x10427, 0x1044F},
|
||||
{TitleCase, 0x10427, 0x10427},
|
||||
// 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400
|
||||
{UpperCase, 0x10428, 0x10400},
|
||||
{LowerCase, 0x10428, 0x10428},
|
||||
{TitleCase, 0x10428, 0x10400},
|
||||
// 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427
|
||||
{UpperCase, 0x1044F, 0x10427},
|
||||
{LowerCase, 0x1044F, 0x1044F},
|
||||
{TitleCase, 0x1044F, 0x10427},
|
||||
|
||||
// First one not in the 5.1.0 table
|
||||
// 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;;
|
||||
{UpperCase, 0x10450, 0x10450},
|
||||
{LowerCase, 0x10450, 0x10450},
|
||||
{TitleCase, 0x10450, 0x10450},
|
||||
|
||||
// Non-letters with case.
|
||||
{LowerCase, 0x2161, 0x2171},
|
||||
{UpperCase, 0x0345, 0x0399},
|
||||
}
|
||||
|
||||
func TestIsLetter(t *testing.T) {
|
||||
for _, r := range upperTest {
|
||||
if !IsLetter(r) {
|
||||
t.Errorf("IsLetter(U+%04X) = false, want true", r)
|
||||
}
|
||||
}
|
||||
for _, r := range letterTest {
|
||||
if !IsLetter(r) {
|
||||
t.Errorf("IsLetter(U+%04X) = false, want true", r)
|
||||
}
|
||||
}
|
||||
for _, r := range notletterTest {
|
||||
if IsLetter(r) {
|
||||
t.Errorf("IsLetter(U+%04X) = true, want false", r)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsUpper(t *testing.T) {
|
||||
for _, r := range upperTest {
|
||||
if !IsUpper(r) {
|
||||
t.Errorf("IsUpper(U+%04X) = false, want true", r)
|
||||
}
|
||||
}
|
||||
for _, r := range notupperTest {
|
||||
if IsUpper(r) {
|
||||
t.Errorf("IsUpper(U+%04X) = true, want false", r)
|
||||
}
|
||||
}
|
||||
for _, r := range notletterTest {
|
||||
if IsUpper(r) {
|
||||
t.Errorf("IsUpper(U+%04X) = true, want false", r)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// caseString returns the name of the case selector c for use in test
// failure messages, or "ErrorCase" if c is not one of the known selectors.
func caseString(c int) string {
	name := "ErrorCase"
	switch {
	case c == UpperCase:
		name = "UpperCase"
	case c == LowerCase:
		name = "LowerCase"
	case c == TitleCase:
		name = "TitleCase"
	}
	return name
}
|
||||
|
||||
func TestTo(t *testing.T) {
|
||||
for _, c := range caseTest {
|
||||
r := To(c.cas, c.in)
|
||||
if c.out != r {
|
||||
t.Errorf("To(U+%04X, %s) = U+%04X want U+%04X", c.in, caseString(c.cas), r, c.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestToUpperCase(t *testing.T) {
|
||||
for _, c := range caseTest {
|
||||
if c.cas != UpperCase {
|
||||
continue
|
||||
}
|
||||
r := ToUpper(c.in)
|
||||
if c.out != r {
|
||||
t.Errorf("ToUpper(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestToLowerCase(t *testing.T) {
|
||||
for _, c := range caseTest {
|
||||
if c.cas != LowerCase {
|
||||
continue
|
||||
}
|
||||
r := ToLower(c.in)
|
||||
if c.out != r {
|
||||
t.Errorf("ToLower(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestToTitleCase(t *testing.T) {
|
||||
for _, c := range caseTest {
|
||||
if c.cas != TitleCase {
|
||||
continue
|
||||
}
|
||||
r := ToTitle(c.in)
|
||||
if c.out != r {
|
||||
t.Errorf("ToTitle(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsSpace(t *testing.T) {
|
||||
for _, c := range spaceTest {
|
||||
if !IsSpace(c) {
|
||||
t.Errorf("IsSpace(U+%04X) = false; want true", c)
|
||||
}
|
||||
}
|
||||
for _, c := range letterTest {
|
||||
if IsSpace(c) {
|
||||
t.Errorf("IsSpace(U+%04X) = true; want false", c)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestLetterOptimizations checks that the optimizations for IsLetter etc.
// agree with the tables. Only the Latin-1 range needs to be checked.
func TestLetterOptimizations(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		// Predicates against their range tables.
		if IsLetter(r) != Is(Letter, r) {
			t.Errorf("IsLetter(U+%04X) disagrees with Is(Letter)", r)
		}
		if IsUpper(r) != Is(Upper, r) {
			t.Errorf("IsUpper(U+%04X) disagrees with Is(Upper)", r)
		}
		if IsLower(r) != Is(Lower, r) {
			t.Errorf("IsLower(U+%04X) disagrees with Is(Lower)", r)
		}
		if IsTitle(r) != Is(Title, r) {
			t.Errorf("IsTitle(U+%04X) disagrees with Is(Title)", r)
		}
		if IsSpace(r) != Is(White_Space, r) {
			t.Errorf("IsSpace(U+%04X) disagrees with Is(White_Space)", r)
		}

		// Case conversions against the generic To.
		if ToUpper(r) != To(UpperCase, r) {
			t.Errorf("ToUpper(U+%04X) disagrees with To(Upper)", r)
		}
		if ToLower(r) != To(LowerCase, r) {
			t.Errorf("ToLower(U+%04X) disagrees with To(Lower)", r)
		}
		if ToTitle(r) != To(TitleCase, r) {
			t.Errorf("ToTitle(U+%04X) disagrees with To(Title)", r)
		}
	}
}
|
||||
|
||||
// TestTurkishCase checks the TurkishCase special mappings over the Turkish
// alphabet: lower[i] and upper[i] are the two cases of the same letter,
// including the dotted/dotless i pairs (i/İ and ı/I).
//
// Each conversion is computed once and the computed value is the one
// reported, so a failure message always shows what was actually compared.
func TestTurkishCase(t *testing.T) {
	lower := []rune("abcçdefgğhıijklmnoöprsştuüvyz")
	upper := []rune("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ")
	for i, l := range lower {
		u := upper[i]
		if got := TurkishCase.ToLower(l); got != l {
			t.Errorf("lower(U+%04X) is U+%04X not U+%04X", l, got, l)
		}
		if got := TurkishCase.ToUpper(u); got != u {
			t.Errorf("upper(U+%04X) is U+%04X not U+%04X", u, got, u)
		}
		if got := TurkishCase.ToUpper(l); got != u {
			t.Errorf("upper(U+%04X) is U+%04X not U+%04X", l, got, u)
		}
		if got := TurkishCase.ToLower(u); got != l {
			t.Errorf("lower(U+%04X) is U+%04X not U+%04X", u, got, l)
		}
		if got := TurkishCase.ToTitle(u); got != u {
			t.Errorf("title(U+%04X) is U+%04X not U+%04X", u, got, u)
		}
		if got := TurkishCase.ToTitle(l); got != u {
			t.Errorf("title(U+%04X) is U+%04X not U+%04X", l, got, u)
		}
	}
}
|
||||
|
||||
// simpleFoldTests lists SimpleFold orbits. Each string is one cycle:
// SimpleFold(x) returns the next equivalent rune > x or wraps around to
// smaller values, so folding each rune must yield the following rune and
// folding the last rune must yield the first.
//
// Runes that are visually indistinguishable from another rune, or that
// Unicode normalization would replace, are written as escapes so the
// cycles survive copy/paste and can be reviewed.
var simpleFoldTests = []string{
	// Easy cases.
	"Aa",
	"δΔ",

	// ASCII special cases.
	"Kk\u212A", // K, k, KELVIN SIGN
	"Ss\u017F", // S, s, LATIN SMALL LETTER LONG S

	// Non-ASCII special cases.
	"\u03C1\u03F1\u03A1",       // GREEK SMALL RHO, GREEK RHO SYMBOL, GREEK CAPITAL RHO
	"\u0345\u0399\u03B9\u1FBE", // COMBINING YPOGEGRAMMENI, CAPITAL IOTA, SMALL IOTA, PROSGEGRAMMENI

	// Extra special cases: has lower/upper but no case fold.
	"İ",
	"ı",

	// Upper comes before lower (Cherokee).
	"\u13b0\uab80",
}
|
||||
|
||||
func TestSimpleFold(t *testing.T) {
|
||||
for _, tt := range simpleFoldTests {
|
||||
cycle := []rune(tt)
|
||||
r := cycle[len(cycle)-1]
|
||||
for _, out := range cycle {
|
||||
if r := SimpleFold(r); r != out {
|
||||
t.Errorf("SimpleFold(%#U) = %#U, want %#U", r, r, out)
|
||||
}
|
||||
r = out
|
||||
}
|
||||
}
|
||||
|
||||
if r := SimpleFold(-42); r != -42 {
|
||||
t.Errorf("SimpleFold(-42) = %v, want -42", r)
|
||||
}
|
||||
}
|
||||
|
||||
// Running 'go test -calibrate' runs the calibration to find a plausible
|
||||
// cutoff point for linear search of a range list vs. binary search.
|
||||
// We create a fake table and then time how long it takes to do a
|
||||
// sequence of searches within that table, for all possible inputs
|
||||
// relative to the ranges (something before all, in each, between each, after all).
|
||||
// This assumes that all possible runes are equally likely.
|
||||
// In practice most runes are ASCII so this is a conservative estimate
|
||||
// of an effective cutoff value. In practice we could probably set it higher
|
||||
// than what this function recommends.
|
||||
|
||||
var calibrate = flag.Bool("calibrate", false, "compute crossover for linear vs. binary search")
|
||||
|
||||
func TestCalibrate(t *testing.T) {
|
||||
if !*calibrate {
|
||||
return
|
||||
}
|
||||
|
||||
if runtime.GOARCH == "amd64" {
|
||||
fmt.Printf("warning: running calibration on %s\n", runtime.GOARCH)
|
||||
}
|
||||
|
||||
// Find the point where binary search wins by more than 10%.
|
||||
// The 10% bias gives linear search an edge when they're close,
|
||||
// because on predominantly ASCII inputs linear search is even
|
||||
// better than our benchmarks measure.
|
||||
n := sort.Search(64, func(n int) bool {
|
||||
tab := fakeTable(n)
|
||||
blinear := func(b *testing.B) {
|
||||
tab := tab
|
||||
max := n*5 + 20
|
||||
for i := 0; i < b.N; i++ {
|
||||
for j := 0; j <= max; j++ {
|
||||
linear(tab, uint16(j))
|
||||
}
|
||||
}
|
||||
}
|
||||
bbinary := func(b *testing.B) {
|
||||
tab := tab
|
||||
max := n*5 + 20
|
||||
for i := 0; i < b.N; i++ {
|
||||
for j := 0; j <= max; j++ {
|
||||
binary(tab, uint16(j))
|
||||
}
|
||||
}
|
||||
}
|
||||
bmlinear := testing.Benchmark(blinear)
|
||||
bmbinary := testing.Benchmark(bbinary)
|
||||
fmt.Printf("n=%d: linear=%d binary=%d\n", n, bmlinear.NsPerOp(), bmbinary.NsPerOp())
|
||||
return bmlinear.NsPerOp()*100 > bmbinary.NsPerOp()*110
|
||||
})
|
||||
fmt.Printf("calibration: linear cutoff = %d\n", n)
|
||||
}
|
||||
|
||||
// fakeTable builds a table of n stride-1 ranges for the calibration
// benchmark. Range i covers [10+5i, 12+5i], which leaves gaps between
// consecutive ranges. It returns nil when n <= 0.
func fakeTable(n int) []Range16 {
	var tab []Range16
	for lo := 10; len(tab) < n; lo += 5 {
		tab = append(tab, Range16{Lo: uint16(lo), Hi: uint16(lo + 2), Stride: 1})
	}
	return tab
}
|
||||
|
||||
// linear reports whether r is in the sorted table ranges, scanning the
// ranges from the front and stopping at the first one that starts past r.
func linear(ranges []Range16, r uint16) bool {
	for i := range ranges {
		cur := &ranges[i]
		switch {
		case r < cur.Lo:
			// Every later range starts even higher.
			return false
		case r <= cur.Hi:
			return (r-cur.Lo)%cur.Stride == 0
		}
	}
	return false
}
|
||||
|
||||
// binary reports whether r is in the sorted table ranges, using binary
// search over the ranges.
func binary(ranges []Range16, r uint16) bool {
	left, right := 0, len(ranges)
	for left < right {
		mid := int(uint(left+right) >> 1)
		cur := &ranges[mid]
		switch {
		case r < cur.Lo:
			right = mid
		case r > cur.Hi:
			left = mid + 1
		default:
			// r falls inside this range; membership depends on the stride.
			return (r-cur.Lo)%cur.Stride == 0
		}
	}
	return false
}
|
||||
|
||||
// TestLatinOffset checks, for every table in the exported table maps, that
// LatinOffset equals the number of leading R16 ranges whose Hi is at most
// MaxLatin1.
func TestLatinOffset(t *testing.T) {
	tableSets := []map[string]*RangeTable{
		Categories,
		FoldCategory,
		FoldScript,
		Properties,
		Scripts,
	}
	for _, tables := range tableSets {
		for name, tab := range tables {
			want := 0
			for want < len(tab.R16) && tab.R16[want].Hi <= MaxLatin1 {
				want++
			}
			if got := tab.LatinOffset; got != want {
				t.Errorf("%s: LatinOffset=%d, want %d", name, got, want)
			}
		}
	}
}
|
||||
|
||||
func TestSpecialCaseNoMapping(t *testing.T) {
|
||||
// Issue 25636
|
||||
// no change for rune 'A', zero delta, under upper/lower/title case change.
|
||||
var noChangeForCapitalA = CaseRange{'A', 'A', [MaxCase]rune{0, 0, 0}}
|
||||
got := strings.ToLowerSpecial(SpecialCase([]CaseRange{noChangeForCapitalA}), "ABC")
|
||||
want := "Abc"
|
||||
if got != want {
|
||||
t.Errorf("got %q; want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNegativeRune checks that no classification function accepts a
// negative rune.
func TestNegativeRune(t *testing.T) {
	// Issue 43254
	// These tests cover negative rune handling by testing values which,
	// when cast to uint8 or uint16, look like a particular valid rune.
	// This package has Latin-1-specific optimizations, so we test all of
	// Latin-1 and representative non-Latin-1 values in the character
	// categories covered by IsGraphic, etc.
	nonLatin1 := []uint32{
		// Lu: LATIN CAPITAL LETTER A WITH MACRON
		0x0100,
		// Ll: LATIN SMALL LETTER A WITH MACRON
		0x0101,
		// Lt: LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
		0x01C5,
		// M: COMBINING GRAVE ACCENT
		0x0300,
		// Nd: ARABIC-INDIC DIGIT ZERO
		0x0660,
		// P: GREEK QUESTION MARK
		0x037E,
		// S: MODIFIER LETTER LEFT ARROWHEAD
		0x02C2,
		// Z: OGHAM SPACE MARK
		0x1680,
	}

	// Every Latin-1 code point, U+0000 through U+00FF inclusive, followed
	// by the non-Latin-1 samples. The upper bound is inclusive so that
	// U+00FF itself is covered.
	bases := make([]uint32, 0, MaxLatin1+1+len(nonLatin1))
	for b := uint32(0); b <= MaxLatin1; b++ {
		bases = append(bases, b)
	}
	bases = append(bases, nonLatin1...)

	predicates := []struct {
		name string
		fn   func(rune) bool
	}{
		{"IsControl", IsControl},
		{"IsDigit", IsDigit},
		{"IsGraphic", IsGraphic},
		{"IsLetter", IsLetter},
		{"IsLower", IsLower},
		{"IsMark", IsMark},
		{"IsNumber", IsNumber},
		{"IsPrint", IsPrint},
		{"IsPunct", IsPunct},
		{"IsSpace", IsSpace},
		{"IsSymbol", IsSymbol},
		{"IsTitle", IsTitle},
		{"IsUpper", IsUpper},
	}

	for _, base := range bases {
		// Note r is negative, but uint8(r) == uint8(base) and
		// uint16(r) == uint16(base).
		r := rune(base - 1<<31)
		if Is(Letter, r) {
			t.Errorf("Is(Letter, 0x%x - 1<<31) = true, want false", base)
		}
		for _, p := range predicates {
			if p.fn(r) {
				t.Errorf("%s(0x%x - 1<<31) = true, want false", p.name, base)
			}
		}
	}
}
|
||||
131
src/unicode/script_test.go
Normal file
131
src/unicode/script_test.go
Normal file
@@ -0,0 +1,131 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package unicode_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
. "unicode"
|
||||
)
|
||||
|
||||
// T pairs a rune with the name of a table (a category or property name)
// that is expected to contain it.
type T struct {
	rune   rune
	script string
}
|
||||
|
||||
var inCategoryTest = []T{
|
||||
{0x0081, "Cc"},
|
||||
{0x200B, "Cf"},
|
||||
{0xf0000, "Co"},
|
||||
{0xdb80, "Cs"},
|
||||
{0x0236, "Ll"},
|
||||
{0x1d9d, "Lm"},
|
||||
{0x07cf, "Lo"},
|
||||
{0x1f8a, "Lt"},
|
||||
{0x03ff, "Lu"},
|
||||
{0x0bc1, "Mc"},
|
||||
{0x20df, "Me"},
|
||||
{0x07f0, "Mn"},
|
||||
{0x1bb2, "Nd"},
|
||||
{0x10147, "Nl"},
|
||||
{0x2478, "No"},
|
||||
{0xfe33, "Pc"},
|
||||
{0x2011, "Pd"},
|
||||
{0x301e, "Pe"},
|
||||
{0x2e03, "Pf"},
|
||||
{0x2e02, "Pi"},
|
||||
{0x0022, "Po"},
|
||||
{0x2770, "Ps"},
|
||||
{0x00a4, "Sc"},
|
||||
{0xa711, "Sk"},
|
||||
{0x25f9, "Sm"},
|
||||
{0x2108, "So"},
|
||||
{0x2028, "Zl"},
|
||||
{0x2029, "Zp"},
|
||||
{0x202f, "Zs"},
|
||||
// Unifieds.
|
||||
{0x04aa, "L"},
|
||||
{0x0009, "C"},
|
||||
{0x1712, "M"},
|
||||
{0x0031, "N"},
|
||||
{0x00bb, "P"},
|
||||
{0x00a2, "S"},
|
||||
{0x00a0, "Z"},
|
||||
}
|
||||
|
||||
var inPropTest = []T{
|
||||
{0x0046, "ASCII_Hex_Digit"},
|
||||
{0x200F, "Bidi_Control"},
|
||||
{0x2212, "Dash"},
|
||||
{0xE0001, "Deprecated"},
|
||||
{0x00B7, "Diacritic"},
|
||||
{0x30FE, "Extender"},
|
||||
{0xFF46, "Hex_Digit"},
|
||||
{0x2E17, "Hyphen"},
|
||||
{0x2FFB, "IDS_Binary_Operator"},
|
||||
{0x2FF3, "IDS_Trinary_Operator"},
|
||||
{0xFA6A, "Ideographic"},
|
||||
{0x200D, "Join_Control"},
|
||||
{0x0EC4, "Logical_Order_Exception"},
|
||||
{0x2FFFF, "Noncharacter_Code_Point"},
|
||||
{0x065E, "Other_Alphabetic"},
|
||||
{0x2065, "Other_Default_Ignorable_Code_Point"},
|
||||
{0x0BD7, "Other_Grapheme_Extend"},
|
||||
{0x0387, "Other_ID_Continue"},
|
||||
{0x212E, "Other_ID_Start"},
|
||||
{0x2094, "Other_Lowercase"},
|
||||
{0x2040, "Other_Math"},
|
||||
{0x216F, "Other_Uppercase"},
|
||||
{0x0027, "Pattern_Syntax"},
|
||||
{0x0020, "Pattern_White_Space"},
|
||||
{0x06DD, "Prepended_Concatenation_Mark"},
|
||||
{0x300D, "Quotation_Mark"},
|
||||
{0x2EF3, "Radical"},
|
||||
{0x1f1ff, "Regional_Indicator"},
|
||||
{0x061F, "STerm"}, // Deprecated alias of Sentence_Terminal
|
||||
{0x061F, "Sentence_Terminal"},
|
||||
{0x2071, "Soft_Dotted"},
|
||||
{0x003A, "Terminal_Punctuation"},
|
||||
{0x9FC3, "Unified_Ideograph"},
|
||||
{0xFE0F, "Variation_Selector"},
|
||||
{0x0020, "White_Space"},
|
||||
}
|
||||
|
||||
func TestCategories(t *testing.T) {
|
||||
notTested := make(map[string]bool)
|
||||
for k := range Categories {
|
||||
notTested[k] = true
|
||||
}
|
||||
for _, test := range inCategoryTest {
|
||||
if _, ok := Categories[test.script]; !ok {
|
||||
t.Fatal(test.script, "not a known category")
|
||||
}
|
||||
if !Is(Categories[test.script], test.rune) {
|
||||
t.Errorf("IsCategory(%U, %s) = false, want true", test.rune, test.script)
|
||||
}
|
||||
delete(notTested, test.script)
|
||||
}
|
||||
for k := range notTested {
|
||||
t.Error("category not tested:", k)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProperties(t *testing.T) {
|
||||
notTested := make(map[string]bool)
|
||||
for k := range Properties {
|
||||
notTested[k] = true
|
||||
}
|
||||
for _, test := range inPropTest {
|
||||
if _, ok := Properties[test.script]; !ok {
|
||||
t.Fatal(test.script, "not a known prop")
|
||||
}
|
||||
if !Is(Properties[test.script], test.rune) {
|
||||
t.Errorf("IsCategory(%U, %s) = false, want true", test.rune, test.script)
|
||||
}
|
||||
delete(notTested, test.script)
|
||||
}
|
||||
for k := range notTested {
|
||||
t.Error("property not tested:", k)
|
||||
}
|
||||
}
|
||||
8378
src/unicode/tables.go
Normal file
8378
src/unicode/tables.go
Normal file
File diff suppressed because it is too large
Load Diff
14
src/unicode/utf16/export_test.go
Normal file
14
src/unicode/utf16/export_test.go
Normal file
@@ -0,0 +1,14 @@
|
||||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package utf16
|
||||
|
||||
// Extra names for constants so we can validate them during testing.
|
||||
const (
|
||||
Surr1 = surr1
|
||||
Surr3 = surr3
|
||||
SurrSelf = surrSelf
|
||||
MaxRune = maxRune
|
||||
ReplacementChar = replacementChar
|
||||
)
|
||||
144
src/unicode/utf16/utf16.go
Normal file
144
src/unicode/utf16/utf16.go
Normal file
@@ -0,0 +1,144 @@
|
||||
// Copyright 2010 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package utf16 implements encoding and decoding of UTF-16 sequences.
|
||||
package utf16
|
||||
|
||||
// replacementChar and maxRune are defined locally so that this package
// does not depend on package unicode. The tests verify that
// replacementChar==unicode.ReplacementChar and maxRune==unicode.MaxRune.

const (
	replacementChar = '\uFFFD'     // Unicode replacement character
	maxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
)
|
||||
|
||||
const (
	// Surrogate boundaries:
	// 0xd800-0xdc00 encodes the high 10 bits of a pair.
	// 0xdc00-0xe000 encodes the low 10 bits of a pair.
	// The decoded value is those 20 bits plus 0x10000.
	surr1 = 0xd800
	surr2 = 0xdc00
	surr3 = 0xe000

	// surrSelf is the value added to the 20 bits carried by a pair.
	surrSelf = 0x10000
)
|
||||
|
||||
// IsSurrogate reports whether the specified Unicode code point
|
||||
// can appear in a surrogate pair.
|
||||
func IsSurrogate(r rune) bool {
|
||||
return surr1 <= r && r < surr3
|
||||
}
|
||||
|
||||
// DecodeRune returns the UTF-16 decoding of a surrogate pair.
|
||||
// If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns
|
||||
// the Unicode replacement code point U+FFFD.
|
||||
func DecodeRune(r1, r2 rune) rune {
|
||||
if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 {
|
||||
return (r1-surr1)<<10 | (r2 - surr2) + surrSelf
|
||||
}
|
||||
return replacementChar
|
||||
}
|
||||
|
||||
// EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune.
|
||||
// If the rune is not a valid Unicode code point or does not need encoding,
|
||||
// EncodeRune returns U+FFFD, U+FFFD.
|
||||
func EncodeRune(r rune) (r1, r2 rune) {
|
||||
if r < surrSelf || r > maxRune {
|
||||
return replacementChar, replacementChar
|
||||
}
|
||||
r -= surrSelf
|
||||
return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff
|
||||
}
|
||||
|
||||
// RuneLen returns the number of 16-bit words in the UTF-16 encoding of the rune.
|
||||
// It returns -1 if the rune is not a valid value to encode in UTF-16.
|
||||
func RuneLen(r rune) int {
|
||||
switch {
|
||||
case 0 <= r && r < surr1, surr3 <= r && r < surrSelf:
|
||||
return 1
|
||||
case surrSelf <= r && r <= maxRune:
|
||||
return 2
|
||||
default:
|
||||
return -1
|
||||
}
|
||||
}
|
||||
|
||||
// Encode returns the UTF-16 encoding of the Unicode code point sequence s.
|
||||
func Encode(s []rune) []uint16 {
|
||||
n := len(s)
|
||||
for _, v := range s {
|
||||
if v >= surrSelf {
|
||||
n++
|
||||
}
|
||||
}
|
||||
|
||||
a := make([]uint16, n)
|
||||
n = 0
|
||||
for _, v := range s {
|
||||
switch RuneLen(v) {
|
||||
case 1: // normal rune
|
||||
a[n] = uint16(v)
|
||||
n++
|
||||
case 2: // needs surrogate sequence
|
||||
r1, r2 := EncodeRune(v)
|
||||
a[n] = uint16(r1)
|
||||
a[n+1] = uint16(r2)
|
||||
n += 2
|
||||
default:
|
||||
a[n] = uint16(replacementChar)
|
||||
n++
|
||||
}
|
||||
}
|
||||
return a[:n]
|
||||
}
|
||||
|
||||
// AppendRune appends the UTF-16 encoding of the Unicode code point r
|
||||
// to the end of p and returns the extended buffer. If the rune is not
|
||||
// a valid Unicode code point, it appends the encoding of U+FFFD.
|
||||
func AppendRune(a []uint16, r rune) []uint16 {
|
||||
// This function is inlineable for fast handling of ASCII.
|
||||
switch {
|
||||
case 0 <= r && r < surr1, surr3 <= r && r < surrSelf:
|
||||
// normal rune
|
||||
return append(a, uint16(r))
|
||||
case surrSelf <= r && r <= maxRune:
|
||||
// needs surrogate sequence
|
||||
r1, r2 := EncodeRune(r)
|
||||
return append(a, uint16(r1), uint16(r2))
|
||||
}
|
||||
return append(a, replacementChar)
|
||||
}
|
||||
|
||||
// Decode returns the Unicode code point sequence represented
|
||||
// by the UTF-16 encoding s.
|
||||
func Decode(s []uint16) []rune {
|
||||
// Preallocate capacity to hold up to 64 runes.
|
||||
// Decode inlines, so the allocation can live on the stack.
|
||||
buf := make([]rune, 0, 64)
|
||||
return decode(s, buf)
|
||||
}
|
||||
|
||||
// decode appends to buf the Unicode code point sequence represented
|
||||
// by the UTF-16 encoding s and return the extended buffer.
|
||||
func decode(s []uint16, buf []rune) []rune {
|
||||
for i := 0; i < len(s); i++ {
|
||||
var ar rune
|
||||
switch r := s[i]; {
|
||||
case r < surr1, surr3 <= r:
|
||||
// normal rune
|
||||
ar = rune(r)
|
||||
case surr1 <= r && r < surr2 && i+1 < len(s) &&
|
||||
surr2 <= s[i+1] && s[i+1] < surr3:
|
||||
// valid surrogate sequence
|
||||
ar = DecodeRune(rune(r), rune(s[i+1]))
|
||||
i++
|
||||
default:
|
||||
// invalid surrogate sequence
|
||||
ar = replacementChar
|
||||
}
|
||||
buf = append(buf, ar)
|
||||
}
|
||||
return buf
|
||||
}
|
||||
273
src/unicode/utf16/utf16_test.go
Normal file
273
src/unicode/utf16/utf16_test.go
Normal file
@@ -0,0 +1,273 @@
|
||||
// Copyright 2010 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package utf16_test
|
||||
|
||||
import (
|
||||
"internal/testenv"
|
||||
"reflect"
|
||||
"testing"
|
||||
"unicode"
|
||||
. "unicode/utf16"
|
||||
)
|
||||
|
||||
// Validate the constants redefined from unicode.
|
||||
func TestConstants(t *testing.T) {
|
||||
if MaxRune != unicode.MaxRune {
|
||||
t.Errorf("utf16.maxRune is wrong: %x should be %x", MaxRune, unicode.MaxRune)
|
||||
}
|
||||
if ReplacementChar != unicode.ReplacementChar {
|
||||
t.Errorf("utf16.replacementChar is wrong: %x should be %x", ReplacementChar, unicode.ReplacementChar)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuneLen(t *testing.T) {
|
||||
for _, tt := range []struct {
|
||||
r rune
|
||||
length int
|
||||
}{
|
||||
{0, 1},
|
||||
{Surr1 - 1, 1},
|
||||
{Surr3, 1},
|
||||
{SurrSelf - 1, 1},
|
||||
{SurrSelf, 2},
|
||||
{MaxRune, 2},
|
||||
{MaxRune + 1, -1},
|
||||
{-1, -1},
|
||||
} {
|
||||
if length := RuneLen(tt.r); length != tt.length {
|
||||
t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, length, tt.length)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// encodeTest pairs a rune sequence with its expected UTF-16 encoding.
type encodeTest struct {
	in  []rune   // code points to encode
	out []uint16 // expected UTF-16 words
}
|
||||
|
||||
var encodeTests = []encodeTest{
|
||||
{[]rune{1, 2, 3, 4}, []uint16{1, 2, 3, 4}},
|
||||
{[]rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff},
|
||||
[]uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff}},
|
||||
{[]rune{'a', 'b', 0xd7ff, 0xd800, 0xdfff, 0xe000, 0x110000, -1},
|
||||
[]uint16{'a', 'b', 0xd7ff, 0xfffd, 0xfffd, 0xe000, 0xfffd, 0xfffd}},
|
||||
}
|
||||
|
||||
func TestEncode(t *testing.T) {
|
||||
for _, tt := range encodeTests {
|
||||
out := Encode(tt.in)
|
||||
if !reflect.DeepEqual(out, tt.out) {
|
||||
t.Errorf("Encode(%x) = %x; want %x", tt.in, out, tt.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAppendRune(t *testing.T) {
|
||||
for _, tt := range encodeTests {
|
||||
var out []uint16
|
||||
for _, u := range tt.in {
|
||||
out = AppendRune(out, u)
|
||||
}
|
||||
if !reflect.DeepEqual(out, tt.out) {
|
||||
t.Errorf("AppendRune(%x) = %x; want %x", tt.in, out, tt.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEncodeRune(t *testing.T) {
|
||||
for i, tt := range encodeTests {
|
||||
j := 0
|
||||
for _, r := range tt.in {
|
||||
r1, r2 := EncodeRune(r)
|
||||
if r < 0x10000 || r > unicode.MaxRune {
|
||||
if j >= len(tt.out) {
|
||||
t.Errorf("#%d: ran out of tt.out", i)
|
||||
break
|
||||
}
|
||||
if r1 != unicode.ReplacementChar || r2 != unicode.ReplacementChar {
|
||||
t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, r1, r2)
|
||||
}
|
||||
j++
|
||||
} else {
|
||||
if j+1 >= len(tt.out) {
|
||||
t.Errorf("#%d: ran out of tt.out", i)
|
||||
break
|
||||
}
|
||||
if r1 != rune(tt.out[j]) || r2 != rune(tt.out[j+1]) {
|
||||
t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, r1, r2, tt.out[j], tt.out[j+1])
|
||||
}
|
||||
j += 2
|
||||
dec := DecodeRune(r1, r2)
|
||||
if dec != r {
|
||||
t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", r1, r2, dec, r)
|
||||
}
|
||||
}
|
||||
}
|
||||
if j != len(tt.out) {
|
||||
t.Errorf("#%d: EncodeRune didn't generate enough output", i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// decodeTest pairs a sequence of UTF-16 code units with the runes the tests
// expect Decode to return for it.
type decodeTest struct {
	in  []uint16 // UTF-16 code units passed to Decode
	out []rune   // expected decoded runes
}
|
||||
|
||||
// decodeTests lists Decode inputs and expected results: plain BMP values,
// valid surrogate pairs, and lone surrogates that must come back as 0xfffd.
var decodeTests = []decodeTest{
	{[]uint16{1, 2, 3, 4}, []rune{1, 2, 3, 4}},
	{[]uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff},
		[]rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff}},
	{[]uint16{0xd800, 'a'}, []rune{0xfffd, 'a'}}, // high surrogate followed by a non-surrogate
	{[]uint16{0xdfff}, []rune{0xfffd}},           // lone low surrogate
}
|
||||
|
||||
// TestAllocationsDecode checks that Decode reports zero allocations (as
// measured by testing.AllocsPerRun) for every input in decodeTests.
func TestAllocationsDecode(t *testing.T) {
	// The zero-allocation expectation presumably relies on compiler
	// optimizations, hence the skip when they are disabled.
	testenv.SkipIfOptimizationOff(t)

	for _, tt := range decodeTests {
		allocs := testing.AllocsPerRun(10, func() {
			out := Decode(tt.in)
			if out == nil {
				t.Errorf("Decode(%x) = nil", tt.in)
			}
		})
		if allocs > 0 {
			t.Errorf("Decode allocated %v times", allocs)
		}
	}
}
|
||||
|
||||
func TestDecode(t *testing.T) {
|
||||
for _, tt := range decodeTests {
|
||||
out := Decode(tt.in)
|
||||
if !reflect.DeepEqual(out, tt.out) {
|
||||
t.Errorf("Decode(%x) = %x; want %x", tt.in, out, tt.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// decodeRuneTests lists surrogate pairs (r1, r2) together with the rune that
// DecodeRune is expected to return for them.
var decodeRuneTests = []struct {
	r1, r2 rune // the two halves passed to DecodeRune
	want   rune // expected result
}{
	{0xd800, 0xdc00, 0x10000},
	{0xd800, 0xdc01, 0x10001},
	{0xd808, 0xdf45, 0x12345},
	{0xdbff, 0xdfff, 0x10ffff},
	{0xd800, 'a', 0xfffd}, // illegal, replacement rune substituted
}
|
||||
|
||||
func TestDecodeRune(t *testing.T) {
|
||||
for i, tt := range decodeRuneTests {
|
||||
got := DecodeRune(tt.r1, tt.r2)
|
||||
if got != tt.want {
|
||||
t.Errorf("%d: DecodeRune(%q, %q) = %v; want %v", i, tt.r1, tt.r2, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// surrogateTests lists runes together with the result IsSurrogate is expected
// to report for each.
var surrogateTests = []struct {
	r    rune
	want bool // expected IsSurrogate(r)
}{
	// from https://en.wikipedia.org/wiki/UTF-16
	{'\u007A', false},     // LATIN SMALL LETTER Z
	{'\u6C34', false},     // CJK UNIFIED IDEOGRAPH-6C34 (water)
	{'\uFEFF', false},     // Byte Order Mark
	{'\U00010000', false}, // LINEAR B SYLLABLE B008 A (first non-BMP code point)
	{'\U0001D11E', false}, // MUSICAL SYMBOL G CLEF
	{'\U0010FFFD', false}, // PRIVATE USE CHARACTER-10FFFD (last Unicode code point)

	// Values at and around the edges of 0xd800-0xdfff.
	{rune(0xd7ff), false}, // surr1-1
	{rune(0xd800), true},  // surr1
	{rune(0xdc00), true},  // surr2
	{rune(0xe000), false}, // surr3
	{rune(0xdfff), true},  // surr3-1
}
|
||||
|
||||
func TestIsSurrogate(t *testing.T) {
|
||||
for i, tt := range surrogateTests {
|
||||
got := IsSurrogate(tt.r)
|
||||
if got != tt.want {
|
||||
t.Errorf("%d: IsSurrogate(%q) = %v; want %v", i, tt.r, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkDecodeValidASCII measures Decode on the UTF-16 code units of the
// ASCII text "hello world".
func BenchmarkDecodeValidASCII(b *testing.B) {
	units := []uint16{'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'}
	for n := 0; n < b.N; n++ {
		Decode(units)
	}
}
|
||||
|
||||
// BenchmarkDecodeValidJapaneseChars measures Decode on the UTF-16 code units
// of "日本語日本語日本語" (BMP characters only, no surrogate pairs).
func BenchmarkDecodeValidJapaneseChars(b *testing.B) {
	units := []uint16{'日', '本', '語', '日', '本', '語', '日', '本', '語'}
	for n := 0; n < b.N; n++ {
		Decode(units)
	}
}
|
||||
|
||||
// BenchmarkDecodeRune measures DecodeRune on five surrogate pairs that are
// encoded once, before the timer is reset.
func BenchmarkDecodeRune(b *testing.B) {
	// U+1D4D0 to U+1D4D4: MATHEMATICAL BOLD SCRIPT CAPITAL LETTERS
	letters := []rune{'𝓐', '𝓑', '𝓒', '𝓓', '𝓔'}
	pairs := make([]rune, 10)
	for k, u := range letters {
		hi, lo := EncodeRune(u)
		pairs[2*k], pairs[2*k+1] = hi, lo
	}

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		for k := 0; k < 5; k++ {
			DecodeRune(pairs[2*k], pairs[2*k+1])
		}
	}
}
|
||||
|
||||
// BenchmarkEncodeValidASCII measures Encode on the runes of "hello".
func BenchmarkEncodeValidASCII(b *testing.B) {
	text := []rune{'h', 'e', 'l', 'l', 'o'}
	for n := 0; n < b.N; n++ {
		Encode(text)
	}
}
|
||||
|
||||
// BenchmarkEncodeValidJapaneseChars measures Encode on the runes of "日本語".
func BenchmarkEncodeValidJapaneseChars(b *testing.B) {
	text := []rune{'日', '本', '語'}
	for n := 0; n < b.N; n++ {
		Encode(text)
	}
}
|
||||
|
||||
// BenchmarkAppendRuneValidASCII measures AppendRune on the runes of "hello",
// reusing one buffer with room for two code units per rune.
func BenchmarkAppendRuneValidASCII(b *testing.B) {
	text := []rune{'h', 'e', 'l', 'l', 'o'}
	buf := make([]uint16, 0, 2*len(text))
	for n := 0; n < b.N; n++ {
		for _, r := range text {
			buf = AppendRune(buf, r)
		}
		buf = buf[:0] // keep the capacity, drop the contents
	}
}
|
||||
|
||||
// BenchmarkAppendRuneValidJapaneseChars measures AppendRune on the runes of
// "日本語", reusing one buffer with room for two code units per rune.
func BenchmarkAppendRuneValidJapaneseChars(b *testing.B) {
	text := []rune{'日', '本', '語'}
	buf := make([]uint16, 0, 2*len(text))
	for n := 0; n < b.N; n++ {
		for _, r := range text {
			buf = AppendRune(buf, r)
		}
		buf = buf[:0] // keep the capacity, drop the contents
	}
}
|
||||
|
||||
// BenchmarkEncodeRune measures EncodeRune on five runes outside the BMP
// (U+1D4D0 to U+1D4D4).
func BenchmarkEncodeRune(b *testing.B) {
	for n := 0; n < b.N; n++ {
		for _, letter := range []rune{'𝓐', '𝓑', '𝓒', '𝓓', '𝓔'} {
			EncodeRune(letter)
		}
	}
}
|
||||
226
src/unicode/utf8/example_test.go
Normal file
226
src/unicode/utf8/example_test.go
Normal file
@@ -0,0 +1,226 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package utf8_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// ExampleDecodeLastRune walks a UTF-8 byte slice from its end, printing each
// rune together with the number of bytes its encoding occupies.
func ExampleDecodeLastRune() {
	b := []byte("Hello, 世界")

	for len(b) > 0 {
		r, size := utf8.DecodeLastRune(b)
		fmt.Printf("%c %v\n", r, size)

		// Drop the rune just decoded from the end of the slice.
		b = b[:len(b)-size]
	}
	// The third line below is the space character followed by its width, so
	// its leading spaces are significant.

	// Output:
	// 界 3
	// 世 3
	//   1
	// , 1
	// o 1
	// l 1
	// l 1
	// e 1
	// H 1
}
|
||||
|
||||
// ExampleDecodeLastRuneInString walks a string from its end, printing each
// rune together with the number of bytes its encoding occupies.
func ExampleDecodeLastRuneInString() {
	str := "Hello, 世界"

	for len(str) > 0 {
		r, size := utf8.DecodeLastRuneInString(str)
		fmt.Printf("%c %v\n", r, size)

		// Drop the rune just decoded from the end of the string.
		str = str[:len(str)-size]
	}
	// The third line below is the space character followed by its width, so
	// its leading spaces are significant.

	// Output:
	// 界 3
	// 世 3
	//   1
	// , 1
	// o 1
	// l 1
	// l 1
	// e 1
	// H 1

}
|
||||
|
||||
// ExampleDecodeRune walks a UTF-8 byte slice from its start, printing each
// rune together with the number of bytes its encoding occupies.
func ExampleDecodeRune() {
	b := []byte("Hello, 世界")

	for len(b) > 0 {
		r, size := utf8.DecodeRune(b)
		fmt.Printf("%c %v\n", r, size)

		// Advance past the rune just decoded.
		b = b[size:]
	}
	// The seventh line below is the space character followed by its width,
	// so its leading spaces are significant.

	// Output:
	// H 1
	// e 1
	// l 1
	// l 1
	// o 1
	// , 1
	//   1
	// 世 3
	// 界 3
}
|
||||
|
||||
// ExampleDecodeRuneInString walks a string from its start, printing each rune
// together with the number of bytes its encoding occupies.
func ExampleDecodeRuneInString() {
	str := "Hello, 世界"

	for len(str) > 0 {
		r, size := utf8.DecodeRuneInString(str)
		fmt.Printf("%c %v\n", r, size)

		// Advance past the rune just decoded.
		str = str[size:]
	}
	// The seventh line below is the space character followed by its width,
	// so its leading spaces are significant.

	// Output:
	// H 1
	// e 1
	// l 1
	// l 1
	// o 1
	// , 1
	//   1
	// 世 3
	// 界 3
}
|
||||
|
||||
// ExampleEncodeRune encodes a single rune into a three-byte buffer and prints
// the bytes written and their count.
func ExampleEncodeRune() {
	var storage [3]byte
	encoded := storage[:]

	written := utf8.EncodeRune(encoded, '世')

	fmt.Println(encoded)
	fmt.Println(written)
	// Output:
	// [228 184 150]
	// 3
}
|
||||
|
||||
// ExampleEncodeRune_outOfRange shows that out-of-range runes are encoded as
// the three-byte encoding of utf8.RuneError, the same bytes produced when
// RuneError itself is encoded.
func ExampleEncodeRune_outOfRange() {
	runes := []rune{
		// Less than 0, out of range.
		-1,
		// Greater than 0x10FFFF, out of range.
		0x110000,
		// The Unicode replacement character.
		utf8.RuneError,
	}
	for i, c := range runes {
		buf := make([]byte, 3)
		size := utf8.EncodeRune(buf, c)
		// %[2]s prints buf a second time, as text rather than as numbers.
		fmt.Printf("%d: %d %[2]s %d\n", i, buf, size)
	}
	// Output:
	// 0: [239 191 189] � 3
	// 1: [239 191 189] � 3
	// 2: [239 191 189] � 3
}
|
||||
|
||||
// ExampleFullRune reports whether a complete and a truncated encoding of 世
// each begin with a full rune.
func ExampleFullRune() {
	encoded := []byte{228, 184, 150} // 世
	whole := utf8.FullRune(encoded)
	truncated := utf8.FullRune(encoded[:2])
	fmt.Println(whole)
	fmt.Println(truncated)
	// Output:
	// true
	// false
}
|
||||
|
||||
// ExampleFullRuneInString reports whether the string 世 and its first two
// bytes each begin with a full rune.
func ExampleFullRuneInString() {
	const s = "世"
	whole := utf8.FullRuneInString(s)
	truncated := utf8.FullRuneInString(s[:2])
	fmt.Println(whole)
	fmt.Println(truncated)
	// Output:
	// true
	// false
}
|
||||
|
||||
// ExampleRuneCount contrasts the byte length of a slice with the number of
// runes it holds.
func ExampleRuneCount() {
	data := []byte("Hello, 世界")
	byteCount, runeCount := len(data), utf8.RuneCount(data)
	fmt.Println("bytes =", byteCount)
	fmt.Println("runes =", runeCount)
	// Output:
	// bytes = 13
	// runes = 9
}
|
||||
|
||||
// ExampleRuneCountInString contrasts the byte length of a string with the
// number of runes it holds.
func ExampleRuneCountInString() {
	const text = "Hello, 世界"
	byteCount, runeCount := len(text), utf8.RuneCountInString(text)
	fmt.Println("bytes =", byteCount)
	fmt.Println("runes =", runeCount)
	// Output:
	// bytes = 13
	// runes = 9
}
|
||||
|
||||
// ExampleRuneLen prints the UTF-8 encoded length of an ASCII rune and of a
// CJK rune.
func ExampleRuneLen() {
	for _, r := range []rune{'a', '界'} {
		fmt.Println(utf8.RuneLen(r))
	}
	// Output:
	// 1
	// 3
}
|
||||
|
||||
// ExampleRuneStart applies RuneStart to the first three bytes of "a界": the
// ASCII byte, the lead byte of 界, and its first continuation byte.
func ExampleRuneStart() {
	encoded := []byte("a界")
	for _, b := range encoded[:3] {
		fmt.Println(utf8.RuneStart(b))
	}
	// Output:
	// true
	// true
	// false
}
|
||||
|
||||
// ExampleValid checks one well-formed and one malformed byte slice.
func ExampleValid() {
	wellFormed := []byte("Hello, 世界")
	malformed := []byte{0xff, 0xfe, 0xfd}

	for _, data := range [][]byte{wellFormed, malformed} {
		fmt.Println(utf8.Valid(data))
	}
	// Output:
	// true
	// false
}
|
||||
|
||||
// ExampleValidRune checks an ordinary rune and a value far above the largest
// Unicode code point.
func ExampleValidRune() {
	const (
		ordinary rune = 'a'
		tooLarge rune = 0xfffffff
	)

	fmt.Println(utf8.ValidRune(ordinary))
	fmt.Println(utf8.ValidRune(tooLarge))
	// Output:
	// true
	// false
}
|
||||
|
||||
// ExampleValidString checks one well-formed and one malformed string.
func ExampleValidString() {
	const (
		wellFormed = "Hello, 世界"
		malformed  = "\xff\xfe\xfd"
	)

	fmt.Println(utf8.ValidString(wellFormed))
	fmt.Println(utf8.ValidString(malformed))
	// Output:
	// true
	// false
}
|
||||
|
||||
// ExampleAppendRune appends U+10000 to a nil slice and to a slice that
// already holds "init".
func ExampleAppendRune() {
	const r = 0x10000
	fromNil := utf8.AppendRune(nil, r)
	fromInit := utf8.AppendRune([]byte("init"), r)
	for _, buf := range [][]byte{fromNil, fromInit} {
		fmt.Println(string(buf))
	}
	// Output:
	// 𐀀
	// init𐀀
}
|
||||
583
src/unicode/utf8/utf8.go
Normal file
583
src/unicode/utf8/utf8.go
Normal file
@@ -0,0 +1,583 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package utf8 implements functions and constants to support text encoded in
|
||||
// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
|
||||
// See https://en.wikipedia.org/wiki/UTF-8
|
||||
package utf8
|
||||
|
||||
// The conditions RuneError==unicode.ReplacementChar and
|
||||
// MaxRune==unicode.MaxRune are verified in the tests.
|
||||
// Defining them locally avoids this package depending on package unicode.
|
||||
|
||||
// Numbers fundamental to the encoding.
const (
	RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
	RuneSelf  = 0x80         // characters below RuneSelf are represented as themselves in a single byte.
	MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
)
|
||||
|
||||
// Code points in the surrogate range are not valid for UTF-8.
const (
	surrogateMin = 0xD800 // first code point of the surrogate range
	surrogateMax = 0xDFFF // last code point of the surrogate range
)
|
||||
|
||||
const (
	// Tag bits: tx marks a continuation byte; t2, t3 and t4 are OR-ed into
	// the lead byte of a 2-, 3- and 4-byte encoding respectively.
	t1 = 0b00000000
	tx = 0b10000000
	t2 = 0b11000000
	t3 = 0b11100000
	t4 = 0b11110000
	t5 = 0b11111000

	// Payload masks: maskx selects the data bits of a continuation byte;
	// mask2, mask3 and mask4 select the data bits of a 2-, 3- and 4-byte
	// lead byte.
	maskx = 0b00111111
	mask2 = 0b00011111
	mask3 = 0b00001111
	mask4 = 0b00000111

	// Largest rune that fits in a 1-, 2- and 3-byte encoding.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1

	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// table below. The first nibble is an index into acceptRanges or F for
	// special one-byte cases. The second nibble is the Rune length or the
	// Status for the special one-byte case.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)
|
||||
|
||||
// first is information about the first byte in a UTF-8 sequence.
// It is indexed by the byte value; each entry is one of the xx, as and
// s1..s7 codes, whose high nibble selects an acceptRanges entry and whose
// low bits (x&7) give the sequence length.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
|
||||
|
||||
// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence. Both bounds are inclusive.
type acceptRange struct {
	lo uint8 // lowest value for second byte.
	hi uint8 // highest value for second byte.
}
|
||||
|
||||
// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
// It is indexed by the high nibble of a first-table entry (x>>4), which for
// a uint8 is at most 15. Only entries 0-4 are set; the rest stay zero-valued.
var acceptRanges = [16]acceptRange{
	0: {locb, hicb},
	1: {0xA0, hicb},
	2: {locb, 0x9F},
	3: {0x90, hicb},
	4: {locb, 0x8F},
}
|
||||
|
||||
// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
|
||||
// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
|
||||
func FullRune(p []byte) bool {
|
||||
n := len(p)
|
||||
if n == 0 {
|
||||
return false
|
||||
}
|
||||
x := first[p[0]]
|
||||
if n >= int(x&7) {
|
||||
return true // ASCII, invalid or valid.
|
||||
}
|
||||
// Must be short or invalid.
|
||||
accept := acceptRanges[x>>4]
|
||||
if n > 1 && (p[1] < accept.lo || accept.hi < p[1]) {
|
||||
return true
|
||||
} else if n > 2 && (p[2] < locb || hicb < p[2]) {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// FullRuneInString is like FullRune but its input is a string.
|
||||
func FullRuneInString(s string) bool {
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return false
|
||||
}
|
||||
x := first[s[0]]
|
||||
if n >= int(x&7) {
|
||||
return true // ASCII, invalid, or valid.
|
||||
}
|
||||
// Must be short or invalid.
|
||||
accept := acceptRanges[x>>4]
|
||||
if n > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
|
||||
return true
|
||||
} else if n > 2 && (s[2] < locb || hicb < s[2]) {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
// its width in bytes. If p is empty it returns ([RuneError], 0). Otherwise, if
// the encoding is invalid, it returns (RuneError, 1). Both are impossible
// results for correct, non-empty UTF-8.
//
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
// out of range, or is not the shortest possible UTF-8 encoding for the
// value. No other validation is performed.
func DecodeRune(p []byte) (r rune, size int) {
	n := len(p)
	if n < 1 {
		return RuneError, 0
	}
	p0 := p[0]
	x := first[p0]
	if x >= as {
		// The following code simulates an additional check for x == xx and
		// handling the ASCII and invalid cases accordingly. This mask-and-or
		// approach prevents an additional branch.
		//
		// x is either as (0xF0) or xx (0xF1) here. Moving its low bit into
		// the sign bit and shifting back gives all zero bits for as and all
		// one bits for xx.
		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
		return rune(p[0])&^mask | RuneError&mask, 1
	}
	sz := int(x & 7)              // sequence length promised by the lead byte
	accept := acceptRanges[x>>4] // allowed range for the second byte
	if n < sz {
		return RuneError, 1
	}
	b1 := p[1]
	if b1 < accept.lo || accept.hi < b1 {
		return RuneError, 1
	}
	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
		return rune(p0&mask2)<<6 | rune(b1&maskx), 2
	}
	b2 := p[2]
	if b2 < locb || hicb < b2 {
		return RuneError, 1
	}
	if sz <= 3 {
		return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
	}
	b3 := p[3]
	if b3 < locb || hicb < b3 {
		return RuneError, 1
	}
	return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
}
|
||||
|
||||
// DecodeRuneInString is like [DecodeRune] but its input is a string. If s is
// empty it returns ([RuneError], 0). Otherwise, if the encoding is invalid, it
// returns (RuneError, 1). Both are impossible results for correct, non-empty
// UTF-8.
//
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
// out of range, or is not the shortest possible UTF-8 encoding for the
// value. No other validation is performed.
func DecodeRuneInString(s string) (r rune, size int) {
	n := len(s)
	if n < 1 {
		return RuneError, 0
	}
	s0 := s[0]
	x := first[s0]
	if x >= as {
		// The following code simulates an additional check for x == xx and
		// handling the ASCII and invalid cases accordingly. This mask-and-or
		// approach prevents an additional branch.
		//
		// x is either as (0xF0) or xx (0xF1) here. Moving its low bit into
		// the sign bit and shifting back gives all zero bits for as and all
		// one bits for xx.
		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
		return rune(s[0])&^mask | RuneError&mask, 1
	}
	sz := int(x & 7)              // sequence length promised by the lead byte
	accept := acceptRanges[x>>4] // allowed range for the second byte
	if n < sz {
		return RuneError, 1
	}
	s1 := s[1]
	if s1 < accept.lo || accept.hi < s1 {
		return RuneError, 1
	}
	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
		return rune(s0&mask2)<<6 | rune(s1&maskx), 2
	}
	s2 := s[2]
	if s2 < locb || hicb < s2 {
		return RuneError, 1
	}
	if sz <= 3 {
		return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
	}
	s3 := s[3]
	if s3 < locb || hicb < s3 {
		return RuneError, 1
	}
	return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
}
|
||||
|
||||
// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
|
||||
// its width in bytes. If p is empty it returns ([RuneError], 0). Otherwise, if
|
||||
// the encoding is invalid, it returns (RuneError, 1). Both are impossible
|
||||
// results for correct, non-empty UTF-8.
|
||||
//
|
||||
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
|
||||
// out of range, or is not the shortest possible UTF-8 encoding for the
|
||||
// value. No other validation is performed.
|
||||
func DecodeLastRune(p []byte) (r rune, size int) {
|
||||
end := len(p)
|
||||
if end == 0 {
|
||||
return RuneError, 0
|
||||
}
|
||||
start := end - 1
|
||||
r = rune(p[start])
|
||||
if r < RuneSelf {
|
||||
return r, 1
|
||||
}
|
||||
// guard against O(n^2) behavior when traversing
|
||||
// backwards through strings with long sequences of
|
||||
// invalid UTF-8.
|
||||
lim := end - UTFMax
|
||||
if lim < 0 {
|
||||
lim = 0
|
||||
}
|
||||
for start--; start >= lim; start-- {
|
||||
if RuneStart(p[start]) {
|
||||
break
|
||||
}
|
||||
}
|
||||
if start < 0 {
|
||||
start = 0
|
||||
}
|
||||
r, size = DecodeRune(p[start:end])
|
||||
if start+size != end {
|
||||
return RuneError, 1
|
||||
}
|
||||
return r, size
|
||||
}
|
||||
|
||||
// DecodeLastRuneInString is like [DecodeLastRune] but its input is a string. If
|
||||
// s is empty it returns ([RuneError], 0). Otherwise, if the encoding is invalid,
|
||||
// it returns (RuneError, 1). Both are impossible results for correct,
|
||||
// non-empty UTF-8.
|
||||
//
|
||||
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
|
||||
// out of range, or is not the shortest possible UTF-8 encoding for the
|
||||
// value. No other validation is performed.
|
||||
func DecodeLastRuneInString(s string) (r rune, size int) {
|
||||
end := len(s)
|
||||
if end == 0 {
|
||||
return RuneError, 0
|
||||
}
|
||||
start := end - 1
|
||||
r = rune(s[start])
|
||||
if r < RuneSelf {
|
||||
return r, 1
|
||||
}
|
||||
// guard against O(n^2) behavior when traversing
|
||||
// backwards through strings with long sequences of
|
||||
// invalid UTF-8.
|
||||
lim := end - UTFMax
|
||||
if lim < 0 {
|
||||
lim = 0
|
||||
}
|
||||
for start--; start >= lim; start-- {
|
||||
if RuneStart(s[start]) {
|
||||
break
|
||||
}
|
||||
}
|
||||
if start < 0 {
|
||||
start = 0
|
||||
}
|
||||
r, size = DecodeRuneInString(s[start:end])
|
||||
if start+size != end {
|
||||
return RuneError, 1
|
||||
}
|
||||
return r, size
|
||||
}
|
||||
|
||||
// RuneLen returns the number of bytes in the UTF-8 encoding of the rune.
|
||||
// It returns -1 if the rune is not a valid value to encode in UTF-8.
|
||||
func RuneLen(r rune) int {
|
||||
switch {
|
||||
case r < 0:
|
||||
return -1
|
||||
case r <= rune1Max:
|
||||
return 1
|
||||
case r <= rune2Max:
|
||||
return 2
|
||||
case surrogateMin <= r && r <= surrogateMax:
|
||||
return -1
|
||||
case r <= rune3Max:
|
||||
return 3
|
||||
case r <= MaxRune:
|
||||
return 4
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
// If the rune is out of range, it writes the encoding of [RuneError].
// It returns the number of bytes written.
func EncodeRune(p []byte, r rune) int {
	// Negative values are erroneous. Making it unsigned addresses the problem.
	switch i := uint32(r); {
	case i <= rune1Max:
		p[0] = byte(r)
		return 1
	case i <= rune2Max:
		_ = p[1] // eliminate bounds checks
		p[0] = t2 | byte(r>>6)
		p[1] = tx | byte(r)&maskx
		return 2
	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
		// Out of range or a surrogate: substitute RuneError and fall
		// through to the three-byte case to encode it.
		r = RuneError
		fallthrough
	case i <= rune3Max:
		_ = p[2] // eliminate bounds checks
		p[0] = t3 | byte(r>>12)
		p[1] = tx | byte(r>>6)&maskx
		p[2] = tx | byte(r)&maskx
		return 3
	default:
		_ = p[3] // eliminate bounds checks
		p[0] = t4 | byte(r>>18)
		p[1] = tx | byte(r>>12)&maskx
		p[2] = tx | byte(r>>6)&maskx
		p[3] = tx | byte(r)&maskx
		return 4
	}
}
|
||||
|
||||
// AppendRune appends the UTF-8 encoding of r to the end of p and
// returns the extended buffer. If the rune is out of range,
// it appends the encoding of [RuneError].
func AppendRune(p []byte, r rune) []byte {
	// This function is inlineable for fast handling of ASCII.
	// The uint32 conversion makes negative runes compare as large values,
	// so they are routed to appendRuneNonASCII.
	if uint32(r) <= rune1Max {
		return append(p, byte(r))
	}
	return appendRuneNonASCII(p, r)
}
|
||||
|
||||
// appendRuneNonASCII appends the multi-byte UTF-8 encoding of r to p and
// returns the extended slice. It has no one-byte case; AppendRune handles
// ASCII before calling it.
func appendRuneNonASCII(p []byte, r rune) []byte {
	// Negative values are erroneous. Making it unsigned addresses the problem.
	switch i := uint32(r); {
	case i <= rune2Max:
		return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
		// Out of range or a surrogate: substitute RuneError and fall
		// through to the three-byte case to encode it.
		r = RuneError
		fallthrough
	case i <= rune3Max:
		return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
	default:
		return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
	}
}
|
||||
|
||||
// RuneCount returns the number of runes in p. Erroneous and short
// encodings are treated as single runes of width 1 byte.
func RuneCount(p []byte) int {
	np := len(p)
	var n int
	for i := 0; i < np; {
		n++ // every iteration consumes exactly one rune
		c := p[i]
		if c < RuneSelf {
			// ASCII fast path
			i++
			continue
		}
		x := first[c]
		if x == xx {
			i++ // invalid.
			continue
		}
		size := int(x & 7)
		if i+size > np {
			i++ // Short or invalid.
			continue
		}
		accept := acceptRanges[x>>4]
		// The empty branches stop the chain once all bytes of a 2- or 3-byte
		// sequence have been checked; any bad byte shrinks the step to 1.
		if c := p[i+1]; c < accept.lo || accept.hi < c {
			size = 1
		} else if size == 2 {
		} else if c := p[i+2]; c < locb || hicb < c {
			size = 1
		} else if size == 3 {
		} else if c := p[i+3]; c < locb || hicb < c {
			size = 1
		}
		i += size
	}
	return n
}
|
||||
|
||||
// RuneCountInString is like [RuneCount] but its input is a string.
func RuneCountInString(s string) (n int) {
	ns := len(s)
	// n is incremented by the loop's post statement, once per rune consumed
	// (including on each continue).
	for i := 0; i < ns; n++ {
		c := s[i]
		if c < RuneSelf {
			// ASCII fast path
			i++
			continue
		}
		x := first[c]
		if x == xx {
			i++ // invalid.
			continue
		}
		size := int(x & 7)
		if i+size > ns {
			i++ // Short or invalid.
			continue
		}
		accept := acceptRanges[x>>4]
		// The empty branches stop the chain once all bytes of a 2- or 3-byte
		// sequence have been checked; any bad byte shrinks the step to 1.
		if c := s[i+1]; c < accept.lo || accept.hi < c {
			size = 1
		} else if size == 2 {
		} else if c := s[i+2]; c < locb || hicb < c {
			size = 1
		} else if size == 3 {
		} else if c := s[i+3]; c < locb || hicb < c {
			size = 1
		}
		i += size
	}
	return n
}
|
||||
|
||||
// RuneStart reports whether the byte could be the first byte of an encoded,
// possibly invalid rune. Second and subsequent bytes always have the top two
// bits set to 10.
func RuneStart(b byte) bool {
	const continuationPrefix = 0b10 // top two bits of a continuation byte
	return b>>6 != continuationPrefix
}
|
||||
|
||||
// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
func Valid(p []byte) bool {
	// This optimization avoids the need to recompute the capacity
	// when generating code for p[8:], bringing it to parity with
	// ValidString, which was 20% faster on long ASCII strings.
	p = p[:len(p):len(p)]

	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
	for len(p) >= 8 {
		// Combining two 32 bit loads allows the same code to be used
		// for 32 and 64 bit platforms.
		// The compiler can generate a 32bit load for first32 and second32
		// on many platforms. See test/codegen/memcombine.go.
		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
		if (first32|second32)&0x80808080 != 0 {
			// Found a non ASCII byte (>= RuneSelf).
			break
		}
		p = p[8:]
	}
	// Slow path: validate the remaining bytes one sequence at a time.
	n := len(p)
	for i := 0; i < n; {
		pi := p[i]
		if pi < RuneSelf {
			i++
			continue
		}
		x := first[pi]
		if x == xx {
			return false // Illegal starter byte.
		}
		size := int(x & 7)
		if i+size > n {
			return false // Short or invalid.
		}
		accept := acceptRanges[x>>4]
		// The empty branches stop the chain once all bytes of a 2- or 3-byte
		// sequence have been checked.
		if c := p[i+1]; c < accept.lo || accept.hi < c {
			return false
		} else if size == 2 {
		} else if c := p[i+2]; c < locb || hicb < c {
			return false
		} else if size == 3 {
		} else if c := p[i+3]; c < locb || hicb < c {
			return false
		}
		i += size
	}
	return true
}
|
||||
|
||||
// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
func ValidString(s string) bool {
	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
	for len(s) >= 8 {
		// Combining two 32 bit loads allows the same code to be used
		// for 32 and 64 bit platforms.
		// The compiler can generate a 32bit load for first32 and second32
		// on many platforms. See test/codegen/memcombine.go.
		first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
		second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
		if (first32|second32)&0x80808080 != 0 {
			// Found a non ASCII byte (>= RuneSelf).
			break
		}
		s = s[8:]
	}
	// Slow path: validate the remaining bytes one sequence at a time.
	n := len(s)
	for i := 0; i < n; {
		si := s[i]
		if si < RuneSelf {
			i++
			continue
		}
		x := first[si]
		if x == xx {
			return false // Illegal starter byte.
		}
		size := int(x & 7)
		if i+size > n {
			return false // Short or invalid.
		}
		accept := acceptRanges[x>>4]
		// The empty branches stop the chain once all bytes of a 2- or 3-byte
		// sequence have been checked.
		if c := s[i+1]; c < accept.lo || accept.hi < c {
			return false
		} else if size == 2 {
		} else if c := s[i+2]; c < locb || hicb < c {
			return false
		} else if size == 3 {
		} else if c := s[i+3]; c < locb || hicb < c {
			return false
		}
		i += size
	}
	return true
}
|
||||
|
||||
// ValidRune reports whether r can be legally encoded as UTF-8.
|
||||
// Code points that are out of range or a surrogate half are illegal.
|
||||
func ValidRune(r rune) bool {
|
||||
switch {
|
||||
case 0 <= r && r < surrogateMin:
|
||||
return true
|
||||
case surrogateMax < r && r <= MaxRune:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
703
src/unicode/utf8/utf8_test.go
Normal file
703
src/unicode/utf8/utf8_test.go
Normal file
@@ -0,0 +1,703 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package utf8_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
"testing"
|
||||
"unicode"
|
||||
. "unicode/utf8"
|
||||
)
|
||||
|
||||
// Validate the constants redefined from unicode.
// A mismatch panics while the test binary is initializing.
func init() {
	mustHold := func(ok bool, msg string) {
		if !ok {
			panic(msg)
		}
	}
	mustHold(MaxRune == unicode.MaxRune, "utf8.MaxRune is wrong")
	mustHold(RuneError == unicode.ReplacementChar, "utf8.RuneError is wrong")
}
|
||||
|
||||
// Validate the constants redefined from unicode.
func TestConstants(t *testing.T) {
	if got, want := rune(MaxRune), rune(unicode.MaxRune); got != want {
		t.Errorf("utf8.MaxRune is wrong: %x should be %x", got, want)
	}
	if got, want := rune(RuneError), rune(unicode.ReplacementChar); got != want {
		t.Errorf("utf8.RuneError is wrong: %x should be %x", got, want)
	}
}
|
||||
|
||||
// Utf8Map pairs a rune with the UTF-8 byte sequence the tests associate with
// it.
type Utf8Map struct {
	r   rune
	str string // UTF-8 bytes for r, written with \x escapes in the tables
}
|
||||
|
||||
// utf8map lists runes with their expected UTF-8 encodings, covering 1-, 2-,
// 3- and 4-byte sequences and the values on either side of the surrogate
// range.
var utf8map = []Utf8Map{
	{0x0000, "\x00"},
	{0x0001, "\x01"},
	{0x007e, "\x7e"},
	{0x007f, "\x7f"},
	{0x0080, "\xc2\x80"},
	{0x0081, "\xc2\x81"},
	{0x00bf, "\xc2\xbf"},
	{0x00c0, "\xc3\x80"},
	{0x00c1, "\xc3\x81"},
	{0x00c8, "\xc3\x88"},
	{0x00d0, "\xc3\x90"},
	{0x00e0, "\xc3\xa0"},
	{0x00f0, "\xc3\xb0"},
	{0x00f8, "\xc3\xb8"},
	{0x00ff, "\xc3\xbf"},
	{0x0100, "\xc4\x80"},
	{0x07ff, "\xdf\xbf"},
	{0x0400, "\xd0\x80"},
	{0x0800, "\xe0\xa0\x80"},
	{0x0801, "\xe0\xa0\x81"},
	{0x1000, "\xe1\x80\x80"},
	{0xd000, "\xed\x80\x80"},
	{0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half.
	{0xe000, "\xee\x80\x80"}, // first code point after surrogate half.
	{0xfffe, "\xef\xbf\xbe"},
	{0xffff, "\xef\xbf\xbf"},
	{0x10000, "\xf0\x90\x80\x80"},
	{0x10001, "\xf0\x90\x80\x81"},
	{0x40000, "\xf1\x80\x80\x80"},
	{0x10fffe, "\xf4\x8f\xbf\xbe"},
	{0x10ffff, "\xf4\x8f\xbf\xbf"},
	{0xFFFD, "\xef\xbf\xbd"},
}
|
||||
|
||||
var surrogateMap = []Utf8Map{
|
||||
{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
|
||||
{0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
|
||||
}
|
||||
|
||||
// testStrings are the sample inputs combined with utf8map entries by the
// sequencing test and walked by the runtime-conversion test.
var testStrings = []string{
	// Empty input.
	"",
	// ASCII only.
	"abcd",
	// Three-byte runes only.
	"☺☻☹",
	// Runes of mixed encoded widths.
	"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
	// The previous string three times over.
	"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
	// Four bytes that are all 0x80.
	"\x80\x80\x80\x80",
}
|
||||
|
||||
func TestFullRune(t *testing.T) {
|
||||
for _, m := range utf8map {
|
||||
b := []byte(m.str)
|
||||
if !FullRune(b) {
|
||||
t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
|
||||
}
|
||||
s := m.str
|
||||
if !FullRuneInString(s) {
|
||||
t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r)
|
||||
}
|
||||
b1 := b[0 : len(b)-1]
|
||||
if FullRune(b1) {
|
||||
t.Errorf("FullRune(%q) = true, want false", b1)
|
||||
}
|
||||
s1 := string(b1)
|
||||
if FullRuneInString(s1) {
|
||||
t.Errorf("FullRune(%q) = true, want false", s1)
|
||||
}
|
||||
}
|
||||
for _, s := range []string{"\xc0", "\xc1"} {
|
||||
b := []byte(s)
|
||||
if !FullRune(b) {
|
||||
t.Errorf("FullRune(%q) = false, want true", s)
|
||||
}
|
||||
if !FullRuneInString(s) {
|
||||
t.Errorf("FullRuneInString(%q) = false, want true", s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEncodeRune(t *testing.T) {
|
||||
for _, m := range utf8map {
|
||||
b := []byte(m.str)
|
||||
var buf [10]byte
|
||||
n := EncodeRune(buf[0:], m.r)
|
||||
b1 := buf[0:n]
|
||||
if !bytes.Equal(b, b1) {
|
||||
t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAppendRune(t *testing.T) {
|
||||
for _, m := range utf8map {
|
||||
if buf := AppendRune(nil, m.r); string(buf) != m.str {
|
||||
t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str)
|
||||
}
|
||||
if buf := AppendRune([]byte("init"), m.r); string(buf) != "init"+m.str {
|
||||
t.Errorf("AppendRune(init, %#04x) = %s, want %s", m.r, buf, "init"+m.str)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeRune(t *testing.T) {
|
||||
for _, m := range utf8map {
|
||||
b := []byte(m.str)
|
||||
r, size := DecodeRune(b)
|
||||
if r != m.r || size != len(b) {
|
||||
t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
|
||||
}
|
||||
s := m.str
|
||||
r, size = DecodeRuneInString(s)
|
||||
if r != m.r || size != len(b) {
|
||||
t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
|
||||
}
|
||||
|
||||
// there's an extra byte that bytes left behind - make sure trailing byte works
|
||||
r, size = DecodeRune(b[0:cap(b)])
|
||||
if r != m.r || size != len(b) {
|
||||
t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
|
||||
}
|
||||
s = m.str + "\x00"
|
||||
r, size = DecodeRuneInString(s)
|
||||
if r != m.r || size != len(b) {
|
||||
t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
|
||||
}
|
||||
|
||||
// make sure missing bytes fail
|
||||
wantsize := 1
|
||||
if wantsize >= len(b) {
|
||||
wantsize = 0
|
||||
}
|
||||
r, size = DecodeRune(b[0 : len(b)-1])
|
||||
if r != RuneError || size != wantsize {
|
||||
t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[0:len(b)-1], r, size, RuneError, wantsize)
|
||||
}
|
||||
s = m.str[0 : len(m.str)-1]
|
||||
r, size = DecodeRuneInString(s)
|
||||
if r != RuneError || size != wantsize {
|
||||
t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize)
|
||||
}
|
||||
|
||||
// make sure bad sequences fail
|
||||
if len(b) == 1 {
|
||||
b[0] = 0x80
|
||||
} else {
|
||||
b[len(b)-1] = 0x7F
|
||||
}
|
||||
r, size = DecodeRune(b)
|
||||
if r != RuneError || size != 1 {
|
||||
t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1)
|
||||
}
|
||||
s = string(b)
|
||||
r, size = DecodeRuneInString(s)
|
||||
if r != RuneError || size != 1 {
|
||||
t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeSurrogateRune(t *testing.T) {
|
||||
for _, m := range surrogateMap {
|
||||
b := []byte(m.str)
|
||||
r, size := DecodeRune(b)
|
||||
if r != RuneError || size != 1 {
|
||||
t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
|
||||
}
|
||||
s := m.str
|
||||
r, size = DecodeRuneInString(s)
|
||||
if r != RuneError || size != 1 {
|
||||
t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check that DecodeRune and DecodeLastRune correspond to
|
||||
// the equivalent range loop.
|
||||
func TestSequencing(t *testing.T) {
|
||||
for _, ts := range testStrings {
|
||||
for _, m := range utf8map {
|
||||
for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
|
||||
testSequence(t, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// runtimeRuneCount returns the number of runes in s as computed by converting
// the string to a rune slice and taking its length.
func runtimeRuneCount(s string) int {
	// The len([]rune(s)) expression is kept intact on purpose:
	// it is replaced by gc with call to runtime.countrunes(s).
	n := len([]rune(s))
	return n
}
|
||||
|
||||
// Check that a range loop, len([]rune(string)) optimization and
|
||||
// []rune conversions visit the same runes.
|
||||
// Not really a test of this package, but the assumption is used here and
|
||||
// it's good to verify.
|
||||
func TestRuntimeConversion(t *testing.T) {
|
||||
for _, ts := range testStrings {
|
||||
count := RuneCountInString(ts)
|
||||
if n := runtimeRuneCount(ts); n != count {
|
||||
t.Errorf("%q: len([]rune()) counted %d runes; got %d from RuneCountInString", ts, n, count)
|
||||
break
|
||||
}
|
||||
|
||||
runes := []rune(ts)
|
||||
if n := len(runes); n != count {
|
||||
t.Errorf("%q: []rune() has length %d; got %d from RuneCountInString", ts, n, count)
|
||||
break
|
||||
}
|
||||
i := 0
|
||||
for _, r := range ts {
|
||||
if r != runes[i] {
|
||||
t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r)
|
||||
}
|
||||
i++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// invalidSequenceTests lists four-byte inputs whose first rune must decode to
// RuneError. Apart from the two surrogate cases, entries are grouped under
// short labels, one group per leading byte or leading-byte pair.
var invalidSequenceTests = []string{
	"\xed\xa0\x80\x80", // surrogate min
	"\xed\xbf\xbf\x80", // surrogate max

	// xx: input starts with 0x91.
	"\x91\x80\x80\x80",

	// s1: leading bytes 0xC2 and 0xDF.
	"\xC2\x7F\x80\x80",
	"\xC2\xC0\x80\x80",
	"\xDF\x7F\x80\x80",
	"\xDF\xC0\x80\x80",

	// s2: leading byte 0xE0.
	"\xE0\x9F\xBF\x80",
	"\xE0\xA0\x7F\x80",
	"\xE0\xBF\xC0\x80",
	"\xE0\xC0\x80\x80",

	// s3: leading byte 0xE1.
	"\xE1\x7F\xBF\x80",
	"\xE1\x80\x7F\x80",
	"\xE1\xBF\xC0\x80",
	"\xE1\xC0\x80\x80",

	// s4: leading byte 0xED.
	"\xED\x7F\xBF\x80",
	"\xED\x80\x7F\x80",
	"\xED\x9F\xC0\x80",
	"\xED\xA0\x80\x80",

	// s5: leading byte 0xF0.
	"\xF0\x8F\xBF\xBF",
	"\xF0\x90\x7F\xBF",
	"\xF0\x90\x80\x7F",
	"\xF0\xBF\xBF\xC0",
	"\xF0\xBF\xC0\x80",
	"\xF0\xC0\x80\x80",

	// s6: leading byte 0xF1.
	"\xF1\x7F\xBF\xBF",
	"\xF1\x80\x7F\xBF",
	"\xF1\x80\x80\x7F",
	"\xF1\xBF\xBF\xC0",
	"\xF1\xBF\xC0\x80",
	"\xF1\xC0\x80\x80",

	// s7: leading byte 0xF4.
	"\xF4\x7F\xBF\xBF",
	"\xF4\x80\x7F\xBF",
	"\xF4\x80\x80\x7F",
	"\xF4\x8F\xBF\xC0",
	"\xF4\x8F\xC0\x80",
	"\xF4\x90\x80\x80",
}
|
||||
|
||||
// runtimeDecodeRune returns the first rune that a range loop over s produces,
// or -1 when s is empty.
func runtimeDecodeRune(s string) rune {
	first := rune(-1)
	for _, r := range s {
		// Only the first iteration matters.
		first = r
		break
	}
	return first
}
|
||||
|
||||
func TestDecodeInvalidSequence(t *testing.T) {
|
||||
for _, s := range invalidSequenceTests {
|
||||
r1, _ := DecodeRune([]byte(s))
|
||||
if want := RuneError; r1 != want {
|
||||
t.Errorf("DecodeRune(%#x) = %#04x, want %#04x", s, r1, want)
|
||||
return
|
||||
}
|
||||
r2, _ := DecodeRuneInString(s)
|
||||
if want := RuneError; r2 != want {
|
||||
t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s, r2, want)
|
||||
return
|
||||
}
|
||||
if r1 != r2 {
|
||||
t.Errorf("DecodeRune(%#x) = %#04x mismatch with DecodeRuneInString(%q) = %#04x", s, r1, s, r2)
|
||||
return
|
||||
}
|
||||
r3 := runtimeDecodeRune(s)
|
||||
if r2 != r3 {
|
||||
t.Errorf("DecodeRuneInString(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", s, r2, s, r3)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// testSequence checks that decoding s agrees with a range loop over s.
//
// The forward pass records the (index, rune) pairs the range loop yields and
// checks DecodeRune and DecodeRuneInString against each of them. The backward
// pass then walks s from the end with DecodeLastRune and
// DecodeLastRuneInString and expects to revisit the same pairs in reverse.
// It reports the first disagreement and returns.
func testSequence(t *testing.T, s string) {
	type info struct {
		index int
		r     rune
	}
	b := []byte(s)
	seen := make([]info, 0, len(s)) // at most one rune per byte

	// Forward pass.
	pos := 0
	for i, r := range s {
		if pos != i {
			t.Errorf("Sequence(%q) mismatched index %d, want %d", s, pos, i)
			return
		}
		seen = append(seen, info{i, r})

		rb, nb := DecodeRune(b[i:])
		if r != rb {
			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], rb, r)
			return
		}
		rs, ns := DecodeRuneInString(s[i:])
		if r != rs {
			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], rs, r)
			return
		}
		if nb != ns {
			t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], nb, ns)
			return
		}
		pos += nb
	}

	// Backward pass, starting from the last recorded pair.
	j := len(seen) - 1
	pos = len(s)
	for pos > 0 {
		rb, nb := DecodeLastRune(b[:pos])
		rs, ns := DecodeLastRuneInString(s[:pos])
		if nb != ns {
			t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, pos, nb, ns)
			return
		}
		want := seen[j]
		if rb != want.r {
			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, pos, rb, want.r)
			return
		}
		if rs != want.r {
			t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, pos, rs, want.r)
			return
		}
		pos -= nb
		if pos != want.index {
			t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, pos, want.index)
			return
		}
		j--
	}
	if pos != 0 {
		t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, pos)
	}
}
|
||||
|
||||
// TestNegativeRune checks that EncodeRune produces the same bytes for the
// rune -1 as it does for RuneError (U+FFFD).
func TestNegativeRune(t *testing.T) {
	var wantBuf, gotBuf [UTFMax]byte
	want := wantBuf[:EncodeRune(wantBuf[:], RuneError)]
	got := gotBuf[:EncodeRune(gotBuf[:], -1)]
	if !bytes.Equal(got, want) {
		t.Errorf("incorrect encoding [% x] for -1; expected [% x]", got, want)
	}
}
|
||||
|
||||
// RuneCountTest is one rune-counting case: an input and its expected count.
type RuneCountTest struct {
	in  string // input text, which may contain invalid bytes
	out int    // expected number of runes
}
|
||||
|
||||
var runecounttests = []RuneCountTest{
|
||||
{"abcd", 4},
|
||||
{"☺☻☹", 3},
|
||||
{"1,2,3,4", 7},
|
||||
{"\xe2\x00", 2},
|
||||
{"\xe2\x80", 2},
|
||||
{"a\xe2\x80", 3},
|
||||
}
|
||||
|
||||
func TestRuneCount(t *testing.T) {
|
||||
for _, tt := range runecounttests {
|
||||
if out := RuneCountInString(tt.in); out != tt.out {
|
||||
t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
|
||||
}
|
||||
if out := RuneCount([]byte(tt.in)); out != tt.out {
|
||||
t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RuneLenTest is one RuneLen case: a rune and the size expected for it.
type RuneLenTest struct {
	r    rune // rune to measure
	size int  // expected RuneLen result
}
|
||||
|
||||
var runelentests = []RuneLenTest{
|
||||
{0, 1},
|
||||
{'e', 1},
|
||||
{'é', 2},
|
||||
{'☺', 3},
|
||||
{RuneError, 3},
|
||||
{MaxRune, 4},
|
||||
{0xD800, -1},
|
||||
{0xDFFF, -1},
|
||||
{MaxRune + 1, -1},
|
||||
{-1, -1},
|
||||
}
|
||||
|
||||
func TestRuneLen(t *testing.T) {
|
||||
for _, tt := range runelentests {
|
||||
if size := RuneLen(tt.r); size != tt.size {
|
||||
t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ValidTest is one validity case: an input and whether it is expected to be
// reported as valid.
type ValidTest struct {
	in  string // input text
	out bool   // expected result of Valid and ValidString
}
|
||||
|
||||
var validTests = []ValidTest{
|
||||
{"", true},
|
||||
{"a", true},
|
||||
{"abc", true},
|
||||
{"Ж", true},
|
||||
{"ЖЖ", true},
|
||||
{"брэд-ЛГТМ", true},
|
||||
{"☺☻☹", true},
|
||||
{"aa\xe2", false},
|
||||
{string([]byte{66, 250}), false},
|
||||
{string([]byte{66, 250, 67}), false},
|
||||
{"a\uFFFDb", true},
|
||||
{string("\xF4\x8F\xBF\xBF"), true}, // U+10FFFF
|
||||
{string("\xF4\x90\x80\x80"), false}, // U+10FFFF+1; out of range
|
||||
{string("\xF7\xBF\xBF\xBF"), false}, // 0x1FFFFF; out of range
|
||||
{string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range
|
||||
{string("\xc0\x80"), false}, // U+0000 encoded in two bytes: incorrect
|
||||
{string("\xed\xa0\x80"), false}, // U+D800 high surrogate (sic)
|
||||
{string("\xed\xbf\xbf"), false}, // U+DFFF low surrogate (sic)
|
||||
}
|
||||
|
||||
func TestValid(t *testing.T) {
|
||||
for _, tt := range validTests {
|
||||
if Valid([]byte(tt.in)) != tt.out {
|
||||
t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out)
|
||||
}
|
||||
if ValidString(tt.in) != tt.out {
|
||||
t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ValidRuneTest is one ValidRune case: a rune and the expected verdict.
type ValidRuneTest struct {
	r  rune // rune to check
	ok bool // expected ValidRune result
}
|
||||
|
||||
var validrunetests = []ValidRuneTest{
|
||||
{0, true},
|
||||
{'e', true},
|
||||
{'é', true},
|
||||
{'☺', true},
|
||||
{RuneError, true},
|
||||
{MaxRune, true},
|
||||
{0xD7FF, true},
|
||||
{0xD800, false},
|
||||
{0xDFFF, false},
|
||||
{0xE000, true},
|
||||
{MaxRune + 1, false},
|
||||
{-1, false},
|
||||
}
|
||||
|
||||
func TestValidRune(t *testing.T) {
|
||||
for _, tt := range validrunetests {
|
||||
if ok := ValidRune(tt.r); ok != tt.ok {
|
||||
t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkRuneCountTenASCIIChars measures RuneCount on a ten-byte ASCII slice.
func BenchmarkRuneCountTenASCIIChars(b *testing.B) {
	digits := []byte("0123456789")
	for iter := 0; iter < b.N; iter++ {
		RuneCount(digits)
	}
}
|
||||
|
||||
// BenchmarkRuneCountTenJapaneseChars measures RuneCount on a slice holding ten
// Japanese characters.
func BenchmarkRuneCountTenJapaneseChars(b *testing.B) {
	text := []byte("日本語日本語日本語日")
	for iter := 0; iter < b.N; iter++ {
		RuneCount(text)
	}
}
|
||||
|
||||
// BenchmarkRuneCountInStringTenASCIIChars measures RuneCountInString on a
// ten-byte ASCII string.
func BenchmarkRuneCountInStringTenASCIIChars(b *testing.B) {
	const input = "0123456789"
	for iter := 0; iter < b.N; iter++ {
		RuneCountInString(input)
	}
}
|
||||
|
||||
// BenchmarkRuneCountInStringTenJapaneseChars measures RuneCountInString on a
// string of ten Japanese characters.
func BenchmarkRuneCountInStringTenJapaneseChars(b *testing.B) {
	const input = "日本語日本語日本語日"
	for iter := 0; iter < b.N; iter++ {
		RuneCountInString(input)
	}
}
|
||||
|
||||
// ascii100000 is 100,000 bytes of ASCII: the ten decimal digits repeated
// 10,000 times.
var ascii100000 = strings.Repeat("0123456789", 10_000)
|
||||
|
||||
// BenchmarkValidTenASCIIChars measures Valid on a ten-byte ASCII slice.
func BenchmarkValidTenASCIIChars(b *testing.B) {
	digits := []byte("0123456789")
	for iter := 0; iter < b.N; iter++ {
		Valid(digits)
	}
}
|
||||
|
||||
func BenchmarkValid100KASCIIChars(b *testing.B) {
|
||||
s := []byte(ascii100000)
|
||||
for i := 0; i < b.N; i++ {
|
||||
Valid(s)
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkValidTenJapaneseChars measures Valid on a slice holding ten
// Japanese characters.
func BenchmarkValidTenJapaneseChars(b *testing.B) {
	text := []byte("日本語日本語日本語日")
	for iter := 0; iter < b.N; iter++ {
		Valid(text)
	}
}
|
||||
func BenchmarkValidLongMostlyASCII(b *testing.B) {
|
||||
longMostlyASCII := []byte(longStringMostlyASCII)
|
||||
for i := 0; i < b.N; i++ {
|
||||
Valid(longMostlyASCII)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkValidLongJapanese(b *testing.B) {
|
||||
longJapanese := []byte(longStringJapanese)
|
||||
for i := 0; i < b.N; i++ {
|
||||
Valid(longJapanese)
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkValidStringTenASCIIChars measures ValidString on a ten-byte ASCII
// string.
func BenchmarkValidStringTenASCIIChars(b *testing.B) {
	const input = "0123456789"
	for iter := 0; iter < b.N; iter++ {
		ValidString(input)
	}
}
|
||||
|
||||
func BenchmarkValidString100KASCIIChars(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
ValidString(ascii100000)
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkValidStringTenJapaneseChars measures ValidString on a string of ten
// Japanese characters.
func BenchmarkValidStringTenJapaneseChars(b *testing.B) {
	const input = "日本語日本語日本語日"
	for iter := 0; iter < b.N; iter++ {
		ValidString(input)
	}
}
|
||||
|
||||
func BenchmarkValidStringLongMostlyASCII(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
ValidString(longStringMostlyASCII)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkValidStringLongJapanese(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
ValidString(longStringJapanese)
|
||||
}
|
||||
}
|
||||
|
||||
// Long benchmark inputs, filled in by the init function below.
var (
	longStringMostlyASCII string // ~100KB, ~97% ASCII
	longStringJapanese    string // ~100KB, non-ASCII
)

// init builds the two long benchmark strings.
//
// longStringMostlyASCII is assembled from ten-digit ASCII chunks, with every
// hundredth chunk (starting with the first) replaced by a ten-character
// Japanese chunk, until at least 100,000 bytes have been written.
// longStringJapanese repeats the Japanese chunk as many whole times as fit in
// 100,000 bytes.
func init() {
	const (
		japanese = "日本語日本語日本語日"
		digits   = "0123456789"
		target   = 100_000
	)
	var sb strings.Builder
	for i := 0; sb.Len() < target; i++ {
		chunk := digits
		if i%100 == 0 {
			chunk = japanese
		}
		sb.WriteString(chunk)
	}
	longStringMostlyASCII = sb.String()
	longStringJapanese = strings.Repeat(japanese, target/len(japanese))
}
|
||||
|
||||
// BenchmarkEncodeASCIIRune measures EncodeRune for the ASCII rune 'a', writing
// into a UTFMax-byte buffer allocated once before the loop.
func BenchmarkEncodeASCIIRune(b *testing.B) {
	dst := make([]byte, UTFMax)
	for iter := 0; iter < b.N; iter++ {
		EncodeRune(dst, 'a')
	}
}
|
||||
|
||||
// BenchmarkEncodeJapaneseRune measures EncodeRune for the rune '本', writing
// into a UTFMax-byte buffer allocated once before the loop.
func BenchmarkEncodeJapaneseRune(b *testing.B) {
	dst := make([]byte, UTFMax)
	for iter := 0; iter < b.N; iter++ {
		EncodeRune(dst, '本')
	}
}
|
||||
|
||||
// BenchmarkAppendASCIIRune measures AppendRune for the ASCII rune 'a'. Each
// call appends to the zero-length prefix of a UTFMax-byte buffer, so the
// destination always has UTFMax bytes of spare capacity.
func BenchmarkAppendASCIIRune(b *testing.B) {
	scratch := make([]byte, UTFMax)
	for iter := 0; iter < b.N; iter++ {
		AppendRune(scratch[:0], 'a')
	}
}
|
||||
|
||||
// BenchmarkAppendJapaneseRune measures AppendRune for the rune '本'. Each call
// appends to the zero-length prefix of a UTFMax-byte buffer, so the
// destination always has UTFMax bytes of spare capacity.
func BenchmarkAppendJapaneseRune(b *testing.B) {
	scratch := make([]byte, UTFMax)
	for iter := 0; iter < b.N; iter++ {
		AppendRune(scratch[:0], '本')
	}
}
|
||||
|
||||
// BenchmarkDecodeASCIIRune measures DecodeRune on the single byte 'a'.
func BenchmarkDecodeASCIIRune(b *testing.B) {
	input := []byte{'a'}
	for iter := 0; iter < b.N; iter++ {
		DecodeRune(input)
	}
}
|
||||
|
||||
// BenchmarkDecodeJapaneseRune measures DecodeRune on the encoding of '本'.
func BenchmarkDecodeJapaneseRune(b *testing.B) {
	input := []byte("本")
	for iter := 0; iter < b.N; iter++ {
		DecodeRune(input)
	}
}
|
||||
|
||||
// boolSink receives the boolean results of benchmarked calls. Storing them in
// a package-level variable keeps the calls from being removed as dead code.
var boolSink bool
|
||||
|
||||
func BenchmarkFullRune(b *testing.B) {
|
||||
benchmarks := []struct {
|
||||
name string
|
||||
data []byte
|
||||
}{
|
||||
{"ASCII", []byte("a")},
|
||||
{"Incomplete", []byte("\xf0\x90\x80")},
|
||||
{"Japanese", []byte("本")},
|
||||
}
|
||||
for _, bm := range benchmarks {
|
||||
b.Run(bm.name, func(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
boolSink = FullRune(bm.data)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user