Initial commit: Go 1.23 release state

This commit is contained in:
Vorapol Rinsatitnon
2024-09-21 23:49:08 +10:00
commit 17cd57a668
13231 changed files with 3114330 additions and 0 deletions

102
src/internal/abi/abi.go Normal file
View File

@@ -0,0 +1,102 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
import (
"internal/goarch"
"unsafe"
)
// RegArgs is a struct that has space for each argument
// and return value register on the current architecture.
//
// Assembly code knows the layout of the first two fields
// of RegArgs.
//
// RegArgs also contains additional space to hold pointers
// when it may not be safe to keep them only in the integer
// register space otherwise.
// RegArgs is a struct that has space for each argument
// and return value register on the current architecture.
//
// Assembly code knows the layout of the first two fields
// of RegArgs.
//
// RegArgs also contains additional space to hold pointers
// when it may not be safe to keep them only in the integer
// register space otherwise.
type RegArgs struct {
	// Values in these slots should be precisely the bit-by-bit
	// representation of how they would appear in a register.
	//
	// This means that on big endian arches, integer values should
	// be in the top bits of the slot. Floats are usually just
	// directly represented, but some architectures treat narrow
	// width floating point values specially (e.g. they're promoted
	// first, or they need to be NaN-boxed).
	Ints   [IntArgRegs]uintptr  // untyped integer registers
	Floats [FloatArgRegs]uint64 // untyped float registers

	// Fields above this point are known to assembly.
	// Do not reorder or insert fields before this line.

	// Ptrs is a space that duplicates Ints but with pointer type,
	// used to make pointers passed or returned in registers
	// visible to the GC by making the type unsafe.Pointer.
	Ptrs [IntArgRegs]unsafe.Pointer

	// ReturnIsPtr is a bitmap that indicates which registers
	// contain or will contain pointers on the return path from
	// a reflectcall. The i'th bit indicates whether the i'th
	// register contains or will contain a valid Go pointer.
	ReturnIsPtr IntArgRegBitmap
}
// Dump prints the raw contents of the register slots (integers,
// floats, then pointers) via the print/println builtins, as a
// debugging aid.
func (r *RegArgs) Dump() {
	print("Ints:")
	for i := range r.Ints {
		print(" ", r.Ints[i])
	}
	println()
	print("Floats:")
	for i := range r.Floats {
		print(" ", r.Floats[i])
	}
	println()
	print("Ptrs:")
	for i := range r.Ptrs {
		print(" ", r.Ptrs[i])
	}
	println()
}
// IntRegArgAddr returns a pointer inside of r.Ints[reg] that is appropriately
// offset for an argument of size argSize.
//
// argSize must be non-zero, fit in a register, and a power-of-two.
//
// This method is a helper for dealing with the endianness of different CPU
// architectures, since sub-word-sized arguments in big endian architectures
// need to be "aligned" to the upper edge of the register to be interpreted
// by the CPU correctly.
func (r *RegArgs) IntRegArgAddr(reg int, argSize uintptr) unsafe.Pointer {
	if argSize > goarch.PtrSize || argSize == 0 || argSize&(argSize-1) != 0 {
		panic("invalid argSize")
	}
	offset := uintptr(0)
	if goarch.BigEndian {
		// Sub-word values live in the high-order bytes of the slot,
		// so point past the unused low-order bytes.
		offset = goarch.PtrSize - argSize
	}
	// unsafe.Add keeps the arithmetic in the unsafe.Pointer domain,
	// which is the idiomatic (and checkptr-friendlier) form of the
	// unsafe.Pointer(uintptr(p) + offset) pattern.
	return unsafe.Add(unsafe.Pointer(&r.Ints[reg]), offset)
}
// IntArgRegBitmap is a bitmap large enough to hold one bit per
// integer argument/return register.
type IntArgRegBitmap [(IntArgRegs + 7) / 8]uint8

// Set sets the i'th bit of the bitmap to 1.
func (b *IntArgRegBitmap) Set(i int) {
	byteIdx, bitIdx := i/8, i%8
	b[byteIdx] |= 1 << bitIdx
}

// Get returns whether the i'th bit of the bitmap is set.
//
// nosplit because it's called in extremely sensitive contexts, like
// on the reflectcall return path.
//
//go:nosplit
func (b *IntArgRegBitmap) Get(i int) bool {
	byteIdx, bitIdx := i/8, i%8
	return b[byteIdx]&(1<<bitIdx) != 0
}

View File

@@ -0,0 +1,18 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
const (
	// See abi_generic.go for the meaning of these constants.

	// The 9 integer registers used for arguments and results:
	// RAX, RBX, RCX, RDI, RSI, R8, R9, R10, R11.
	IntArgRegs = 9

	// X0 -> X14.
	FloatArgRegs = 15

	// We use SSE2 registers which support 64-bit float operations.
	EffectiveFloatRegSize = 8
)

View File

@@ -0,0 +1,17 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
const (
	// See abi_generic.go for the meaning of these constants.

	// R0 - R15.
	IntArgRegs = 16

	// F0 - F15.
	FloatArgRegs = 16

	EffectiveFloatRegSize = 8
)

View File

@@ -0,0 +1,38 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !goexperiment.regabiargs && !amd64 && !arm64 && !loong64 && !ppc64 && !ppc64le && !riscv64
package abi
const (
	// ABI-related constants.
	//
	// In the generic case, these are all zero,
	// which lets them gracefully degrade to ABI0.
	// Architecture-specific files in this package override
	// these values when the register ABI is in use.

	// IntArgRegs is the number of registers dedicated
	// to passing integer argument values. Result registers are identical
	// to argument registers, so this number is used for those too.
	IntArgRegs = 0

	// FloatArgRegs is the number of registers dedicated
	// to passing floating-point argument values. Result registers are
	// identical to argument registers, so this number is used for
	// those too.
	FloatArgRegs = 0

	// EffectiveFloatRegSize describes the width of floating point
	// registers on the current platform from the ABI's perspective.
	//
	// Since Go only supports 32-bit and 64-bit floating point primitives,
	// this number should be either 0, 4, or 8. 0 indicates no floating
	// point registers for the ABI or that floating point values will be
	// passed via the softfloat ABI.
	//
	// For platforms that support larger floating point register widths,
	// such as x87's 80-bit "registers" (not that we support x87 currently),
	// use 8.
	EffectiveFloatRegSize = 0
)

View File

@@ -0,0 +1,17 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
const (
	// See abi_generic.go for the meaning of these constants.

	// R4 - R19.
	IntArgRegs = 16

	// F0 - F15.
	FloatArgRegs = 16

	EffectiveFloatRegSize = 8
)

View File

@@ -0,0 +1,19 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
package abi
const (
	// See abi_generic.go for the meaning of these constants.

	// R3 - R10, R14 - R17 (8 + 4 registers).
	IntArgRegs = 12

	// F1 - F12.
	FloatArgRegs = 12

	EffectiveFloatRegSize = 8
)

View File

@@ -0,0 +1,17 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
const (
	// See abi_generic.go for the meaning of these constants.

	// X8 - X23.
	IntArgRegs = 16

	// F8 - F23.
	FloatArgRegs = 16

	EffectiveFloatRegSize = 8
)

View File

@@ -0,0 +1,79 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi_test
import (
"internal/abi"
"internal/testenv"
"path/filepath"
"strings"
"testing"
)
// TestFuncPC checks that the FuncPC* intrinsics report the correct
// entry PC, using the address of an assembly function recorded at
// link time as the reference value.
func TestFuncPC(t *testing.T) {
	wantPC := abi.FuncPCTestFnAddr

	// Locally defined function, via the noinline test helper.
	if gotPC := abi.FuncPCTest(); gotPC != wantPC {
		t.Errorf("FuncPC returns wrong PC, want %x, got %x", wantPC, gotPC)
	}

	// Imported function, referenced directly.
	if gotPC := abi.FuncPCABI0(abi.FuncPCTestFn); gotPC != wantPC {
		t.Errorf("FuncPC returns wrong PC, want %x, got %x", wantPC, gotPC)
	}
}
// TestFuncPCCompileError verifies that FuncPC* applied to a function of
// a mismatched ABI is rejected at compile time, by assembling and
// compiling the testdata package by hand and checking the reported
// error lines.
func TestFuncPCCompileError(t *testing.T) {
	// Test that FuncPC* on a function of a mismatched ABI is rejected.
	testenv.MustHaveGoBuild(t)

	// We want to test internal package, which we cannot normally import.
	// Run the assembler and compiler manually.
	tmpdir := t.TempDir()
	asmSrc := filepath.Join("testdata", "x.s")
	goSrc := filepath.Join("testdata", "x.go")
	symabi := filepath.Join(tmpdir, "symabi")
	obj := filepath.Join(tmpdir, "x.o")

	// Write an importcfg file for the dependencies of the package.
	importcfgfile := filepath.Join(tmpdir, "hello.importcfg")
	testenv.WriteImportcfg(t, importcfgfile, nil, "internal/abi")

	// parse assembly code for symabi.
	cmd := testenv.Command(t, testenv.GoToolPath(t), "tool", "asm", "-p=p", "-gensymabis", "-o", symabi, asmSrc)
	out, err := cmd.CombinedOutput()
	if err != nil {
		t.Fatalf("go tool asm -gensymabis failed: %v\n%s", err, out)
	}

	// compile go code.
	cmd = testenv.Command(t, testenv.GoToolPath(t), "tool", "compile", "-importcfg="+importcfgfile, "-p=p", "-symabis", symabi, "-o", obj, goSrc)
	out, err = cmd.CombinedOutput()
	if err == nil {
		t.Fatalf("go tool compile did not fail")
	}

	// Expect errors in line 17, 18, 20, no errors on other lines.
	want := []string{"x.go:17", "x.go:18", "x.go:20"}
	got := strings.Split(string(out), "\n")
	if got[len(got)-1] == "" {
		got = got[:len(got)-1] // remove last empty line
	}
	for i, s := range got {
		// Bound the index by len(want): if the compiler reports more
		// errors than expected, indexing want[i] unconditionally would
		// panic with an index out of range instead of failing the test.
		if i >= len(want) {
			t.Errorf("unexpected extra error: %q", s)
			continue
		}
		if !strings.Contains(s, want[i]) {
			t.Errorf("did not error on line %s", want[i])
		}
	}
	if len(got) != len(want) {
		t.Errorf("unexpected number of errors, want %d, got %d", len(want), len(got))
	}
	if t.Failed() {
		t.Logf("output:\n%s", string(out))
	}
}

View File

@@ -0,0 +1,27 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"

// PTRSIZE is the pointer size in bytes for the target architecture:
// 4 on the 32-bit targets enumerated below, 8 everywhere else.
#ifdef GOARCH_386
#define PTRSIZE 4
#endif
#ifdef GOARCH_arm
#define PTRSIZE 4
#endif
#ifdef GOARCH_mips
#define PTRSIZE 4
#endif
#ifdef GOARCH_mipsle
#define PTRSIZE 4
#endif
#ifndef PTRSIZE
#define PTRSIZE 8
#endif

// FuncPCTestFn is an empty ABI0 function used by the FuncPC tests.
TEXT internalabi·FuncPCTestFn(SB),NOSPLIT,$0-0
	RET

// FuncPCTestFnAddr records the address of FuncPCTestFn at link time,
// so Go test code can compare FuncPC* results against it.
GLOBL internalabi·FuncPCTestFnAddr(SB), NOPTR, $PTRSIZE
DATA internalabi·FuncPCTestFnAddr(SB)/PTRSIZE, $internalabi·FuncPCTestFn(SB)

View File

@@ -0,0 +1,28 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
// These functions are the build-time version of the Go type data structures.
// Their contents must be kept in sync with their definitions.
// Because the host and target type sizes can differ, the compiler and
// linker cannot use the host information that they might get from
// either unsafe.Sizeof and Alignof, nor runtime, reflect, or reflectlite.
// CommonSize returns sizeof(Type) for a compilation target with a given
// ptrSize: four pointer-sized fields (Size_, PtrBytes, Equal, GCData)
// plus two fixed 8-byte groups (Hash through Kind_, and Str/PtrToThis).
func CommonSize(ptrSize int) int {
	const fixedBytes = 8 + 8
	return 4*ptrSize + fixedBytes
}
// StructFieldSize returns sizeof(StructField) for a compilation target
// with a given ptrSize.
func StructFieldSize(ptrSize int) int {
	// A StructField occupies three pointer-sized words — keep in sync
	// with the StructField definition.
	const fieldWords = 3
	return fieldWords * ptrSize
}
// UncommonSize returns sizeof(UncommonType). This currently does not depend on ptrSize.
// This exported function is in an internal package, so it may change to depend on ptrSize in the future.
func UncommonSize() uint64 {
	// PkgPath(4) + Mcount(2) + Xcount(2) + Moff(4) + unused trailing uint32(4).
	return 4 + 2 + 2 + 4 + 4
}
// TFlagOff returns the offset of Type.TFlag for a compilation target with
// a given ptrSize: TFlag follows the two pointer-sized fields Size_ and
// PtrBytes and the 4-byte Hash.
func TFlagOff(ptrSize int) int {
	return 2*ptrSize + 4
}
// ITabTypeOff returns the offset of ITab.Type for a compilation target
// with a given ptrSize: Type is the second field of ITab, directly after
// the pointer-sized Inter field.
func ITabTypeOff(ptrSize int) int {
	return ptrSize
}

View File

@@ -0,0 +1,33 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
import "unsafe"
// NoEscape hides the pointer p from escape analysis, preventing it
// from escaping to the heap. It compiles down to nothing.
//
// WARNING: This is very subtle to use correctly. The caller must
// ensure that it's truly safe for p to not escape to the heap by
// maintaining runtime pointer invariants (for example, that globals
// and the heap may not generally point into a stack).
//
//go:nosplit
//go:nocheckptr
func NoEscape(p unsafe.Pointer) unsafe.Pointer {
	x := uintptr(p)
	// The xor with 0 is a no-op at runtime, but it severs the direct
	// data flow from p to the result that escape analysis tracks.
	// Do not "simplify" this expression.
	return unsafe.Pointer(x ^ 0)
}
// alwaysFalse is never set; it exists only so the compiler cannot
// discard the assignment to escapeSink in Escape below.
var alwaysFalse bool

// escapeSink is written to (behind alwaysFalse) and never read.
var escapeSink any

// Escape forces any pointers in x to escape to the heap.
func Escape[T any](x T) T {
	if alwaysFalse {
		// Never executed, but the mere possibility of this assignment
		// to a package-level variable makes x escape.
		escapeSink = x
	}
	return x
}

View File

@@ -0,0 +1,14 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
// FuncPCTestFn is defined in assembly; its body only returns. It exists
// so tests can compare FuncPC* results against a linker-recorded address.
func FuncPCTestFn()

var FuncPCTestFnAddr uintptr // address of FuncPCTestFn, directly retrieved from assembly

// FuncPCTest returns the entry PC of FuncPCTestFn via the FuncPCABI0
// intrinsic; go:noinline keeps it as a standalone function for the test.
//
//go:noinline
func FuncPCTest() uintptr {
	return FuncPCABI0(FuncPCTestFn)
}

View File

@@ -0,0 +1,31 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !gccgo
package abi
// FuncPC* intrinsics.
//
// CAREFUL: In programs with plugins, FuncPC* can return different values
// for the same function (because there are actually multiple copies of
// the same function in the address space). To be safe, don't use the
// results of this function in any == expression. It is only safe to
// use the result as an address at which to start executing code.
//
// The declarations below are body-less; the compiler supplies the
// implementations. See funcpc_gccgo.go for the gccgo bootstrap fallback.

// FuncPCABI0 returns the entry PC of the function f, which must be a
// direct reference of a function defined as ABI0. Otherwise it is a
// compile-time error.
//
// Implemented as a compile intrinsic.
func FuncPCABI0(f interface{}) uintptr

// FuncPCABIInternal returns the entry PC of the function f. If f is a
// direct reference of a function, it must be defined as ABIInternal.
// Otherwise it is a compile-time error. If f is not a direct reference
// of a defined function, it assumes that f is a func value. Otherwise
// the behavior is undefined.
//
// Implemented as a compile intrinsic.
func FuncPCABIInternal(f interface{}) uintptr

View File

@@ -0,0 +1,21 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// For bootstrapping with gccgo.
//go:build gccgo
package abi
import "unsafe"
// FuncPCABI0 is the pure-Go bootstrap fallback for the FuncPCABI0
// intrinsic: it reinterprets the interface as two words and
// dereferences the second (data) word to obtain the code address.
func FuncPCABI0(f interface{}) uintptr {
	words := (*[2]unsafe.Pointer)(unsafe.Pointer(&f))
	return *(*uintptr)(unsafe.Pointer(words[1]))
}

// FuncPCABIInternal is the bootstrap fallback for the FuncPCABIInternal
// intrinsic; under gccgo it is implemented identically to FuncPCABI0.
func FuncPCABIInternal(f interface{}) uintptr {
	words := (*[2]unsafe.Pointer)(unsafe.Pointer(&f))
	return *(*uintptr)(unsafe.Pointer(words[1]))
}

27
src/internal/abi/iface.go Normal file
View File

@@ -0,0 +1,27 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
import "unsafe"
// The first word of every non-empty interface type contains an *ITab.
// It records the underlying concrete type (Type), the interface type it
// is implementing (Inter), and some ancillary information.
//
// allocated in non-garbage-collected memory
type ITab struct {
	Inter *InterfaceType
	Type  *Type
	Hash  uint32 // copy of Type.Hash. Used for type switches.
	// Fun is declared with length 1 but is variable sized in practice.
	// fun[0]==0 means Type does not implement Inter.
	Fun [1]uintptr
}

// EmptyInterface describes the layout of an "interface{}" or an "any":
// these are represented differently than non-empty interfaces, as the
// first word always points to an abi.Type rather than an *ITab.
type EmptyInterface struct {
	Type *Type
	Data unsafe.Pointer
}

19
src/internal/abi/map.go Normal file
View File

@@ -0,0 +1,19 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
// Map constants common to several packages.
// runtime/runtime-gdb.py:MapTypePrinter contains its own copy.
const (
	// Maximum number of key/elem pairs a bucket can hold.
	MapBucketCountBits = 3                        // log2 of number of elements in a bucket.
	MapBucketCount     = 1 << MapBucketCountBits // 8 pairs per bucket.

	// Maximum key or elem size to keep inline (instead of mallocing per element).
	// Must fit in a uint8.
	// Note: fast map functions cannot handle big elems (bigger than MapMaxElemBytes).
	MapMaxKeyBytes  = 128
	MapMaxElemBytes = 128 // Must fit in a uint8.
)

View File

@@ -0,0 +1,18 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
// RF_State describes the state of a range-over-func loop body.
type RF_State int

// These constants are shared between the compiler, which uses them for state functions
// and panic indicators, and the runtime, which turns them into more meaningful strings.
// For best code generation, RF_DONE and RF_READY should be 0 and 1.
const (
	RF_DONE          = RF_State(iota) // body of loop has exited in a non-panic way
	RF_READY                          // body of loop has not exited yet, is not running -- this is not a panic index
	RF_PANIC                          // body of loop is either currently running, or has panicked
	RF_EXHAUSTED                      // iterator function return, i.e., sequence is "exhausted"
	// RF_MISSING_PANIC is spelled as an explicit 4, continuing the iota
	// sequence above (RF_EXHAUSTED is 3).
	RF_MISSING_PANIC = 4 // body of loop panicked but iterator function defer-recovered it away
)

View File

@@ -0,0 +1,8 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
// ZeroValSize is the size in bytes of runtime.zeroVal.
// Keep in sync with the zeroVal declaration in the runtime.
const ZeroValSize = 1024

33
src/internal/abi/stack.go Normal file
View File

@@ -0,0 +1,33 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
const (
	// StackNosplitBase is the base maximum number of bytes that a chain of
	// NOSPLIT functions can use.
	//
	// This value must be multiplied by the stack guard multiplier, so do not
	// use it directly. See runtime/stack.go:stackNosplit and
	// cmd/internal/objabi/stack.go:StackNosplit.
	StackNosplitBase = 800

	// We have three different sequences for stack bounds checks, depending on
	// whether the stack frame of a function is small, big, or huge.

	// After a stack split check the SP is allowed to be StackSmall bytes below
	// the stack guard.
	//
	// Functions that need frames <= StackSmall can perform the stack check
	// using a single comparison directly between the stack guard and the SP
	// because we ensure that StackSmall bytes of stack space are available
	// beyond the stack guard.
	StackSmall = 128

	// Functions that need frames <= StackBig can assume that neither
	// SP-framesize nor stackGuard-StackSmall will underflow, and thus use a
	// more efficient check. In order to ensure this, StackBig must be <= the
	// size of the unmapped space at zero.
	StackBig = 4096
)

7
src/internal/abi/stub.s Normal file
View File

@@ -0,0 +1,7 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file silences errors about body-less functions
// that are provided by intrinsics in the latest version of the compiler,
// but may not be known to the bootstrap compiler.

View File

@@ -0,0 +1,61 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
// InterfaceSwitch is the static descriptor for a compiled interface
// switch, pairing its cases with a runtime lookup cache.
type InterfaceSwitch struct {
	Cache  *InterfaceSwitchCache
	NCases int

	// Array of NCases elements.
	// Each case must be a non-empty interface type.
	// (Declared with length 1; variable sized in practice.)
	Cases [1]*InterfaceType
}

// InterfaceSwitchCache caches interface-switch dispatch results,
// indexed by a masked hash of the source type.
type InterfaceSwitchCache struct {
	Mask    uintptr                      // mask for index. Must be a power of 2 minus 1
	Entries [1]InterfaceSwitchCacheEntry // Mask+1 entries total
}

// InterfaceSwitchCacheEntry is one cached dispatch result.
type InterfaceSwitchCacheEntry struct {
	// type of source value (a *Type)
	Typ uintptr
	// case # to dispatch to
	Case int
	// itab to use for resulting case variable (a *runtime.itab)
	Itab uintptr
}
// go122InterfaceSwitchCache gates the Go 1.22 interface-switch cache.
const go122InterfaceSwitchCache = true

// UseInterfaceSwitchCache reports whether interface switches should use
// a cache on the given architecture.
func UseInterfaceSwitchCache(goarch string) bool {
	if !go122InterfaceSwitchCache {
		return false
	}
	// We need an atomic load instruction to make the cache multithreaded-safe.
	// (AtomicLoadPtr needs to be implemented in
	// cmd/compile/internal/ssa/_gen/ARCH.rules.)
	switch goarch {
	case "amd64", "arm64", "loong64", "mips", "mipsle",
		"mips64", "mips64le", "ppc64", "ppc64le", "riscv64", "s390x":
		return true
	}
	return false
}
// TypeAssert is the static descriptor for a compiled type assertion,
// pairing the asserted interface type with a runtime lookup cache.
type TypeAssert struct {
	Cache   *TypeAssertCache
	Inter   *InterfaceType
	CanFail bool // true for the comma-ok form, where failure yields ok=false
}

// TypeAssertCache caches type-assertion results, indexed by a masked
// hash of the source type. (Entries is declared with length 1;
// Mask+1 entries in practice, mirroring InterfaceSwitchCache.)
type TypeAssertCache struct {
	Mask    uintptr
	Entries [1]TypeAssertCacheEntry
}

// TypeAssertCacheEntry is one cached type-assertion result.
type TypeAssertCacheEntry struct {
	// type of source value (a *runtime._type)
	Typ uintptr
	// itab to use for result (a *runtime.itab)
	// nil if CanFail is set and conversion would fail.
	Itab uintptr
}

111
src/internal/abi/symtab.go Normal file
View File

@@ -0,0 +1,111 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
// A FuncFlag records bits about a function, passed to the runtime.
type FuncFlag uint8

const (
	// FuncFlagTopFrame indicates a function that appears at the top of its stack.
	// The traceback routines stop at such a function and consider that a
	// successful, complete traversal of the stack.
	// Examples of TopFrame functions include goexit, which appears
	// at the top of a user goroutine stack, and mstart, which appears
	// at the top of a system goroutine stack.
	FuncFlagTopFrame FuncFlag = 1 << iota

	// FuncFlagSPWrite indicates a function that writes an arbitrary value to SP
	// (any write other than adding or subtracting a constant amount).
	// The traceback routines cannot encode such changes into the
	// pcsp tables, so the function traceback cannot safely unwind past
	// SPWrite functions. Stopping at an SPWrite function is considered
	// to be an incomplete unwinding of the stack. In certain contexts
	// (in particular garbage collector stack scans) that is a fatal error.
	FuncFlagSPWrite

	// FuncFlagAsm indicates that a function was implemented in assembly.
	FuncFlagAsm
)
// A FuncID identifies particular functions that need to be treated
// specially by the runtime.
// Note that in some situations involving plugins, there may be multiple
// copies of a particular special runtime function.
type FuncID uint8

const (
	// If you add a FuncID, you probably also want to add an entry to the map in
	// ../../cmd/internal/objabi/funcid.go.
	// The FuncID_xxx entries name the runtime function they identify.
	FuncIDNormal FuncID = iota // not a special function
	FuncID_abort
	FuncID_asmcgocall
	FuncID_asyncPreempt
	FuncID_cgocallback
	FuncID_corostart
	FuncID_debugCallV2
	FuncID_gcBgMarkWorker
	FuncID_goexit
	FuncID_gogo
	FuncID_gopanic
	FuncID_handleAsyncEvent
	FuncID_mcall
	FuncID_morestack
	FuncID_mstart
	FuncID_panicwrap
	FuncID_rt0_go
	FuncID_runfinq
	FuncID_runtime_main
	FuncID_sigpanic
	FuncID_systemstack
	FuncID_systemstack_switch
	FuncIDWrapper // any autogenerated code (hash/eq algorithms, method wrappers, etc.)
)
// ArgsSizeUnknown is set in Func.argsize to mark all functions
// whose argument size is unknown (C vararg functions, and
// assembly code without an explicit specification).
// This value is generated by the compiler, assembler, or linker.
const ArgsSizeUnknown = -0x80000000

// IDs for PCDATA and FUNCDATA tables in Go binaries.
//
// These must agree with ../../../runtime/funcdata.h.
const (
	PCDATA_UnsafePoint   = 0
	PCDATA_StackMapIndex = 1
	PCDATA_InlTreeIndex  = 2
	PCDATA_ArgLiveIndex  = 3

	FUNCDATA_ArgsPointerMaps    = 0
	FUNCDATA_LocalsPointerMaps  = 1
	FUNCDATA_StackObjects       = 2
	FUNCDATA_InlTree            = 3
	FUNCDATA_OpenCodedDeferInfo = 4
	FUNCDATA_ArgInfo            = 5
	FUNCDATA_ArgLiveInfo        = 6
	FUNCDATA_WrapInfo           = 7
)

// Special values for the PCDATA_UnsafePoint table.
const (
	UnsafePointSafe   = -1 // Safe for async preemption
	UnsafePointUnsafe = -2 // Unsafe for async preemption

	// UnsafePointRestart1(2) apply on a sequence of instructions, within
	// which if an async preemption happens, we should back off the PC
	// to the start of the sequence when resuming.
	// We need two so we can distinguish the start/end of the sequence
	// in case that two sequences are next to each other.
	UnsafePointRestart1 = -3
	UnsafePointRestart2 = -4

	// Like UnsafePointRestart1, but back to function entry if async preempted.
	UnsafePointRestartAtEntry = -5
)

const MINFUNC = 16 // minimum size for a function

// FuncTabBucketSize is the size of a bucket in the pc->func lookup table.
const FuncTabBucketSize = 256 * MINFUNC

22
src/internal/abi/testdata/x.go vendored Normal file
View File

@@ -0,0 +1,22 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package x
import "internal/abi"
func Fn0() // defined in assembly (see testdata/x.s), so it is ABI0
func Fn1() {}
var FnExpr func()
func test() {
	_ = abi.FuncPCABI0(Fn0)           // line 16, no error: Fn0 is ABI0
	_ = abi.FuncPCABIInternal(Fn0)    // line 17, error: ABI mismatch
	_ = abi.FuncPCABI0(Fn1)           // line 18, error: Fn1 is ABIInternal (Go)
	_ = abi.FuncPCABIInternal(Fn1)    // line 19, no error
	_ = abi.FuncPCABI0(FnExpr)        // line 20, error: not a direct reference
	_ = abi.FuncPCABIInternal(FnExpr) // line 21, no error: func value allowed
}

6
src/internal/abi/testdata/x.s vendored Normal file
View File

@@ -0,0 +1,6 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Fn0 is an empty function defined in assembly so that the compile-error
// test has a symbol with assembly (ABI0) linkage.
TEXT ·Fn0(SB), 0, $0-0
	RET

803
src/internal/abi/type.go Normal file
View File

@@ -0,0 +1,803 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package abi
import (
"unsafe"
)
// Type is the runtime representation of a Go type.
//
// Be careful about accessing this type at build time, as the version
// of this type in the compiler/linker may not have the same layout
// as the version in the target binary, due to pointer width
// differences and any experiments. Use cmd/compile/internal/rttype
// or the functions in compiletype.go to access this type instead.
// (TODO: this admonition applies to every type in this package.
// Put it in some shared location?)
type Type struct {
	Size_       uintptr
	PtrBytes    uintptr // number of (prefix) bytes in the type that can contain pointers
	Hash        uint32  // hash of type; avoids computation in hash tables
	TFlag       TFlag   // extra type information flags
	Align_      uint8   // alignment of variable with this type
	FieldAlign_ uint8   // alignment of struct field with this type
	// Kind_ is the enumeration for C. Its low 5 bits hold the Kind
	// (see KindMask); the upper bits carry flags such as
	// KindDirectIface and KindGCProg.
	Kind_ Kind
	// Equal is the function for comparing objects of this type
	// (ptr to object A, ptr to object B) -> ==?
	Equal func(unsafe.Pointer, unsafe.Pointer) bool
	// GCData stores the GC type data for the garbage collector.
	// If the KindGCProg bit is set in kind, GCData is a GC program.
	// Otherwise it is a ptrmask bitmap. See mbitmap.go for details.
	GCData    *byte
	Str       NameOff // string form
	PtrToThis TypeOff // type for pointer to this type, may be zero
}
// A Kind represents the specific kind of type that a Type represents.
// The zero Kind is not a valid kind.
type Kind uint8

const (
	Invalid Kind = iota
	Bool
	Int
	Int8
	Int16
	Int32
	Int64
	Uint
	Uint8
	Uint16
	Uint32
	Uint64
	Uintptr
	Float32
	Float64
	Complex64
	Complex128
	Array
	Chan
	Func
	Interface
	Map
	Pointer
	Slice
	String
	Struct
	UnsafePointer
)

// Flag bits stored in Type.Kind_ alongside the 5-bit Kind value.
const (
	// TODO (khr, drchase) why aren't these in TFlag? Investigate, fix if possible.
	KindDirectIface Kind = 1 << 5
	KindGCProg      Kind = 1 << 6 // Type.gc points to GC program
	KindMask        Kind = (1 << 5) - 1
)
// TFlag is used by a Type to signal what extra type information is
// available in the memory directly following the Type value.
type TFlag uint8

const (
	// TFlagUncommon means that there is extra data, an UncommonType,
	// just beyond the shared-per-type common data. That is, the data
	// for struct types will store their UncommonType at one offset, the
	// data for interface types will store their UncommonType at a different
	// offset. UncommonType is always accessed via a pointer that is computed
	// using trust-us-we-are-the-implementors pointer arithmetic.
	//
	// For example, if t.Kind() == Struct and t.tflag&TFlagUncommon != 0,
	// then t has UncommonType data and it can be accessed as:
	//
	//	type structTypeUncommon struct {
	//		structType
	//		u UncommonType
	//	}
	//	u := &(*structTypeUncommon)(unsafe.Pointer(t)).u
	TFlagUncommon TFlag = 1 << 0

	// TFlagExtraStar means the name in the str field has an
	// extraneous '*' prefix. This is because for most types T in
	// a program, the type *T also exists and reusing the str data
	// saves binary size.
	TFlagExtraStar TFlag = 1 << 1

	// TFlagNamed means the type has a name.
	TFlagNamed TFlag = 1 << 2

	// TFlagRegularMemory means that equal and hash functions can treat
	// this type as a single region of t.size bytes.
	TFlagRegularMemory TFlag = 1 << 3

	// TFlagUnrolledBitmap marks special types that are unrolled-bitmap
	// versions of types with GC programs.
	// These types need to be deallocated when the underlying object
	// is freed.
	TFlagUnrolledBitmap TFlag = 1 << 4
)

// NameOff is the offset to a name from moduledata.types. See resolveNameOff in runtime.
type NameOff int32

// TypeOff is the offset to a type from moduledata.types. See resolveTypeOff in runtime.
type TypeOff int32

// TextOff is an offset from the top of a text section. See (rtype).textOff in runtime.
type TextOff int32
// String returns the name of k.
func (k Kind) String() string {
	if idx := int(k); idx < len(kindNames) {
		return kindNames[idx]
	}
	// Out-of-range kinds report the name of the Invalid kind.
	return kindNames[0]
}
// kindNames maps a Kind value to its printed name, indexed by Kind.
var kindNames = []string{
	Invalid:       "invalid",
	Bool:          "bool",
	Int:           "int",
	Int8:          "int8",
	Int16:         "int16",
	Int32:         "int32",
	Int64:         "int64",
	Uint:          "uint",
	Uint8:         "uint8",
	Uint16:        "uint16",
	Uint32:        "uint32",
	Uint64:        "uint64",
	Uintptr:       "uintptr",
	Float32:       "float32",
	Float64:       "float64",
	Complex64:     "complex64",
	Complex128:    "complex128",
	Array:         "array",
	Chan:          "chan",
	Func:          "func",
	Interface:     "interface",
	Map:           "map",
	Pointer:       "ptr",
	Slice:         "slice",
	String:        "string",
	Struct:        "struct",
	UnsafePointer: "unsafe.Pointer",
}
// TypeOf returns the abi.Type of some value.
func TypeOf(a any) *Type {
	eface := *(*EmptyInterface)(unsafe.Pointer(&a))
	// Types are either static (for compiler-created types) or
	// heap-allocated but always reachable (for reflection-created
	// types, held in the central map). So there is no need to
	// escape types. NoEscape here helps avoid an unnecessary escape
	// of a.
	return (*Type)(NoEscape(unsafe.Pointer(eface.Type)))
}

// TypeFor returns the abi.Type for a type parameter.
func TypeFor[T any]() *Type {
	var v T
	if t := TypeOf(v); t != nil {
		return t // optimize for T being a non-interface kind
	}
	return TypeOf((*T)(nil)).Elem() // only for an interface kind
}
// Kind returns the Kind of t, masking off the flag bits stored in Kind_.
func (t *Type) Kind() Kind { return t.Kind_ & KindMask }

// HasName reports whether t has a name (TFlagNamed is set).
func (t *Type) HasName() bool {
	return t.TFlag&TFlagNamed != 0
}

// Pointers reports whether t contains pointers.
func (t *Type) Pointers() bool { return t.PtrBytes != 0 }

// IfaceIndir reports whether t is stored indirectly in an interface value.
func (t *Type) IfaceIndir() bool {
	return t.Kind_&KindDirectIface == 0
}

// IsDirectIface reports whether t is stored directly in an interface value.
func (t *Type) IsDirectIface() bool {
	return t.Kind_&KindDirectIface != 0
}

// GcSlice returns bytes [begin, end) of t's GC data. The slice is built
// over GCData with length end and then resliced from begin; bounds are
// not otherwise validated here.
func (t *Type) GcSlice(begin, end uintptr) []byte {
	return unsafe.Slice(t.GCData, int(end))[begin:]
}
// Method on non-interface type.
type Method struct {
	Name NameOff // name of method
	Mtyp TypeOff // method type (without receiver)
	Ifn  TextOff // fn used in interface call (one-word receiver)
	Tfn  TextOff // fn used for normal method call
}

// UncommonType is present only for defined types or types with methods
// (if T is a defined type, the uncommonTypes for T and *T have methods).
// Using a pointer to this struct reduces the overall size required
// to describe a non-defined type with no methods.
type UncommonType struct {
	PkgPath NameOff // import path; empty for built-in types like int, string
	Mcount  uint16  // number of methods
	Xcount  uint16  // number of exported methods
	Moff    uint32  // offset from this uncommontype to [mcount]Method
	_       uint32  // unused
}
// Methods returns t's method table, which is laid out in memory Moff
// bytes past t. The [1 << 16]Method array type is only a bound for the
// pointer cast: Mcount is a uint16, so at most 65535 entries exist.
func (t *UncommonType) Methods() []Method {
	if t.Mcount == 0 {
		return nil
	}
	return (*[1 << 16]Method)(addChecked(unsafe.Pointer(t), uintptr(t.Moff), "t.mcount > 0"))[:t.Mcount:t.Mcount]
}

// ExportedMethods returns the first Xcount entries of the method table
// (exported methods are presumably sorted first — confirm against the
// compiler's method-table layout).
func (t *UncommonType) ExportedMethods() []Method {
	if t.Xcount == 0 {
		return nil
	}
	return (*[1 << 16]Method)(addChecked(unsafe.Pointer(t), uintptr(t.Moff), "t.xcount > 0"))[:t.Xcount:t.Xcount]
}
// addChecked returns p+x.
//
// The whySafe string is ignored, so that the function still inlines
// as efficiently as p+x, but all call sites should use the string to
// record why the addition is safe, which is to say why the addition
// does not cause x to advance to the very end of p's allocation
// and therefore point incorrectly at the next block in memory.
func addChecked(p unsafe.Pointer, x uintptr, whySafe string) unsafe.Pointer {
return unsafe.Pointer(uintptr(p) + x)
}
// Imethod represents a method on an interface type.
type Imethod struct {
	Name NameOff // name of method
	Typ TypeOff // .(*FuncType) underneath
}

// ArrayType represents a fixed array type.
type ArrayType struct {
	Type
	Elem *Type // array element type
	Slice *Type // slice type
	Len uintptr // array length
}

// Len returns the length of t if t is an array type, otherwise 0.
func (t *Type) Len() int {
	if t.Kind() == Array {
		return int((*ArrayType)(unsafe.Pointer(t)).Len)
	}
	return 0
}

// Common returns the common portion of t: t itself, since every
// kind-specific descriptor embeds Type at its front.
func (t *Type) Common() *Type {
	return t
}

// ChanDir represents a channel type's direction.
type ChanDir int

const (
	RecvDir ChanDir = 1 << iota // <-chan
	SendDir // chan<-
	BothDir = RecvDir | SendDir // chan
	InvalidDir ChanDir = 0
)

// ChanType represents a channel type.
type ChanType struct {
	Type
	Elem *Type
	Dir ChanDir
}

// structTypeUncommon is the in-memory layout of a struct type that
// carries uncommon (method) data directly after its StructType.
type structTypeUncommon struct {
	StructType
	u UncommonType
}
// ChanDir returns the direction of t if t is a channel type, otherwise InvalidDir (0).
func (t *Type) ChanDir() ChanDir {
	if t.Kind() != Chan {
		return InvalidDir
	}
	// Safe to reinterpret: a Chan kind guarantees the descriptor is a ChanType.
	return (*ChanType)(unsafe.Pointer(t)).Dir
}
// Uncommon returns a pointer to T's "uncommon" data if there is any, otherwise nil.
func (t *Type) Uncommon() *UncommonType {
	if t.TFlag&TFlagUncommon == 0 {
		return nil
	}
	// When TFlagUncommon is set, the UncommonType is laid out in memory
	// immediately after the kind-specific descriptor, so overlay a struct
	// of the matching shape and take the address of its trailing field.
	switch t.Kind() {
	case Struct:
		return &(*structTypeUncommon)(unsafe.Pointer(t)).u
	case Pointer:
		type u struct {
			PtrType
			u UncommonType
		}
		return &(*u)(unsafe.Pointer(t)).u
	case Func:
		type u struct {
			FuncType
			u UncommonType
		}
		return &(*u)(unsafe.Pointer(t)).u
	case Slice:
		type u struct {
			SliceType
			u UncommonType
		}
		return &(*u)(unsafe.Pointer(t)).u
	case Array:
		type u struct {
			ArrayType
			u UncommonType
		}
		return &(*u)(unsafe.Pointer(t)).u
	case Chan:
		type u struct {
			ChanType
			u UncommonType
		}
		return &(*u)(unsafe.Pointer(t)).u
	case Map:
		type u struct {
			MapType
			u UncommonType
		}
		return &(*u)(unsafe.Pointer(t)).u
	case Interface:
		type u struct {
			InterfaceType
			u UncommonType
		}
		return &(*u)(unsafe.Pointer(t)).u
	default:
		// Kinds with no extra descriptor fields: the UncommonType
		// follows the plain Type directly.
		type u struct {
			Type
			u UncommonType
		}
		return &(*u)(unsafe.Pointer(t)).u
	}
}
// Elem returns the element type for t if t is an array, channel, map, pointer, or slice, otherwise nil.
func (t *Type) Elem() *Type {
	p := unsafe.Pointer(t)
	switch t.Kind() {
	case Array:
		return (*ArrayType)(p).Elem
	case Chan:
		return (*ChanType)(p).Elem
	case Map:
		return (*MapType)(p).Elem
	case Pointer:
		return (*PtrType)(p).Elem
	case Slice:
		return (*SliceType)(p).Elem
	default:
		return nil
	}
}
// StructType returns t cast to a *StructType, or nil if its tag does not match.
func (t *Type) StructType() *StructType {
	if t.Kind() == Struct {
		return (*StructType)(unsafe.Pointer(t))
	}
	return nil
}

// MapType returns t cast to a *MapType, or nil if its tag does not match.
func (t *Type) MapType() *MapType {
	if t.Kind() == Map {
		return (*MapType)(unsafe.Pointer(t))
	}
	return nil
}

// ArrayType returns t cast to a *ArrayType, or nil if its tag does not match.
func (t *Type) ArrayType() *ArrayType {
	if t.Kind() == Array {
		return (*ArrayType)(unsafe.Pointer(t))
	}
	return nil
}

// FuncType returns t cast to a *FuncType, or nil if its tag does not match.
func (t *Type) FuncType() *FuncType {
	if t.Kind() == Func {
		return (*FuncType)(unsafe.Pointer(t))
	}
	return nil
}

// InterfaceType returns t cast to a *InterfaceType, or nil if its tag does not match.
func (t *Type) InterfaceType() *InterfaceType {
	if t.Kind() == Interface {
		return (*InterfaceType)(unsafe.Pointer(t))
	}
	return nil
}
// Size returns the size of data with type t.
func (t *Type) Size() uintptr { return t.Size_ }

// Align returns the alignment of data with type t.
func (t *Type) Align() int { return int(t.Align_) }

// FieldAlign returns t's FieldAlign_ value — presumably the alignment
// of data with type t when used as a struct field; confirm against users.
func (t *Type) FieldAlign() int { return int(t.FieldAlign_) }

// InterfaceType represents an interface type.
type InterfaceType struct {
	Type
	PkgPath Name // import path
	Methods []Imethod // sorted by hash
}

// ExportedMethods returns t's exported methods, or nil if t has no
// uncommon (method) data at all.
func (t *Type) ExportedMethods() []Method {
	ut := t.Uncommon()
	if ut == nil {
		return nil
	}
	return ut.ExportedMethods()
}

// NumMethod returns the number of methods of t: for interface types,
// the number of interface methods; for all other types, the number of
// exported methods.
func (t *Type) NumMethod() int {
	if t.Kind() == Interface {
		tt := (*InterfaceType)(unsafe.Pointer(t))
		return tt.NumMethod()
	}
	return len(t.ExportedMethods())
}

// NumMethod returns the number of interface methods in the type's method set.
func (t *InterfaceType) NumMethod() int { return len(t.Methods) }
// MapType represents a map type.
type MapType struct {
	Type
	Key *Type // map key type
	Elem *Type // map element (value) type
	Bucket *Type // internal type representing a hash bucket
	// function for hashing keys (ptr to key, seed) -> hash
	Hasher func(unsafe.Pointer, uintptr) uintptr
	KeySize uint8 // size of key slot
	ValueSize uint8 // size of elem slot
	BucketSize uint16 // size of bucket
	Flags uint32 // bit flags; see the accessors below
}

// Note: flag values must match those used in the TMAP case
// in ../cmd/compile/internal/reflectdata/reflect.go:writeType.
func (mt *MapType) IndirectKey() bool { // store ptr to key instead of key itself
	return mt.Flags&1 != 0
}
func (mt *MapType) IndirectElem() bool { // store ptr to elem instead of elem itself
	return mt.Flags&2 != 0
}
func (mt *MapType) ReflexiveKey() bool { // true if k==k for all keys
	return mt.Flags&4 != 0
}
func (mt *MapType) NeedKeyUpdate() bool { // true if we need to update key on an overwrite
	return mt.Flags&8 != 0
}
func (mt *MapType) HashMightPanic() bool { // true if hash function might panic
	return mt.Flags&16 != 0
}

// Key returns the key type of t if t is a map type, otherwise nil.
func (t *Type) Key() *Type {
	if t.Kind() == Map {
		return (*MapType)(unsafe.Pointer(t)).Key
	}
	return nil
}

// SliceType represents a slice type.
type SliceType struct {
	Type
	Elem *Type // slice element type
}
// FuncType represents a function type.
//
// A *Type for each in and out parameter is stored in an array that
// directly follows the FuncType (and possibly its UncommonType). So
// a function type with one method, one input, and one output is:
//
//	struct {
//		FuncType
//		UncommonType
//		[2]*Type // [0] is in, [1] is out
//	}
type FuncType struct {
	Type
	InCount uint16
	OutCount uint16 // top bit is set if last input parameter is ...
}

// In returns the type of the i'th input parameter of function type t.
func (t *FuncType) In(i int) *Type {
	return t.InSlice()[i]
}

// NumIn returns the number of input parameters of t.
func (t *FuncType) NumIn() int {
	return int(t.InCount)
}

// NumOut returns the number of output (result) parameters of t.
// The top bit of OutCount is the variadic flag and is masked off.
func (t *FuncType) NumOut() int {
	return int(t.OutCount & (1<<15 - 1))
}
// Out returns the type of the i'th output (result) parameter of function type t.
func (t *FuncType) Out(i int) *Type {
	return t.OutSlice()[i]
}
// InSlice returns the slice of input parameter types of t.
// The parameter array is laid out in memory directly after the FuncType
// (and after the UncommonType, when TFlagUncommon is set).
func (t *FuncType) InSlice() []*Type {
	uadd := unsafe.Sizeof(*t)
	if t.TFlag&TFlagUncommon != 0 {
		uadd += unsafe.Sizeof(UncommonType{})
	}
	if t.InCount == 0 {
		return nil
	}
	// The [1 << 16] array type is a fiction used to build the slice
	// header; InCount is a uint16, so it cannot exceed that bound.
	return (*[1 << 16]*Type)(addChecked(unsafe.Pointer(t), uadd, "t.inCount > 0"))[:t.InCount:t.InCount]
}

// OutSlice returns the slice of output (result) types of t, which
// follow the input types in the same trailing array.
func (t *FuncType) OutSlice() []*Type {
	outCount := uint16(t.NumOut())
	if outCount == 0 {
		return nil
	}
	uadd := unsafe.Sizeof(*t)
	if t.TFlag&TFlagUncommon != 0 {
		uadd += unsafe.Sizeof(UncommonType{})
	}
	// [1 << 17] leaves room for both the inputs and the outputs; the
	// results occupy entries [InCount, InCount+outCount).
	return (*[1 << 17]*Type)(addChecked(unsafe.Pointer(t), uadd, "outCount > 0"))[t.InCount : t.InCount+outCount : t.InCount+outCount]
}

// IsVariadic reports whether the last input parameter of t is a ...
// parameter (stored in the top bit of OutCount).
func (t *FuncType) IsVariadic() bool {
	return t.OutCount&(1<<15) != 0
}
// PtrType represents a pointer type.
type PtrType struct {
	Type
	Elem *Type // pointer element (pointed at) type
}

// StructField describes a single field of a struct type.
type StructField struct {
	Name Name // name is always non-empty
	Typ *Type // type of field
	Offset uintptr // byte offset of field
}

// Embedded reports whether f is an embedded (anonymous) field.
func (f *StructField) Embedded() bool {
	return f.Name.IsEmbedded()
}

// StructType represents a struct type.
type StructType struct {
	Type
	PkgPath Name
	Fields []StructField
}
// Name is an encoded type Name with optional extra data.
//
// The first byte is a bit field containing:
//
//	1<<0 the name is exported
//	1<<1 tag data follows the name
//	1<<2 pkgPath nameOff follows the name and tag
//	1<<3 the name is of an embedded (a.k.a. anonymous) field
//
// Following that, there is a varint-encoded length of the name,
// followed by the name itself.
//
// If tag data is present, it also has a varint-encoded length
// followed by the tag itself.
//
// If the import path follows, then 4 bytes at the end of
// the data form a nameOff. The import path is only set for concrete
// methods that are defined in a different package than their type.
//
// If a name starts with "*", then the exported bit represents
// whether the pointed to type is exported.
//
// Note: this encoding must match here and in:
//   cmd/compile/internal/reflectdata/reflect.go
//   cmd/link/internal/ld/decodesym.go
type Name struct {
	Bytes *byte // pointer to the first byte of the encoding described above
}
// DataChecked does pointer arithmetic on n's Bytes, and that arithmetic is asserted to
// be safe for the reason in whySafe (which can appear in a backtrace, etc.)
func (n Name) DataChecked(off int, whySafe string) *byte {
	return (*byte)(addChecked(unsafe.Pointer(n.Bytes), uintptr(off), whySafe))
}

// Data does pointer arithmetic on n's Bytes, and that arithmetic is asserted to
// be safe because the runtime made the call (other packages use DataChecked).
func (n Name) Data(off int) *byte {
	return (*byte)(addChecked(unsafe.Pointer(n.Bytes), uintptr(off), "the runtime doesn't need to give you a reason"))
}

// IsExported reports whether the exported bit (1<<0) of the flags byte is set.
func (n Name) IsExported() bool {
	return (*n.Bytes)&(1<<0) != 0
}

// HasTag reports whether tag data follows the name (flag bit 1<<1).
func (n Name) HasTag() bool {
	return (*n.Bytes)&(1<<1) != 0
}

// IsEmbedded reports whether n names an embedded (anonymous) field (flag bit 1<<3).
func (n Name) IsEmbedded() bool {
	return (*n.Bytes)&(1<<3) != 0
}
// ReadVarint parses a varint as encoded by encoding/binary, starting at
// byte offset off within n's data.
// It returns the number of encoded bytes and the encoded value.
func (n Name) ReadVarint(off int) (int, int) {
	v := 0
	for i := 0; ; i++ {
		x := *n.DataChecked(off+i, "read varint")
		// Low 7 bits are data; the high bit marks a continuation byte.
		v += int(x&0x7f) << (7 * i)
		if x&0x80 == 0 {
			return i + 1, v
		}
	}
}

// IsBlank indicates whether n is "_".
func (n Name) IsBlank() bool {
	if n.Bytes == nil {
		return false
	}
	// Offset 1 skips the flags byte; a blank name is length 1 and '_'.
	_, l := n.ReadVarint(1)
	return l == 1 && *n.Data(2) == '_'
}
// writeVarint writes n to buf in varint form. Returns the
// number of bytes written. n must be nonnegative.
// Writes at most 10 bytes.
func writeVarint(buf []byte, n int) int {
	i := 0
	for n >= 0x80 {
		// More significant bits remain: emit the low 7 bits
		// with the continuation bit set.
		buf[i] = byte(n) | 0x80
		n >>= 7
		i++
	}
	// Final byte carries the remaining value with the high bit clear.
	buf[i] = byte(n)
	return i + 1
}
// Name returns the name string for n, or empty if there is none.
// (The data read here is the name, not the tag — see Tag below.)
func (n Name) Name() string {
	if n.Bytes == nil {
		return ""
	}
	i, l := n.ReadVarint(1)
	return unsafe.String(n.DataChecked(1+i, "non-empty string"), l)
}

// Tag returns the tag string for n, or empty if there is none.
func (n Name) Tag() string {
	if !n.HasTag() {
		return ""
	}
	// Skip the name (its varint length header plus data) to reach the tag.
	i, l := n.ReadVarint(1)
	i2, l2 := n.ReadVarint(1 + i + l)
	return unsafe.String(n.DataChecked(1+i+l+i2, "non-empty string"), l2)
}
// NewName allocates and returns the encoded form of a Name carrying the
// given name and tag, with the exported and embedded flag bits set as
// requested. NewName never sets the pkgPath bit (1<<2) and never appends
// a trailing nameOff.
func NewName(n, tag string, exported, embedded bool) Name {
	if len(n) >= 1<<29 {
		panic("abi.NewName: name too long: " + n[:1024] + "...")
	}
	if len(tag) >= 1<<29 {
		panic("abi.NewName: tag too long: " + tag[:1024] + "...")
	}
	var nameLen [10]byte
	var tagLen [10]byte
	nameLenLen := writeVarint(nameLen[:], len(n))
	tagLenLen := writeVarint(tagLen[:], len(tag))
	var bits byte
	// l accumulates the total encoded length: flags byte, name header+data,
	// and (if present) tag header+data.
	l := 1 + nameLenLen + len(n)
	if exported {
		bits |= 1 << 0
	}
	if len(tag) > 0 {
		l += tagLenLen + len(tag)
		bits |= 1 << 1
	}
	if embedded {
		bits |= 1 << 3
	}
	b := make([]byte, l)
	b[0] = bits
	copy(b[1:], nameLen[:nameLenLen])
	copy(b[1+nameLenLen:], n)
	if len(tag) > 0 {
		tb := b[1+nameLenLen+len(n):]
		copy(tb, tagLen[:tagLenLen])
		copy(tb[tagLenLen:], tag)
	}
	return Name{Bytes: &b[0]}
}
// Limits on the argument metadata recorded for printing function
// arguments in tracebacks.
const (
	TraceArgsLimit = 10 // print no more than 10 args/components
	TraceArgsMaxDepth = 5 // no more than 5 layers of nesting
	// maxLen is a (conservative) upper bound of the byte stream length. For
	// each arg/component, it has no more than 2 bytes of data (size, offset),
	// and no more than one {, }, ... at each level (it cannot have both the
	// data and ... unless it is the last one, just be conservative). Plus 1
	// for _endSeq.
	TraceArgsMaxLen = (TraceArgsMaxDepth*3+2)*TraceArgsLimit + 1
)

// Populate the data.
// The data is a stream of bytes, which contains the offsets and sizes of the
// non-aggregate arguments or non-aggregate fields/elements of aggregate-typed
// arguments, along with special "operators". Specifically,
//   - for each non-aggregate arg/field/element, its offset from FP (1 byte) and
//     size (1 byte)
//   - special operators:
//   - 0xff - end of sequence
//   - 0xfe - print { (at the start of an aggregate-typed argument)
//   - 0xfd - print } (at the end of an aggregate-typed argument)
//   - 0xfc - print ... (more args/fields/elements)
//   - 0xfb - print _ (offset too large)
const (
	TraceArgsEndSeq = 0xff
	TraceArgsStartAgg = 0xfe
	TraceArgsEndAgg = 0xfd
	TraceArgsDotdotdot = 0xfc
	TraceArgsOffsetTooLarge = 0xfb
	TraceArgsSpecial = 0xf0 // above this are operators, below this are ordinary offsets
)

// MaxPtrmaskBytes is the maximum length of a GC ptrmask bitmap,
// which holds 1-bit entries describing where pointers are in a given type.
// Above this length, the GC information is recorded as a GC program,
// which can express repetition compactly. In either form, the
// information is used by the runtime to initialize the heap bitmap,
// and for large types (like 128 or more words), they are roughly the
// same speed. GC programs are never much larger and often more
// compact. (If large arrays are involved, they can be arbitrarily
// more compact.)
//
// The cutoff must be large enough that any allocation large enough to
// use a GC program is large enough that it does not share heap bitmap
// bytes with any other objects, allowing the GC program execution to
// assume an aligned start and not use atomic operations. In the current
// runtime, this means all malloc size classes larger than the cutoff must
// be multiples of four words. On 32-bit systems that's 16 bytes, and
// all size classes >= 16 bytes are 16-byte aligned, so no real constraint.
// On 64-bit systems, that's 32 bytes, and 32-byte alignment is guaranteed
// for size classes >= 256 bytes. On a 64-bit system, 256 bytes allocated
// is 32 pointers, the bits for which fit in 4 bytes. So MaxPtrmaskBytes
// must be >= 4.
//
// We used to use 16 because the GC programs do have some constant overhead
// to get started, and processing 128 pointers seems to be enough to
// amortize that overhead well.
//
// To make sure that the runtime's chansend can call typeBitsBulkBarrier,
// we raised the limit to 2048, so that even 32-bit systems are guaranteed to
// use bitmaps for objects up to 64 kB in size.
const MaxPtrmaskBytes = 2048

19
src/internal/asan/asan.go Normal file
View File

@@ -0,0 +1,19 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build asan
package asan
import (
"unsafe"
)
const Enabled = true
//go:linkname Read runtime.asanread
func Read(addr unsafe.Pointer, len uintptr)
//go:linkname Write runtime.asanwrite
func Write(addr unsafe.Pointer, len uintptr)

10
src/internal/asan/doc.go Normal file
View File

@@ -0,0 +1,10 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package asan contains helper functions for manually instrumenting
// code for the address sanitizer.
// The runtime package intentionally exports these functions only in the
// asan build; this package exports them unconditionally but without the
// "asan" build tag they are no-ops.
package asan

View File

@@ -0,0 +1,17 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !asan
package asan
import (
"unsafe"
)
const Enabled = false
func Read(addr unsafe.Pointer, len uintptr) {}
func Write(addr unsafe.Pointer, len uintptr) {}

View File

@@ -0,0 +1,778 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package bisect can be used by compilers and other programs
// to serve as a target for the bisect debugging tool.
// See [golang.org/x/tools/cmd/bisect] for details about using the tool.
//
// To be a bisect target, allowing bisect to help determine which of a set of independent
// changes provokes a failure, a program needs to:
//
// 1. Define a way to accept a change pattern on its command line or in its environment.
// The most common mechanism is a command-line flag.
// The pattern can be passed to [New] to create a [Matcher], the compiled form of a pattern.
//
// 2. Assign each change a unique ID. One possibility is to use a sequence number,
// but the most common mechanism is to hash some kind of identifying information
// like the file and line number where the change might be applied.
// [Hash] hashes its arguments to compute an ID.
//
// 3. Enable each change that the pattern says should be enabled.
// The [Matcher.ShouldEnable] method answers this question for a given change ID.
//
// 4. Print a report identifying each change that the pattern says should be printed.
// The [Matcher.ShouldPrint] method answers this question for a given change ID.
// The report consists of one or more lines on standard error or standard output
// that contain a “match marker”. [Marker] returns the match marker for a given ID.
// When bisect reports a change as causing the failure, it identifies the change
// by printing the report lines with the match marker removed.
//
// # Example Usage
//
// A program starts by defining how it receives the pattern. In this example, we will assume a flag.
// The next step is to compile the pattern:
//
// m, err := bisect.New(patternFlag)
// if err != nil {
// log.Fatal(err)
// }
//
// Then, each time a potential change is considered, the program computes
// a change ID by hashing identifying information (source file and line, in this case)
// and then calls m.ShouldPrint and m.ShouldEnable to decide whether to
// print and enable the change, respectively. The two can return different values
// depending on whether bisect is trying to find a minimal set of changes to
// disable or to enable to provoke the failure.
//
// It is usually helpful to write a helper function that accepts the identifying information
// and then takes care of hashing, printing, and reporting whether the identified change
// should be enabled. For example, a helper for changes identified by a file and line number
// would be:
//
// func ShouldEnable(file string, line int) {
// h := bisect.Hash(file, line)
// if m.ShouldPrint(h) {
// fmt.Fprintf(os.Stderr, "%v %s:%d\n", bisect.Marker(h), file, line)
// }
// return m.ShouldEnable(h)
// }
//
// Finally, note that New returns a nil Matcher when there is no pattern,
// meaning that the target is not running under bisect at all,
// so all changes should be enabled and none should be printed.
// In that common case, the computation of the hash can be avoided entirely
// by checking for m == nil first:
//
// func ShouldEnable(file string, line int) bool {
// if m == nil {
// return true
// }
// h := bisect.Hash(file, line)
// if m.ShouldPrint(h) {
// fmt.Fprintf(os.Stderr, "%v %s:%d\n", bisect.Marker(h), file, line)
// }
// return m.ShouldEnable(h)
// }
//
// When the identifying information is expensive to format, this code can call
// [Matcher.MarkerOnly] to find out whether short report lines containing only the
// marker are permitted for a given run. (Bisect permits such lines when it is
// still exploring the space of possible changes and will not be showing the
// output to the user.) If so, the client can choose to print only the marker:
//
// func ShouldEnable(file string, line int) bool {
// if m == nil {
// return true
// }
// h := bisect.Hash(file, line)
// if m.ShouldPrint(h) {
// if m.MarkerOnly() {
// bisect.PrintMarker(os.Stderr, h)
// } else {
// fmt.Fprintf(os.Stderr, "%v %s:%d\n", bisect.Marker(h), file, line)
// }
// }
// return m.ShouldEnable(h)
// }
//
// This specific helper deciding whether to enable a change identified by
// file and line number and printing about the change when necessary is
// provided by the [Matcher.FileLine] method.
//
// Another common usage is deciding whether to make a change in a function
// based on the caller's stack, to identify the specific calling contexts that the
// change breaks. The [Matcher.Stack] method takes care of obtaining the stack,
// printing it when necessary, and reporting whether to enable the change
// based on that stack.
//
// # Pattern Syntax
//
// Patterns are generated by the bisect tool and interpreted by [New].
// Users should not have to understand the patterns except when
// debugging a target's bisect support or debugging the bisect tool itself.
//
// The pattern syntax selecting a change is a sequence of bit strings
// separated by + and - operators. Each bit string denotes the set of
// changes with IDs ending in those bits, + is set addition, - is set subtraction,
// and the expression is evaluated in the usual left-to-right order.
// The special binary number “y” denotes the set of all changes,
// standing in for the empty bit string.
// In the expression, all the + operators must appear before all the - operators.
// A leading + adds to an empty set. A leading - subtracts from the set of all
// possible suffixes.
//
// For example:
//
// - “01+10” and “+01+10” both denote the set of changes
// with IDs ending with the bits 01 or 10.
//
// - “01+10-1001” denotes the set of changes with IDs
// ending with the bits 01 or 10, but excluding those ending in 1001.
//
//   - “-01-1000” and “y-01-1000” both denote the set of all changes
// with IDs not ending in 01 nor 1000.
//
// - “0+1-01+001” is not a valid pattern, because all the + operators do not
// appear before all the - operators.
//
// In the syntaxes described so far, the pattern specifies the changes to
// enable and report. If a pattern is prefixed by a “!”, the meaning
// changes: the pattern specifies the changes to DISABLE and report. This
// mode of operation is needed when a program passes with all changes
// enabled but fails with no changes enabled. In this case, bisect
// searches for minimal sets of changes to disable.
// Put another way, the leading “!” inverts the result from [Matcher.ShouldEnable]
// but does not invert the result from [Matcher.ShouldPrint].
//
// As a convenience for manual debugging, “n” is an alias for “!y”,
// meaning to disable and report all changes.
//
// Finally, a leading “v” in the pattern indicates that the reports will be shown
// to the user of bisect to describe the changes involved in a failure.
// At the API level, the leading “v” causes [Matcher.Visible] to return true.
// See the next section for details.
//
// # Match Reports
//
// The target program must enable only those changes matched
// by the pattern, and it must print a match report for each such change.
// A match report consists of one or more lines of text that will be
// printed by the bisect tool to describe a change implicated in causing
// a failure. Each line in the report for a given change must contain a
// match marker with that change ID, as returned by [Marker].
// The markers are elided when displaying the lines to the user.
//
// A match marker has the form “[bisect-match 0x1234]” where
// 0x1234 is the change ID in hexadecimal.
// An alternate form is “[bisect-match 010101]”, giving the change ID in binary.
//
// When [Matcher.Visible] returns false, the match reports are only
// being processed by bisect to learn the set of enabled changes,
// not shown to the user, meaning that each report can be a match
// marker on a line by itself, eliding the usual textual description.
// When the textual description is expensive to compute,
// checking [Matcher.Visible] can help avoid that expense
// in most runs.
package bisect
import (
"runtime"
"sync"
"sync/atomic"
)
// New creates and returns a new Matcher implementing the given pattern.
// The pattern syntax is defined in the package doc comment.
//
// In addition to the pattern syntax, New("") returns nil, nil.
// The nil *Matcher is valid for use: it returns true from ShouldEnable
// and false from ShouldPrint for all changes. Callers can avoid calling
// [Hash], [Matcher.ShouldEnable], and [Matcher.ShouldPrint] entirely
// when they recognize the nil Matcher.
func New(pattern string) (*Matcher, error) {
	if pattern == "" {
		return nil, nil
	}
	m := new(Matcher)
	p := pattern
	// Special case for leading 'q' so that 'qn' quietly disables, e.g. fmahash=qn to disable fma
	// Any instance of 'v' disables 'q'.
	if len(p) > 0 && p[0] == 'q' {
		m.quiet = true
		p = p[1:]
		if p == "" {
			return nil, &parseError{"invalid pattern syntax: " + pattern}
		}
	}
	// Allow multiple v, so that “bisect cmd vPATTERN” can force verbose all the time.
	for len(p) > 0 && p[0] == 'v' {
		m.verbose = true
		m.quiet = false
		p = p[1:]
		if p == "" {
			return nil, &parseError{"invalid pattern syntax: " + pattern}
		}
	}
	// Allow multiple !, each negating the last, so that “bisect cmd !PATTERN” works
	// even when bisect chooses to add its own !.
	m.enable = true
	for len(p) > 0 && p[0] == '!' {
		m.enable = !m.enable
		p = p[1:]
		if p == "" {
			return nil, &parseError{"invalid pattern syntax: " + pattern}
		}
	}
	if p == "n" {
		// n is an alias for !y.
		m.enable = !m.enable
		p = "y"
	}
	// Parse actual pattern syntax.
	result := true
	bits := uint64(0)
	start := 0
	wid := 1 // 1-bit (binary); sometimes 4-bit (hex)
	for i := 0; i <= len(p); i++ {
		// Imagine a trailing - at the end of the pattern to flush final suffix
		c := byte('-')
		if i < len(p) {
			c = p[i]
		}
		if i == start && wid == 1 && c == 'x' { // leading x for hex
			start = i + 1
			wid = 4
			continue
		}
		switch c {
		default:
			return nil, &parseError{"invalid pattern syntax: " + pattern}
		case '2', '3', '4', '5', '6', '7', '8', '9':
			// Digits above 1 are only valid in hex mode.
			if wid != 4 {
				return nil, &parseError{"invalid pattern syntax: " + pattern}
			}
			fallthrough
		case '0', '1':
			bits <<= wid
			bits |= uint64(c - '0')
		case 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F':
			if wid != 4 {
				return nil, &parseError{"invalid pattern syntax: " + pattern}
			}
			bits <<= 4
			bits |= uint64(c&^0x20 - 'A' + 10)
		case 'y':
			// y stands alone for "all changes"; it cannot be followed
			// by binary digits.
			if i+1 < len(p) && (p[i+1] == '0' || p[i+1] == '1') {
				return nil, &parseError{"invalid pattern syntax: " + pattern}
			}
			bits = 0
		case '+', '-':
			if c == '+' && result == false {
				// Have already seen a -. Should be - from here on.
				return nil, &parseError{"invalid pattern syntax (+ after -): " + pattern}
			}
			if i > 0 {
				// Flush the suffix accumulated since the last operator.
				n := (i - start) * wid
				if n > 64 {
					return nil, &parseError{"pattern bits too long: " + pattern}
				}
				if n <= 0 {
					return nil, &parseError{"invalid pattern syntax: " + pattern}
				}
				if p[start] == 'y' {
					n = 0
				}
				mask := uint64(1)<<n - 1
				m.list = append(m.list, cond{mask, bits, result})
			} else if c == '-' {
				// leading - subtracts from complete set
				m.list = append(m.list, cond{0, 0, true})
			}
			bits = 0
			result = c == '+'
			start = i + 1
			wid = 1
		}
	}
	return m, nil
}
// A Matcher is the parsed, compiled form of a PATTERN string.
// The nil *Matcher is valid: it has all changes enabled but none reported.
type Matcher struct {
	verbose bool // annotate reporting with human-helpful information
	quiet bool // disables all reporting. reset if verbose is true. use case is -d=fmahash=qn
	enable bool // when true, list is for “enable and report” (when false, “disable and report”)
	list []cond // conditions; later ones win over earlier ones
	dedup atomic.Pointer[dedup] // lazily-created set used to dedup stack reports
}

// A cond is a single condition in the matcher.
// Given an input id, if id&mask == bits, return the result.
type cond struct {
	mask uint64
	bits uint64
	result bool
}

// MarkerOnly reports whether it is okay to print only the marker for
// a given change, omitting the identifying information.
// MarkerOnly returns true when bisect is using the printed reports
// only for an intermediate search step, not for showing to users.
func (m *Matcher) MarkerOnly() bool {
	return !m.verbose
}

// ShouldEnable reports whether the change with the given id should be enabled.
// A nil Matcher enables every change.
func (m *Matcher) ShouldEnable(id uint64) bool {
	if m == nil {
		return true
	}
	return m.matchResult(id) == m.enable
}

// ShouldPrint reports whether to print identifying information about the change with the given id.
// A nil or quiet Matcher prints nothing.
func (m *Matcher) ShouldPrint(id uint64) bool {
	if m == nil || m.quiet {
		return false
	}
	return m.matchResult(id)
}
// matchResult returns the result of the condition that decides id,
// where later conditions in the list take precedence over earlier ones.
// If no condition matches, matchResult returns false.
func (m *Matcher) matchResult(id uint64) bool {
	matched := false
	for _, c := range m.list {
		if id&c.mask == c.bits {
			matched = c.result
		}
	}
	return matched
}
// FileLine reports whether the change identified by file and line should be enabled.
// If the change should be printed, FileLine prints a one-line report to w.
// A nil Matcher enables every change and prints nothing.
func (m *Matcher) FileLine(w Writer, file string, line int) bool {
	if m == nil {
		return true
	}
	return m.fileLine(w, file, line)
}

// fileLine does the real work for FileLine.
// This lets FileLine's body handle m == nil and potentially be inlined.
func (m *Matcher) fileLine(w Writer, file string, line int) bool {
	h := Hash(file, line)
	if m.ShouldPrint(h) {
		if m.MarkerOnly() {
			PrintMarker(w, h)
		} else {
			printFileLine(w, h, file, line)
		}
	}
	return m.ShouldEnable(h)
}

// printFileLine prints a non-marker-only report for file:line to w.
func printFileLine(w Writer, h uint64, file string, line int) error {
	const markerLen = 40 // overestimate of the marker's printed length
	b := make([]byte, 0, markerLen+len(file)+24)
	b = AppendMarker(b, h)
	b = appendFileLine(b, file, line)
	b = append(b, '\n')
	_, err := w.Write(b)
	return err
}
// appendFileLine appends file:line to dst, returning the extended slice.
// The line number is formatted by hand to avoid importing strconv.
func appendFileLine(dst []byte, file string, line int) []byte {
	dst = append(dst, file...)
	dst = append(dst, ':')
	u := uint(line)
	if line < 0 {
		dst = append(dst, '-')
		u = -u
	}
	// Fill a scratch buffer from the right with decimal digits;
	// the do-while shape emits "0" for line == 0.
	var digits [24]byte
	pos := len(digits)
	for {
		pos--
		digits[pos] = '0' + byte(u%10)
		u /= 10
		if u == 0 {
			break
		}
	}
	return append(dst, digits[pos:]...)
}
// Stack assigns the current call stack a change ID.
// If the stack should be printed, Stack prints it.
// Then Stack reports whether a change at the current call stack should be enabled.
// A nil Matcher enables every change and prints nothing.
func (m *Matcher) Stack(w Writer) bool {
	if m == nil {
		return true
	}
	return m.stack(w)
}
// stack does the real work for Stack.
// This lets Stack's body handle m == nil and potentially be inlined.
func (m *Matcher) stack(w Writer) bool {
	const maxStack = 16
	var stk [maxStack]uintptr
	n := runtime.Callers(2, stk[:])
	// caller #2 is not for printing; need it to normalize PCs if ASLR.
	if n <= 1 {
		return false
	}
	base := stk[0]
	// normalize PCs
	for i := range stk[:n] {
		stk[i] -= base
	}
	h := Hash(stk[:n])
	if m.ShouldPrint(h) {
		// Lazily create the shared dedup set; CompareAndSwap picks a
		// single winner if several goroutines race to install it.
		var d *dedup
		for {
			d = m.dedup.Load()
			if d != nil {
				break
			}
			d = new(dedup)
			if m.dedup.CompareAndSwap(nil, d) {
				break
			}
		}
		if m.MarkerOnly() {
			if !d.seenLossy(h) {
				PrintMarker(w, h)
			}
		} else {
			if !d.seen(h) {
				// Restore PCs in stack for printing
				for i := range stk[:n] {
					stk[i] += base
				}
				printStack(w, h, stk[1:n])
			}
		}
	}
	return m.ShouldEnable(h)
}
// Writer is the same interface as io.Writer.
// It is duplicated here to avoid importing io.
type Writer interface {
	Write([]byte) (int, error)
}

// PrintMarker prints to w a one-line report containing only the marker for h.
// It is appropriate to use when [Matcher.ShouldPrint] and [Matcher.MarkerOnly] both return true.
func PrintMarker(w Writer, h uint64) error {
	var buf [50]byte // comfortably fits the 33-byte marker plus newline
	b := AppendMarker(buf[:0], h)
	b = append(b, '\n')
	_, err := w.Write(b)
	return err
}
// printStack prints to w a multi-line report containing a formatting of the call stack stk,
// with each line preceded by the marker for h.
// Each frame produces two lines (function, then file:line), and the
// report ends with a marker-only line.
func printStack(w Writer, h uint64, stk []uintptr) error {
	buf := make([]byte, 0, 2048)
	var prefixBuf [100]byte
	prefix := AppendMarker(prefixBuf[:0], h)
	frames := runtime.CallersFrames(stk)
	for {
		f, more := frames.Next()
		buf = append(buf, prefix...)
		buf = append(buf, f.Function...)
		buf = append(buf, "()\n"...)
		buf = append(buf, prefix...)
		buf = append(buf, '\t')
		buf = appendFileLine(buf, f.File, f.Line)
		buf = append(buf, '\n')
		if !more {
			break
		}
	}
	// Trailing marker-only line terminates the report.
	buf = append(buf, prefix...)
	buf = append(buf, '\n')
	_, err := w.Write(buf)
	return err
}
// Marker returns the match marker text to use on any line reporting details
// about a match of the given ID.
// It always returns the hexadecimal format.
func Marker(id uint64) string {
	b := AppendMarker(nil, id)
	return string(b)
}
// AppendMarker is like [Marker] but appends the marker to dst.
// The marker is always "[bisect-match 0x" followed by exactly
// 16 lowercase hex digits and a closing "]".
func AppendMarker(dst []byte, id uint64) []byte {
	const prefix = "[bisect-match 0x"
	dst = append(dst, prefix...)
	// Emit the 16 nibbles most-significant first.
	for shift := 60; shift >= 0; shift -= 4 {
		dst = append(dst, "0123456789abcdef"[(id>>uint(shift))&0xf])
	}
	return append(dst, ']')
}
// CutMarker finds the first match marker in line and removes it,
// returning the shortened line (with the marker removed),
// the ID from the match marker,
// and whether a marker was found at all.
// If there is no marker, CutMarker returns line, 0, false.
func CutMarker(line string) (short string, id uint64, ok bool) {
	const prefix = "[bisect-match "

	// Locate the first occurrence of prefix, scanning byte by byte.
	start := 0
	for {
		if start >= len(line)-len(prefix) {
			return line, 0, false
		}
		if line[start] == '[' && line[start:start+len(prefix)] == prefix {
			break
		}
		start++
	}

	// Find the closing bracket.
	end := start + len(prefix)
	for end < len(line) && line[end] != ']' {
		end++
	}
	if end >= len(line) {
		return line, 0, false
	}

	idstr := line[start+len(prefix) : end]
	if len(idstr) >= 3 && idstr[:2] == "0x" {
		// Hexadecimal form: "0x" plus at most 16 digits.
		// As in the original, unexpected characters contribute zero bits.
		if len(idstr) > 2+16 {
			return line, 0, false
		}
		for _, c := range []byte(idstr[2:]) {
			id <<= 4
			switch {
			case '0' <= c && c <= '9':
				id |= uint64(c - '0')
			case 'a' <= c && c <= 'f':
				id |= uint64(c - 'a' + 10)
			case 'A' <= c && c <= 'F':
				id |= uint64(c - 'A' + 10)
			}
		}
	} else {
		// Binary form: 1 to 64 digits, each '0' or '1'.
		if idstr == "" || len(idstr) > 64 {
			return line, 0, false
		}
		for _, c := range []byte(idstr) {
			if c != '0' && c != '1' {
				return line, 0, false
			}
			id = id<<1 | uint64(c-'0')
		}
	}

	// Construct shortened line.
	// Remove at most one space from around the marker,
	// so that "foo [marker] bar" shortens to "foo bar".
	end++ // skip ]
	if start > 0 && line[start-1] == ' ' {
		start--
	} else if end < len(line) && line[end] == ' ' {
		end++
	}
	return line[:start] + line[end:], id, true
}
// Hash computes a hash of the data arguments,
// each of which must be of type string, byte, int, uint, int32, uint32, int64, uint64, uintptr, or a slice of one of those types.
// The hash is FNV-1a over the bytes of the values; slice arguments are
// hashed element-wise with no length prefix or separator.
func Hash(data ...any) uint64 {
	h := offset64
	for _, v := range data {
		switch v := v.(type) {
		default:
			// Note: Not printing the type, because reflect.ValueOf(v)
			// would make the interfaces prepared by the caller escape
			// and therefore allocate. This way, Hash(file, line) runs
			// without any allocation. It should be clear from the
			// source code calling Hash what the bad argument was.
			panic("bisect.Hash: unexpected argument type")
		case string:
			h = fnvString(h, v)
		case byte:
			h = fnv(h, v)
		case int:
			h = fnvUint64(h, uint64(v))
		case uint:
			h = fnvUint64(h, uint64(v))
		case int32:
			h = fnvUint32(h, uint32(v))
		case uint32:
			h = fnvUint32(h, v)
		case int64:
			h = fnvUint64(h, uint64(v))
		case uint64:
			h = fnvUint64(h, v)
		case uintptr:
			h = fnvUint64(h, uint64(v))
		case []string:
			for _, x := range v {
				h = fnvString(h, x)
			}
		case []byte:
			for _, x := range v {
				h = fnv(h, x)
			}
		case []int:
			for _, x := range v {
				h = fnvUint64(h, uint64(x))
			}
		case []uint:
			for _, x := range v {
				h = fnvUint64(h, uint64(x))
			}
		case []int32:
			for _, x := range v {
				h = fnvUint32(h, uint32(x))
			}
		case []uint32:
			for _, x := range v {
				h = fnvUint32(h, x)
			}
		case []int64:
			for _, x := range v {
				h = fnvUint64(h, uint64(x))
			}
		case []uint64:
			for _, x := range v {
				h = fnvUint64(h, x)
			}
		case []uintptr:
			for _, x := range v {
				h = fnvUint64(h, uint64(x))
			}
		}
	}
	return h
}
// parseError is a trivial error implementation,
// defined here to avoid importing errors.
type parseError struct{ text string }

// Error returns the fixed message the parseError was created with.
func (e *parseError) Error() string { return e.text }
// FNV-1a implementation. See Go's hash/fnv/fnv.go.
// Copied here for simplicity (can handle integers more directly)
// and to avoid importing hash/fnv.
const (
	offset64 uint64 = 14695981039346656037 // FNV-1a 64-bit offset basis
	prime64  uint64 = 1099511628211        // FNV-1a 64-bit prime
)
// fnv performs one FNV-1a step, mixing the byte x into the hash h.
func fnv(h uint64, x byte) uint64 {
	return (h ^ uint64(x)) * prime64
}
// fnvString mixes each byte of x into the hash h using FNV-1a.
func fnvString(h uint64, x string) uint64 {
	for _, b := range []byte(x) {
		h = (h ^ uint64(b)) * prime64
	}
	return h
}
// fnvUint64 mixes the eight bytes of x, least significant first,
// into the hash h using FNV-1a.
func fnvUint64(h uint64, x uint64) uint64 {
	for shift := 0; shift < 64; shift += 8 {
		h = (h ^ ((x >> uint(shift)) & 0xFF)) * prime64
	}
	return h
}
// fnvUint32 mixes the four bytes of x, least significant first,
// into the hash h using FNV-1a.
func fnvUint32(h uint64, x uint32) uint64 {
	for shift := 0; shift < 32; shift += 8 {
		h = (h ^ uint64((x>>uint(shift))&0xFF)) * prime64
	}
	return h
}
// A dedup is a deduplicator for call stacks, so that we only print
// a report for new call stacks, not for call stacks we've already
// reported.
//
// It has two modes: an approximate but lock-free mode that
// may still emit some duplicates, and a precise mode that uses
// a lock and never emits duplicates.
type dedup struct {
	// recent is a 128-entry, 4-way set-associative, lossy cache
	// of recently seen hashes, used by seenLossy.
	recent [128][4]uint64

	// m is the complete history of seen hashes, used by seen
	// and guarded by mu.
	mu sync.Mutex
	m  map[uint64]bool
}
// seen records that h has now been seen and reports whether it was seen before.
// When seen returns false, the caller is expected to print a report for h.
func (d *dedup) seen(h uint64) bool {
	d.mu.Lock()
	defer d.mu.Unlock()
	// Lazily allocate the history map on first use.
	if d.m == nil {
		d.m = make(map[uint64]bool)
	}
	prev := d.m[h]
	d.m[h] = true
	return prev
}
// seenLossy is a variant of seen that avoids a lock by using a cache of recently seen hashes.
// Each cache entry is N-way set-associative: h can appear in any of the slots.
// If h does not appear in any of them, then it is inserted into a random slot,
// overwriting whatever was there before.
func (d *dedup) seenLossy(h uint64) bool {
	cache := &d.recent[uint(h)%uint(len(d.recent))]
	for i := 0; i < len(cache); i++ {
		if atomic.LoadUint64(&cache[i]) == h {
			return true
		}
	}

	// Compute index in set to evict as hash of current set.
	// Deterministic but effectively random, so no separate
	// randomness source is needed.
	ch := offset64
	for _, x := range cache {
		ch = fnvUint64(ch, x)
	}
	atomic.StoreUint64(&cache[uint(ch)%uint(len(cache))], h)
	return false
}

View File

@@ -0,0 +1,414 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package buildcfg provides access to the build configuration
// described by the current environment. It is for use by build tools
// such as cmd/go or cmd/compile and for setting up go/build's Default context.
//
// Note that it does NOT provide access to the build configuration used to
// build the currently-running binary. For that, use runtime.GOOS etc
// as well as internal/goexperiment.
package buildcfg
import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
)
// The build configuration, read once from the environment.
// Each setting falls back to the corresponding compiled-in default*
// value when the environment variable is unset.
var (
	GOROOT    = os.Getenv("GOROOT") // cached for efficiency
	GOARCH    = envOr("GOARCH", defaultGOARCH)
	GOOS      = envOr("GOOS", defaultGOOS)
	GO386     = envOr("GO386", defaultGO386)
	GOAMD64   = goamd64()
	GOARM     = goarm()
	GOARM64   = goarm64()
	GOMIPS    = gomips()
	GOMIPS64  = gomips64()
	GOPPC64   = goppc64()
	GORISCV64 = goriscv64()
	GOWASM    = gowasm()
	ToolTags  = toolTags()
	GO_LDSO   = defaultGO_LDSO
	Version   = version
)
// Error is one of the errors found (if any) in the build configuration.
// The parsing functions in this package record problems here instead of
// returning them.
var Error error
// Check exits the program with a fatal error if Error is non-nil.
func Check() {
	if Error == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "%s: %v\n", filepath.Base(os.Args[0]), Error)
	os.Exit(2)
}
// envOr returns the environment variable key if it is set and
// non-empty, and the fallback value otherwise.
func envOr(key, value string) string {
	v := os.Getenv(key)
	if v == "" {
		return value
	}
	return v
}
// goamd64 parses $GOAMD64 (default defaultGOAMD64) into its
// microarchitecture level, 1 through 4. On an invalid setting it
// records the problem in Error and returns the compiled-in default.
func goamd64() int {
	v := envOr("GOAMD64", defaultGOAMD64)
	if len(v) == 2 && v[0] == 'v' && '1' <= v[1] && v[1] <= '4' {
		return int(v[1] - '0')
	}
	Error = fmt.Errorf("invalid GOAMD64: must be v1, v2, v3, v4")
	return int(defaultGOAMD64[len("v")] - '0')
}
// goarmFeatures is the parsed form of the GOARM setting:
// an architecture version plus the floating-point mode.
type goarmFeatures struct {
	Version   int  // 5, 6, or 7
	SoftFloat bool // true for software floating point
}
// String formats the setting as "<version>,softfloat" or
// "<version>,hardfloat".
func (g goarmFeatures) String() string {
	s := strconv.Itoa(g.Version)
	if g.SoftFloat {
		return s + ",softfloat"
	}
	return s + ",hardfloat"
}
// goarm parses $GOARM (default defaultGOARM, or "7" on android/arm)
// into a version plus float mode. An optional ",softfloat" or
// ",hardfloat" suffix selects the float mode explicitly; otherwise
// version 5 defaults to softfloat and 6/7 to hardfloat.
// Invalid settings are recorded in Error.
func goarm() (g goarmFeatures) {
	const (
		softFloatOpt = ",softfloat"
		hardFloatOpt = ",hardfloat"
	)
	def := defaultGOARM
	if GOOS == "android" && GOARCH == "arm" {
		// Android arm devices always support GOARM=7.
		def = "7"
	}
	v := envOr("GOARM", def)

	// Strip at most one float-mode suffix, remembering that it was given.
	floatSpecified := false
	if strings.HasSuffix(v, softFloatOpt) {
		g.SoftFloat = true
		floatSpecified = true
		v = v[:len(v)-len(softFloatOpt)]
	}
	if strings.HasSuffix(v, hardFloatOpt) {
		floatSpecified = true
		v = v[:len(v)-len(hardFloatOpt)]
	}

	switch v {
	case "5":
		g.Version = 5
	case "6":
		g.Version = 6
	case "7":
		g.Version = 7
	default:
		Error = fmt.Errorf("invalid GOARM: must start with 5, 6, or 7, and may optionally end in either %q or %q", hardFloatOpt, softFloatOpt)
		// Fall back to the default's leading digit.
		g.Version = int(def[0] - '0')
	}

	// 5 defaults to softfloat. 6 and 7 default to hardfloat.
	if !floatSpecified && g.Version == 5 {
		g.SoftFloat = true
	}
	return
}
// Goarm64Features is the parsed form of the GOARM64 setting:
// an architecture version string plus optional extensions.
type Goarm64Features struct {
	// Version is the architecture version, "v8.0" through "v9.5".
	Version string
	// LSE is the Large Systems Extension.
	LSE bool
	// Crypto is the ARM v8.0 Cryptographic Extension. It includes the following features:
	// * FEAT_AES, which includes the AESD and AESE instructions.
	// * FEAT_PMULL, which includes the PMULL, PMULL2 instructions.
	// * FEAT_SHA1, which includes the SHA1* instructions.
	// * FEAT_SHA256, which includes the SHA256* instructions.
	Crypto bool
}
// String formats the setting as the version followed by ",lse"
// and/or ",crypto" for any enabled extensions.
func (g Goarm64Features) String() string {
	s := g.Version
	if g.LSE {
		s += ",lse"
	}
	if g.Crypto {
		s += ",crypto"
	}
	return s
}
// ParseGoarm64 parses a GOARM64 value of the form
// "v<major>.<minor>" with optional ",lse" and/or ",crypto"
// suffixes in any order. Versions v8.1 and later imply LSE.
// On failure it returns a non-nil error and the default version.
func ParseGoarm64(v string) (g Goarm64Features, e error) {
	const (
		lseOpt    = ",lse"
		cryptoOpt = ",crypto"
	)

	g.LSE = false
	g.Crypto = false
	// We allow any combination of suffixes, in any order
	for {
		if strings.HasSuffix(v, lseOpt) {
			g.LSE = true
			v = v[:len(v)-len(lseOpt)]
			continue
		}

		if strings.HasSuffix(v, cryptoOpt) {
			g.Crypto = true
			v = v[:len(v)-len(cryptoOpt)]
			continue
		}

		break
	}

	switch v {
	case "v8.0":
		g.Version = v
	case "v8.1", "v8.2", "v8.3", "v8.4", "v8.5", "v8.6", "v8.7", "v8.8", "v8.9",
		"v9.0", "v9.1", "v9.2", "v9.3", "v9.4", "v9.5":
		g.Version = v

		// LSE extension is mandatory starting from 8.1
		g.LSE = true
	default:
		e = fmt.Errorf("invalid GOARM64: must start with v8.{0-9} or v9.{0-5} and may optionally end in %q and/or %q",
			lseOpt, cryptoOpt)
		g.Version = defaultGOARM64
	}

	return
}
// goarm64 parses $GOARM64 (default defaultGOARM64), recording any
// parse failure in the package-level Error.
func goarm64() (g Goarm64Features) {
	g, Error = ParseGoarm64(envOr("GOARM64", defaultGOARM64))
	return
}
// Supports reports whether g's configured version includes the given
// ARM64 ISA version s.
// Note that this function doesn't accept / test suffixes (like ",lse" or ",crypto"):
// s must be exactly "v{8-9}.{0-9}"; everything else is malformed.
func (g Goarm64Features) Supports(s string) bool {
	if len(s) != 4 || s[0] != 'v' || s[2] != '.' {
		return false
	}
	major, minor := s[1], s[3]
	if major < '8' || major > '9' || minor < '0' || minor > '9' {
		return false
	}

	gmajor, gminor := g.Version[1], g.Version[3]
	switch {
	case major == gmajor:
		return minor <= gminor
	case gmajor == '9':
		// v9.0 diverged from v8.5. This means we should compare with
		// gminor increased by five.
		return minor <= gminor+5
	default:
		return false
	}
}
// gomips parses $GOMIPS (default defaultGOMIPS). Only "hardfloat"
// and "softfloat" are accepted; anything else records a problem in
// Error and returns the compiled-in default.
func gomips() string {
	v := envOr("GOMIPS", defaultGOMIPS)
	if v == "hardfloat" || v == "softfloat" {
		return v
	}
	Error = fmt.Errorf("invalid GOMIPS: must be hardfloat, softfloat")
	return defaultGOMIPS
}
// gomips64 parses $GOMIPS64 (default defaultGOMIPS64). Only
// "hardfloat" and "softfloat" are accepted; anything else records a
// problem in Error and returns the compiled-in default.
func gomips64() string {
	v := envOr("GOMIPS64", defaultGOMIPS64)
	if v == "hardfloat" || v == "softfloat" {
		return v
	}
	Error = fmt.Errorf("invalid GOMIPS64: must be hardfloat, softfloat")
	return defaultGOMIPS64
}
// goppc64 parses $GOPPC64 (default defaultGOPPC64) into a power
// level: 8, 9, or 10. Invalid settings are recorded in Error and
// the compiled-in default is returned.
func goppc64() int {
	v := envOr("GOPPC64", defaultGOPPC64)
	for n := 8; n <= 10; n++ {
		if v == fmt.Sprintf("power%d", n) {
			return n
		}
	}
	Error = fmt.Errorf("invalid GOPPC64: must be power8, power9, power10")
	return int(defaultGOPPC64[len("power")] - '0')
}
// goriscv64 parses $GORISCV64 (default defaultGORISCV64) into its
// profile year: 20 for rva20u64, 22 for rva22u64.
// Invalid settings are recorded in Error and the compiled-in
// default's year is returned.
func goriscv64() int {
	switch v := envOr("GORISCV64", defaultGORISCV64); v {
	case "rva20u64":
		return 20
	case "rva22u64":
		return 22
	}
	Error = fmt.Errorf("invalid GORISCV64: must be rva20u64, rva22u64")
	// Extract the year from the default, i.e. the digits that follow
	// "rva" (e.g. "rva22u64" -> 22).
	v := defaultGORISCV64[len("rva"):]
	i := strings.IndexFunc(v, func(r rune) bool {
		return r < '0' || r > '9'
	})
	year, _ := strconv.Atoi(v[:i])
	return year
}
// gowasmFeatures is the parsed form of the GOWASM feature list.
type gowasmFeatures struct {
	SatConv bool // saturating float-to-int conversions
	SignExt bool // sign-extension operators
}
// String formats the enabled features ("satconv", "signext") as a
// comma-separated list, empty when none are enabled.
func (f gowasmFeatures) String() string {
	s := ""
	if f.SatConv {
		s = "satconv"
	}
	if f.SignExt {
		if s != "" {
			s += ","
		}
		s += "signext"
	}
	return s
}
// gowasm parses the comma-separated $GOWASM feature list.
// Recognized features are "satconv" and "signext"; empty entries are
// ignored, and unknown names are recorded in Error.
func gowasm() (f gowasmFeatures) {
	for _, opt := range strings.Split(envOr("GOWASM", ""), ",") {
		switch opt {
		case "satconv":
			f.SatConv = true
		case "signext":
			f.SignExt = true
		case "":
			// ignore
		default:
			Error = fmt.Errorf("invalid GOWASM: no such feature %q", opt)
		}
	}
	return
}
// Getgoextlinkenabled returns the $GO_EXTLINK_ENABLED setting,
// falling back to the compiled-in default.
func Getgoextlinkenabled() string {
	return envOr("GO_EXTLINK_ENABLED", defaultGO_EXTLINK_ENABLED)
}
// toolTags returns the implicit build tags set by the toolchain
// configuration: one tag per enabled experiment followed by the
// GOARCH feature tags.
func toolTags() []string {
	return append(experimentTags(), gogoarchTags()...)
}
// experimentTags returns one "goexperiment.<name>" build tag for
// each experiment enabled in Experiment.
func experimentTags() []string {
	var list []string
	// For each experiment that has been enabled in the toolchain, define a
	// build tag with the same name but prefixed by "goexperiment." which can be
	// used for compiling alternative files for the experiment. This allows
	// changes for the experiment, like extra struct fields in the runtime,
	// without affecting the base non-experiment code at all.
	for _, exp := range Experiment.Enabled() {
		list = append(list, "goexperiment."+exp)
	}
	return list
}
// GOGOARCH returns the name and value of the GO$GOARCH setting.
// For example, if GOARCH is "amd64" it might return "GOAMD64", "v2".
// For architectures with no sub-architecture setting it returns "", "".
func GOGOARCH() (name, value string) {
	switch GOARCH {
	case "386":
		return "GO386", GO386
	case "amd64":
		return "GOAMD64", fmt.Sprintf("v%d", GOAMD64)
	case "arm":
		return "GOARM", GOARM.String()
	case "arm64":
		return "GOARM64", GOARM64.String()
	case "mips", "mipsle":
		return "GOMIPS", GOMIPS
	case "mips64", "mips64le":
		return "GOMIPS64", GOMIPS64
	case "ppc64", "ppc64le":
		return "GOPPC64", fmt.Sprintf("power%d", GOPPC64)
	case "wasm":
		return "GOWASM", GOWASM.String()
	}
	return "", ""
}
// gogoarchTags returns the architecture feature build tags implied by
// the GO$GOARCH setting, e.g. "amd64.v1", "amd64.v2" for GOAMD64=v2.
// For leveled settings every level up to and including the configured
// one gets a tag, so files can select "at least level N".
func gogoarchTags() []string {
	switch GOARCH {
	case "386":
		return []string{GOARCH + "." + GO386}
	case "amd64":
		var list []string
		for i := 1; i <= GOAMD64; i++ {
			list = append(list, fmt.Sprintf("%s.v%d", GOARCH, i))
		}
		return list
	case "arm":
		var list []string
		for i := 5; i <= GOARM.Version; i++ {
			list = append(list, fmt.Sprintf("%s.%d", GOARCH, i))
		}
		return list
	case "arm64":
		var list []string
		major := int(GOARM64.Version[1] - '0')
		minor := int(GOARM64.Version[3] - '0')
		// One tag per minor version up to the configured one.
		for i := 0; i <= minor; i++ {
			list = append(list, fmt.Sprintf("%s.v%d.%d", GOARCH, major, i))
		}
		// ARM64 v9.x also includes support of v8.x+5 (i.e. v9.1 includes v8.(1+5) = v8.6).
		if major == 9 {
			for i := 0; i <= minor+5 && i <= 9; i++ {
				list = append(list, fmt.Sprintf("%s.v%d.%d", GOARCH, 8, i))
			}
		}
		return list
	case "mips", "mipsle":
		return []string{GOARCH + "." + GOMIPS}
	case "mips64", "mips64le":
		return []string{GOARCH + "." + GOMIPS64}
	case "ppc64", "ppc64le":
		var list []string
		for i := 8; i <= GOPPC64; i++ {
			list = append(list, fmt.Sprintf("%s.power%d", GOARCH, i))
		}
		return list
	case "riscv64":
		// rva20u64 is the baseline; rva22u64 additionally implies it.
		list := []string{GOARCH + "." + "rva20u64"}
		if GORISCV64 >= 22 {
			list = append(list, GOARCH+"."+"rva22u64")
		}
		return list
	case "wasm":
		var list []string
		if GOWASM.SatConv {
			list = append(list, GOARCH+".satconv")
		}
		if GOWASM.SignExt {
			list = append(list, GOARCH+".signext")
		}
		return list
	}
	return nil
}

View File

@@ -0,0 +1,125 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package buildcfg
import (
"os"
"testing"
)
// TestConfigFlags exercises the environment-variable parsers
// (goamd64, goriscv64, goarm64), checking both accepted values and
// that invalid values set the package-level Error.
// Error must be reset to nil before each negative case because the
// parsers only ever write it, never clear it.
func TestConfigFlags(t *testing.T) {
	os.Setenv("GOAMD64", "v1")
	if goamd64() != 1 {
		t.Errorf("Wrong parsing of GOAMD64=v1")
	}
	os.Setenv("GOAMD64", "v4")
	if goamd64() != 4 {
		t.Errorf("Wrong parsing of GOAMD64=v4")
	}
	Error = nil
	os.Setenv("GOAMD64", "1")
	if goamd64(); Error == nil {
		t.Errorf("Wrong parsing of GOAMD64=1")
	}

	os.Setenv("GORISCV64", "rva20u64")
	if goriscv64() != 20 {
		t.Errorf("Wrong parsing of RISCV64=rva20u64")
	}
	os.Setenv("GORISCV64", "rva22u64")
	if goriscv64() != 22 {
		t.Errorf("Wrong parsing of RISCV64=rva22u64")
	}
	Error = nil
	os.Setenv("GORISCV64", "rva22")
	if _ = goriscv64(); Error == nil {
		t.Errorf("Wrong parsing of RISCV64=rva22")
	}

	Error = nil
	os.Setenv("GOARM64", "v7.0")
	if _ = goarm64(); Error == nil {
		t.Errorf("Wrong parsing of GOARM64=7.0")
	}
	Error = nil
	os.Setenv("GOARM64", "8.0")
	if _ = goarm64(); Error == nil {
		t.Errorf("Wrong parsing of GOARM64=8.0")
	}
	Error = nil
	os.Setenv("GOARM64", "v8.0,lsb")
	if _ = goarm64(); Error == nil {
		t.Errorf("Wrong parsing of GOARM64=v8.0,lsb")
	}
	os.Setenv("GOARM64", "v8.0,lse")
	if goarm64().Version != "v8.0" || goarm64().LSE != true || goarm64().Crypto != false {
		t.Errorf("Wrong parsing of GOARM64=v8.0,lse")
	}
	os.Setenv("GOARM64", "v8.0,crypto")
	if goarm64().Version != "v8.0" || goarm64().LSE != false || goarm64().Crypto != true {
		t.Errorf("Wrong parsing of GOARM64=v8.0,crypto")
	}
	os.Setenv("GOARM64", "v8.0,crypto,lse")
	if goarm64().Version != "v8.0" || goarm64().LSE != true || goarm64().Crypto != true {
		t.Errorf("Wrong parsing of GOARM64=v8.0,crypto,lse")
	}
	os.Setenv("GOARM64", "v8.0,lse,crypto")
	if goarm64().Version != "v8.0" || goarm64().LSE != true || goarm64().Crypto != true {
		t.Errorf("Wrong parsing of GOARM64=v8.0,lse,crypto")
	}
	// v9.0 and later imply LSE even without the suffix.
	os.Setenv("GOARM64", "v9.0")
	if goarm64().Version != "v9.0" || goarm64().LSE != true || goarm64().Crypto != false {
		t.Errorf("Wrong parsing of GOARM64=v9.0")
	}
}
// TestGoarm64FeaturesSupports checks Goarm64Features.Supports against
// a v9.3 configuration, including the v9.x -> v8.(x+5) equivalence
// and rejection of malformed inputs.
func TestGoarm64FeaturesSupports(t *testing.T) {
	g, _ := ParseGoarm64("v9.3")

	if !g.Supports("v9.3") {
		t.Errorf("Wrong goarm64Features.Supports for v9.3, v9.3")
	}

	if g.Supports("v9.4") {
		t.Errorf("Wrong goarm64Features.Supports for v9.3, v9.4")
	}

	// v9.3 includes v8.8 (= v8.(3+5)) but not v8.9.
	if !g.Supports("v8.8") {
		t.Errorf("Wrong goarm64Features.Supports for v9.3, v8.8")
	}

	if g.Supports("v8.9") {
		t.Errorf("Wrong goarm64Features.Supports for v9.3, v8.9")
	}

	if g.Supports(",lse") {
		t.Errorf("Wrong goarm64Features.Supports for v9.3, ,lse")
	}
}
// TestGogoarchTags checks the arm64 feature tags generated for
// GOARM64=v9.5: all v9 minor levels plus the implied v8 levels.
// The package-level GOARCH/GOARM64 are swapped in and restored so the
// test does not disturb other tests.
func TestGogoarchTags(t *testing.T) {
	old_goarch := GOARCH
	old_goarm64 := GOARM64

	GOARCH = "arm64"

	os.Setenv("GOARM64", "v9.5")
	GOARM64 = goarm64()
	tags := gogoarchTags()
	want := []string{"arm64.v9.0", "arm64.v9.1", "arm64.v9.2", "arm64.v9.3", "arm64.v9.4", "arm64.v9.5",
		"arm64.v8.0", "arm64.v8.1", "arm64.v8.2", "arm64.v8.3", "arm64.v8.4", "arm64.v8.5", "arm64.v8.6", "arm64.v8.7", "arm64.v8.8", "arm64.v8.9"}
	if len(tags) != len(want) {
		t.Errorf("Wrong number of tags for GOARM64=v9.5")
	} else {
		for i, v := range tags {
			if v != want[i] {
				t.Error("Wrong tags for GOARM64=v9.5")
				break
			}
		}
	}

	GOARCH = old_goarch
	GOARM64 = old_goarm64
}

View File

@@ -0,0 +1,190 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package buildcfg
import (
"fmt"
"reflect"
"strings"
"internal/goexperiment"
)
// ExperimentFlags represents a set of GOEXPERIMENT flags relative to a baseline
// (platform-default) experiment configuration.
type ExperimentFlags struct {
	goexperiment.Flags
	// baseline is the platform-default configuration; flags equal to
	// their baseline value are elided from String output.
	baseline goexperiment.Flags
}
// Experiment contains the toolchain experiments enabled for the
// current build.
//
// (This is not necessarily the set of experiments the compiler itself
// was built with.)
//
// experimentBaseline specifies the experiment flags that are enabled by
// default in the current toolchain. This is, in effect, the "control"
// configuration and any variation from this is an experiment.
var Experiment ExperimentFlags = func() ExperimentFlags {
	flags, err := ParseGOEXPERIMENT(GOOS, GOARCH, envOr("GOEXPERIMENT", defaultGOEXPERIMENT))
	if err != nil {
		// Record the problem in Error and fall back to an empty set.
		Error = err
		return ExperimentFlags{}
	}
	return *flags
}()
// DefaultGOEXPERIMENT is the embedded default GOEXPERIMENT string.
// It is not guaranteed to be canonical.
const DefaultGOEXPERIMENT = defaultGOEXPERIMENT

// FramePointerEnabled enables the use of platform conventions for
// saving frame pointers.
//
// This used to be an experiment, but now it's always enabled on
// platforms that support it.
//
// Note: must agree with runtime.framepointer_enabled.
var FramePointerEnabled = GOARCH == "amd64" || GOARCH == "arm64"
// ParseGOEXPERIMENT parses a (GOOS, GOARCH, GOEXPERIMENT)
// configuration tuple and returns the enabled and baseline experiment
// flag sets.
//
// The GOEXPERIMENT string is a comma-separated list of experiment
// names, each optionally prefixed by "no" to disable it; "none"
// clears every flag.
//
// TODO(mdempsky): Move to internal/goexperiment.
func ParseGOEXPERIMENT(goos, goarch, goexp string) (*ExperimentFlags, error) {
	// regabiSupported is set to true on platforms where register ABI is
	// supported and enabled by default.
	// regabiAlwaysOn is set to true on platforms where register ABI is
	// always on.
	var regabiSupported, regabiAlwaysOn bool
	switch goarch {
	case "amd64", "arm64", "loong64", "ppc64le", "ppc64", "riscv64":
		regabiAlwaysOn = true
		regabiSupported = true
	}

	baseline := goexperiment.Flags{
		RegabiWrappers:   regabiSupported,
		RegabiArgs:       regabiSupported,
		CoverageRedesign: true,
	}

	// Start with the statically enabled set of experiments.
	flags := &ExperimentFlags{
		Flags:    baseline,
		baseline: baseline,
	}

	// Pick up any changes to the baseline configuration from the
	// GOEXPERIMENT environment. This can be set at make.bash time
	// and overridden at build time.
	if goexp != "" {
		// Create a map of known experiment names, using reflection over
		// the goexperiment.Flags struct fields (lower-cased).
		names := make(map[string]func(bool))
		rv := reflect.ValueOf(&flags.Flags).Elem()
		rt := rv.Type()
		for i := 0; i < rt.NumField(); i++ {
			field := rv.Field(i)
			names[strings.ToLower(rt.Field(i).Name)] = field.SetBool
		}

		// "regabi" is an alias for all working regabi
		// subexperiments, and not an experiment itself. Doing
		// this as an alias make both "regabi" and "noregabi"
		// do the right thing.
		names["regabi"] = func(v bool) {
			flags.RegabiWrappers = v
			flags.RegabiArgs = v
		}

		// Parse names.
		for _, f := range strings.Split(goexp, ",") {
			if f == "" {
				continue
			}
			if f == "none" {
				// GOEXPERIMENT=none disables all experiment flags.
				// This is used by cmd/dist, which doesn't know how
				// to build with any experiment flags.
				flags.Flags = goexperiment.Flags{}
				continue
			}
			val := true
			if strings.HasPrefix(f, "no") {
				f, val = f[2:], false
			}
			set, ok := names[f]
			if !ok {
				return nil, fmt.Errorf("unknown GOEXPERIMENT %s", f)
			}
			set(val)
		}
	}

	if regabiAlwaysOn {
		flags.RegabiWrappers = true
		flags.RegabiArgs = true
	}
	// regabi is only supported on amd64, arm64, loong64, riscv64, ppc64 and ppc64le.
	if !regabiSupported {
		flags.RegabiWrappers = false
		flags.RegabiArgs = false
	}

	// Check regabi dependencies.
	if flags.RegabiArgs && !flags.RegabiWrappers {
		return nil, fmt.Errorf("GOEXPERIMENT regabiargs requires regabiwrappers")
	}
	return flags, nil
}
// String returns the canonical GOEXPERIMENT string to enable this experiment
// configuration. (Experiments in the same state as in the baseline are elided.)
func (exp *ExperimentFlags) String() string {
	return strings.Join(expList(&exp.Flags, &exp.baseline, false), ",")
}
// expList returns the list of lower-cased experiment names for
// experiments that differ from base. base may be nil to indicate no
// experiments. If all is true, then include all experiment flags,
// regardless of base.
// Disabled experiments are reported with a "no" prefix.
func expList(exp, base *goexperiment.Flags, all bool) []string {
	var list []string
	rv := reflect.ValueOf(exp).Elem()
	var rBase reflect.Value
	if base != nil {
		rBase = reflect.ValueOf(base).Elem()
	}
	rt := rv.Type()
	for i := 0; i < rt.NumField(); i++ {
		name := strings.ToLower(rt.Field(i).Name)
		val := rv.Field(i).Bool()
		baseVal := false
		if base != nil {
			baseVal = rBase.Field(i).Bool()
		}
		if all || val != baseVal {
			if val {
				list = append(list, name)
			} else {
				list = append(list, "no"+name)
			}
		}
	}
	return list
}
// Enabled returns a list of enabled experiments, as
// lower-cased experiment names.
func (exp *ExperimentFlags) Enabled() []string {
	return expList(&exp.Flags, nil, false)
}

// All returns a list of all experiment settings.
// Disabled experiments appear in the list prefixed by "no".
func (exp *ExperimentFlags) All() []string {
	return expList(&exp.Flags, nil, true)
}

View File

@@ -0,0 +1,118 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
import (
"internal/cpu"
"unsafe"
)
// Offsets into internal/cpu records for use in assembly.
// The assembly files reference these as const_<name> via go_asm.h.
const (
	offsetX86HasSSE42  = unsafe.Offsetof(cpu.X86.HasSSE42)
	offsetX86HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
	offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)

	offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)

	offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
)
// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
// If MaxLen is not 0, make sure MaxLen >= 4.
var MaxLen int

// PrimeRK is the prime base used in Rabin-Karp algorithm.
const PrimeRK = 16777619
// HashStr returns the hash and the appropriate multiplicative
// factor for use in Rabin-Karp algorithm.
// The factor is PrimeRK**len(sep) (mod 2**32), used to remove the
// outgoing byte from a rolling hash.
func HashStr[T string | []byte](sep T) (uint32, uint32) {
	var hash uint32
	for j := 0; j < len(sep); j++ {
		hash = hash*PrimeRK + uint32(sep[j])
	}
	// Compute PrimeRK**len(sep) by binary exponentiation.
	pow, sq := uint32(1), uint32(PrimeRK)
	for bits := len(sep); bits > 0; bits >>= 1 {
		if bits&1 != 0 {
			pow *= sq
		}
		sq *= sq
	}
	return hash, pow
}
// HashStrRev returns the hash of the reverse of sep and the
// appropriate multiplicative factor for use in Rabin-Karp algorithm.
// The factor is PrimeRK**len(sep) (mod 2**32).
func HashStrRev[T string | []byte](sep T) (uint32, uint32) {
	var hash uint32
	for j := len(sep) - 1; j >= 0; j-- {
		hash = hash*PrimeRK + uint32(sep[j])
	}
	// Compute PrimeRK**len(sep) by binary exponentiation.
	pow, sq := uint32(1), uint32(PrimeRK)
	for bits := len(sep); bits > 0; bits >>= 1 {
		if bits&1 != 0 {
			pow *= sq
		}
		sq *= sq
	}
	return hash, pow
}
// IndexRabinKarp uses the Rabin-Karp search algorithm to return the index of the
// first occurrence of sep in s, or -1 if not present.
// Callers must ensure len(sep) <= len(s).
func IndexRabinKarp[T string | []byte](s, sep T) int {
	// Rabin-Karp search
	hashss, pow := HashStr(sep)
	n := len(sep)
	// Hash of the first n bytes of s.
	var h uint32
	for i := 0; i < n; i++ {
		h = h*PrimeRK + uint32(s[i])
	}
	if h == hashss && string(s[:n]) == string(sep) {
		return 0
	}
	// Roll the hash forward one byte at a time: mix in the incoming
	// byte and remove the outgoing one (scaled by pow).
	for i := n; i < len(s); {
		h *= PrimeRK
		h += uint32(s[i])
		h -= pow * uint32(s[i-n])
		i++
		// Hash match is only a candidate; confirm with a direct compare.
		if h == hashss && string(s[i-n:i]) == string(sep) {
			return i - n
		}
	}
	return -1
}
// LastIndexRabinKarp uses the Rabin-Karp search algorithm to return the last index of the
// occurrence of sep in s, or -1 if not present.
// Callers must ensure len(sep) <= len(s).
func LastIndexRabinKarp[T string | []byte](s, sep T) int {
	// Rabin-Karp search from the end of the string
	hashss, pow := HashStrRev(sep)
	n := len(sep)
	last := len(s) - n
	// Hash of the final n bytes of s, accumulated right to left.
	var h uint32
	for i := len(s) - 1; i >= last; i-- {
		h = h*PrimeRK + uint32(s[i])
	}
	if h == hashss && string(s[last:]) == string(sep) {
		return last
	}
	// Roll the hash backward one byte at a time.
	for i := last - 1; i >= 0; i-- {
		h *= PrimeRK
		h += uint32(s[i])
		h -= pow * uint32(s[i+n])
		// Hash match is only a candidate; confirm with a direct compare.
		if h == hashss && string(s[i:i+n]) == string(sep) {
			return i
		}
	}
	return -1
}
// MakeNoZero makes a slice of length n and capacity of at least n Bytes
// without zeroing the bytes (including the bytes between len and cap).
// It is the caller's responsibility to ensure uninitialized bytes
// do not leak to the end user.
// It has no Go body here; the implementation is provided outside this file.
func MakeNoZero(n int) []byte

View File

@@ -0,0 +1,144 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Compare(a, b []byte) int
// Loads both slices' base and length, points AX at the result word,
// and tail-jumps to the shared comparison body.
TEXT ·Compare(SB),NOSPLIT,$0-28
	MOVL	a_base+0(FP), SI
	MOVL	a_len+4(FP), BX
	MOVL	b_base+12(FP), DI
	MOVL	b_len+16(FP), DX
	LEAL	ret+24(FP), AX
	JMP	cmpbody<>(SB)
// runtime·cmpstring(a, b string) int
// Same as Compare but for strings, which have no cap word,
// so b starts at offset 8 instead of 12.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
	MOVL	a_base+0(FP), SI
	MOVL	a_len+4(FP), BX
	MOVL	b_base+8(FP), DI
	MOVL	b_len+12(FP), DX
	LEAL	ret+16(FP), AX
	JMP	cmpbody<>(SB)
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   AX = address of return word (set to 1/0/-1)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	MOVL	DX, BP
	SUBL	BX, DX // DX = blen-alen
	JLE	2(PC)
	MOVL	BX, BP // BP = min(alen, blen)
	CMPL	SI, DI
	JEQ	allsame
	CMPL	BP, $4
	JB	small
#ifdef GO386_softfloat
	// SSE is unavailable in softfloat mode; use the integer loop only.
	JMP	mediumloop
#endif
largeloop:
	// Compare 16 bytes per iteration with SSE2.
	CMPL	BP, $16
	JB	mediumloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, BX
	XORL	$0xffff, BX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDL	$16, SI
	ADDL	$16, DI
	SUBL	$16, BP
	JMP	largeloop

diff16:
	BSFL	BX, BX	// index of first byte that differs
	XORL	DX, DX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	DX
	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
	MOVL	DX, (AX)
	RET

mediumloop:
	// Compare 4 bytes per iteration using integer registers.
	CMPL	BP, $4
	JBE	_0through4
	MOVL	(SI), BX
	MOVL	(DI), CX
	CMPL	BX, CX
	JNE	diff4
	ADDL	$4, SI
	ADDL	$4, DI
	SUBL	$4, BP
	JMP	mediumloop

_0through4:
	// Load the final (possibly overlapping) 4 bytes of each input.
	MOVL	-4(SI)(BP*1), BX
	MOVL	-4(DI)(BP*1), CX
	CMPL	BX, CX
	JEQ	allsame

diff4:
	BSWAPL	BX	// reverse order of bytes
	BSWAPL	CX
	XORL	BX, CX	// find bit differences
	BSRL	CX, CX	// index of highest bit difference
	SHRL	CX, BX	// move a's bit to bottom
	ANDL	$1, BX	// mask bit
	LEAL	-1(BX*2), BX	// 1/0 => +1/-1
	MOVL	BX, (AX)
	RET

	// 0-3 bytes in common
small:
	LEAL	(BP*8), CX	// bytes left -> bits left
	NEGL	CX
	JEQ	allsame

	// load si
	// Guard against a 4-byte load crossing into an unmapped page:
	// if the low address byte is above 0xfc, read backward from the
	// end of the data instead.
	CMPB	SI, $0xfc
	JA	si_high
	MOVL	(SI), SI
	JMP	si_finish
si_high:
	MOVL	-4(SI)(BP*1), SI
	SHRL	CX, SI
si_finish:
	SHLL	CX, SI

	// same for di
	CMPB	DI, $0xfc
	JA	di_high
	MOVL	(DI), DI
	JMP	di_finish
di_high:
	MOVL	-4(DI)(BP*1), DI
	SHRL	CX, DI
di_finish:
	SHLL	CX, DI

	BSWAPL	SI	// reverse order of bytes
	BSWAPL	DI
	XORL	SI, DI	// find bit differences
	JEQ	allsame
	BSRL	DI, CX	// index of highest bit difference
	SHRL	CX, SI	// move a's bit to bottom
	ANDL	$1, SI	// mask bit
	LEAL	-1(SI*2), BX	// 1/0 => +1/-1
	MOVL	BX, (AX)
	RET

	// all the bytes in common are the same, so we just need
	// to compare the lengths.
allsame:
	XORL	BX, BX
	XORL	CX, CX
	TESTL	DX, DX
	SETLT	BX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
	MOVL	BX, (AX)
	RET

View File

@@ -0,0 +1,237 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"
// Compare(a, b []byte) int, register ABI.
// Shuffle the incoming registers into cmpbody's expected layout.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = a_cap  (unused)
	// DI = b_base (want in DI)
	// SI = b_len  (want in DX)
	// R8 = b_cap  (unused)
	MOVQ	SI, DX
	MOVQ	AX, SI
	JMP	cmpbody<>(SB)
// runtime·cmpstring(a, b string) int, register ABI.
// Strings carry no cap word, so only four registers arrive.
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len  (want in DX)
	MOVQ	AX, SI
	MOVQ	DI, DX
	MOVQ	CX, DI
	JMP	cmpbody<>(SB)
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = output (-1/0/1)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
#ifndef hasAVX2
	// Runtime AVX2 dispatch. NOTE(review): the internal/cpu symbol must
	// be spelled with U+2215 ("internal∕cpu"); the plain "internalcpu"
	// that appeared here would not resolve at link time.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
#else
	JMP	big_loop_avx2
#endif

loop:
	// Compare 16 bytes per iteration with SSE2.
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// Entry points that rewind SI/DI to the differing 16-byte chunk
	// found by the 64-byte loops below.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI

	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	// Final (possibly overlapping) 8 bytes of each input.
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX	// - bits lift (== 64 - bits left mod 64)
	JEQ	allsame

	// load bytes of a into high bytes of AX
	// Guard against an 8-byte load crossing into an unmapped page:
	// if the low address byte is above 0xf8, read backward from the end.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b in to high bytes of BX
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	RET

	// All the bytes in common are the same; compare lengths.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	RET

	// this works for >= 64 bytes of data.
#ifndef hasAVX2
big_loop:
	// Compare 64 bytes per iteration as four 16-byte SSE chunks.
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop
#endif

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop

View File

@@ -0,0 +1,86 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-28
// Load both slice headers from the frame; the capacities at
// a_cap+8(FP) and b_cap+20(FP) are not needed for comparison.
MOVW a_base+0(FP), R2
MOVW a_len+4(FP), R0
MOVW b_base+12(FP), R3
MOVW b_len+16(FP), R1
ADD $28, R13, R7 // R7 = address of the result slot (ret+24(FP))
B cmpbody<>(SB)
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-20
// Load both string headers from the frame.
MOVW a_base+0(FP), R2
MOVW a_len+4(FP), R0
MOVW b_base+8(FP), R3
MOVW b_len+12(FP), R1
ADD $20, R13, R7 // R7 = address of the result slot (ret+16(FP))
B cmpbody<>(SB)
// On entry:
// R0 is the length of a
// R1 is the length of b
// R2 points to the start of a
// R3 points to the start of b
// R7 points to return value (-1/0/1 will be written here)
//
// On exit:
// R4, R5, R6 and R8 are clobbered
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
CMP R2, R3
BEQ samebytes // same backing pointers: only the lengths can differ
CMP R0, R1
MOVW R0, R6
MOVW.LT R1, R6 // R6 is min(R0, R1)
CMP $0, R6
BEQ samebytes // nothing to compare byte-wise
CMP $4, R6
ADD R2, R6 // R2 is current byte in a, R6 is the end of the range to compare
BLT byte_loop // length < 4
AND $3, R2, R8
CMP $0, R8
BNE byte_loop // unaligned a, use byte-wise compare (TODO: try to align a)
aligned_a:
AND $3, R3, R8
CMP $0, R8
BNE byte_loop // unaligned b, use byte-wise compare
AND $0xfffffffc, R6, R8 // R8 = end of the word-aligned portion of the range
// length >= 4
chunk4_loop:
MOVW.P 4(R2), R4 // load a word from each side, post-incrementing
MOVW.P 4(R3), R5
CMP R4, R5
BNE cmp // words differ: rewind and find the byte via byte_loop
CMP R2, R8
BNE chunk4_loop
CMP R2, R6
BEQ samebytes // all compared bytes were the same; compare lengths
byte_loop:
MOVBU.P 1(R2), R4
MOVBU.P 1(R3), R5
CMP R4, R5
BNE ret // first differing byte decides the result
CMP R2, R6
BNE byte_loop
samebytes:
// Common prefix is equal: derive -1/0/1 from the length comparison.
CMP R0, R1
MOVW.LT $1, R0
MOVW.GT $-1, R0
MOVW.EQ $0, R0
MOVW R0, (R7)
RET
ret:
// bytes differed; the flags from the CMP above select the sign
MOVW.LT $1, R0
MOVW.GT $-1, R0
MOVW R0, (R7)
RET
cmp:
// A 4-byte chunk differed: step both pointers back over it and
// let byte_loop locate the first differing byte.
SUB $4, R2, R2
SUB $4, R3, R3
B byte_loop

View File

@@ -0,0 +1,125 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
// R0 = a_base (want in R0)
// R1 = a_len (want in R1)
// R2 = a_cap (unused)
// R3 = b_base (want in R2)
// R4 = b_len (want in R3)
// R5 = b_cap (unused)
MOVD R3, R2
MOVD R4, R3
B cmpbody<>(SB)
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// R0 = a_base
// R1 = a_len
// R2 = b_base
// R3 = b_len
B cmpbody<>(SB)
// On entry:
// R0 points to the start of a
// R1 is the length of a
// R2 points to the start of b
// R3 is the length of b
//
// On exit:
// R0 is the result
// R4, R5, R6, R8, R9 and R10 are clobbered
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
CMP R0, R2
BEQ samebytes // same starting pointers; compare lengths
CMP R1, R3
CSEL LT, R3, R1, R6 // R6 is min(R1, R3)
CBZ R6, samebytes
BIC $0xf, R6, R10
CBZ R10, small // length < 16
ADD R0, R10 // end of chunk16
// length >= 16
chunk16_loop:
LDP.P 16(R0), (R4, R8)
LDP.P 16(R2), (R5, R9)
CMP R4, R5
BNE cmp
CMP R8, R9
BNE cmpnext
CMP R10, R0
BNE chunk16_loop
AND $0xf, R6, R6
CBZ R6, samebytes
SUBS $8, R6
BLT tail
// the length of tail > 8 bytes
MOVD.P 8(R0), R4
MOVD.P 8(R2), R5
CMP R4, R5
BNE cmp
SUB $8, R6
// compare last 8 bytes
tail:
MOVD (R0)(R6), R4
MOVD (R2)(R6), R5
CMP R4, R5
BEQ samebytes
cmp:
REV R4, R4
REV R5, R5
CMP R4, R5
ret:
MOVD $1, R0
CNEG HI, R0, R0
RET
small:
TBZ $3, R6, lt_8
MOVD (R0), R4
MOVD (R2), R5
CMP R4, R5
BNE cmp
SUBS $8, R6
BEQ samebytes
ADD $8, R0
ADD $8, R2
SUB $8, R6
B tail
lt_8:
TBZ $2, R6, lt_4
MOVWU (R0), R4
MOVWU (R2), R5
CMPW R4, R5
BNE cmp
SUBS $4, R6
BEQ samebytes
ADD $4, R0
ADD $4, R2
lt_4:
TBZ $1, R6, lt_2
MOVHU (R0), R4
MOVHU (R2), R5
CMPW R4, R5
BNE cmp
ADD $2, R0
ADD $2, R2
lt_2:
TBZ $0, R6, samebytes
one:
MOVBU (R0), R4
MOVBU (R2), R5
CMPW R4, R5
BNE ret
samebytes:
CMP R3, R1
CSET NE, R0
CNEG LO, R0, R0
RET
cmpnext:
REV R8, R4
REV R9, R5
CMP R4, R5
B ret

View File

@@ -0,0 +1,76 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le && !riscv64
package bytealg
import _ "unsafe" // for go:linkname
// Compare returns an integer comparing two byte slices
// lexicographically: 0 if a == b, -1 if a < b, and +1 if a > b.
// A nil slice argument is equivalent to an empty slice.
func Compare(a, b []byte) int {
	n := min(len(a), len(b))
	// Skip the byte scan when there is nothing to scan or when both
	// slices share the same backing memory.
	if n > 0 && &a[0] != &b[0] {
		for i := 0; i < n; i++ {
			switch {
			case a[i] < b[i]:
				return -1
			case a[i] > b[i]:
				return +1
			}
		}
	}
	// Common prefix is equal: order by length.
	switch {
	case len(a) < len(b):
		return -1
	case len(a) > len(b):
		return +1
	}
	return 0
}
// CompareString returns an integer comparing two strings
// lexicographically: 0 if a == b, -1 if a < b, and +1 if a > b.
// It delegates to the runtime's cmpstring via the linkname below.
func CompareString(a, b string) int {
	return runtime_cmpstring(a, b)
}
// runtime.cmpstring calls are emitted by the compiler.
//
// runtime.cmpstring should be an internal detail,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
// - gitee.com/zhaochuninhefei/gmgo
// - github.com/bytedance/gopkg
// - github.com/songzhibin97/gkit
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname runtime_cmpstring runtime.cmpstring
func runtime_cmpstring(a, b string) int {
	// Walk the common prefix; the loop condition doubles as the
	// min(len(a), len(b)) bound.
	for i := 0; i < len(a) && i < len(b); i++ {
		switch {
		case a[i] < b[i]:
			return -1
		case a[i] > b[i]:
			return +1
		}
	}
	// Prefix is equal: the shorter string orders first.
	switch {
	case len(a) < len(b):
		return -1
	case len(a) > len(b):
		return +1
	default:
		return 0
	}
}

View File

@@ -0,0 +1,88 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
// ABIInternal slice arguments; move b's base/len into the
// registers cmpbody expects.
// R4 = a_base
// R5 = a_len
// R6 = a_cap (unused)
// R7 = b_base (want in R6)
// R8 = b_len (want in R7)
// R9 = b_cap (unused)
MOVV R7, R6
MOVV R8, R7
JMP cmpbody<>(SB)
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
// String arguments already arrive in the registers cmpbody wants.
// R4 = a_base
// R5 = a_len
// R6 = b_base
// R7 = b_len
JMP cmpbody<>(SB)
// On entry:
// R5 length of a
// R7 length of b
// R4 points to the start of a
// R6 points to the start of b
//
// On exit:
// R4 holds the result (-1/0/1), returned via ABIInternal
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
BEQ R4, R6, samebytes // same start of a and b
SGTU R5, R7, R9 // R9 = 1 if len(a) > len(b)
BNE R0, R9, r2_lt_r1
MOVV R5, R14
JMP entry
r2_lt_r1:
MOVV R7, R14 // R14 is min(R5, R7)
entry:
ADDV R4, R14, R12 // R4 start of a, R12 end of a
BEQ R4, R12, samebytes // length is 0
SRLV $4, R14 // R14 is number of 16-byte chunks
BEQ R0, R14, byte_loop
// make sure both a and b are aligned.
OR R4, R6, R15
AND $7, R15
BNE R0, R15, byte_loop // either side unaligned: byte-wise compare
PCALIGN $16
chunk16_loop:
BEQ R0, R14, byte_loop
MOVV (R4), R8
MOVV (R6), R9
BNE R8, R9, byte_loop // first doubleword differs: recompare byte-wise
MOVV 8(R4), R16
MOVV 8(R6), R17
ADDV $16, R4
ADDV $16, R6
SUBVU $1, R14
BEQ R16, R17, chunk16_loop
// second doubleword differed: back up to it, then byte-wise compare
SUBV $8, R4
SUBV $8, R6
byte_loop:
BEQ R4, R12, samebytes
MOVBU (R4), R8
ADDVU $1, R4
MOVBU (R6), R9
ADDVU $1, R6
BEQ R8, R9, byte_loop
byte_cmp:
SGTU R8, R9, R4 // R4 = 1 if (R8 > R9)
BNE R0, R4, ret
MOVV $-1, R4
JMP ret
samebytes:
// Common prefix equal: derive -1/0/1 from the length comparison.
SGTU R5, R7, R8
SGTU R7, R5, R9
SUBV R9, R8, R4
ret:
RET

View File

@@ -0,0 +1,88 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build mips64 || mips64le
#include "go_asm.h"
#include "textflag.h"
TEXT ·Compare(SB),NOSPLIT,$0-56
MOVV a_base+0(FP), R3
MOVV b_base+24(FP), R4
MOVV a_len+8(FP), R1
MOVV b_len+32(FP), R2
MOVV $ret+48(FP), R9
JMP cmpbody<>(SB)
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
MOVV a_base+0(FP), R3
MOVV b_base+16(FP), R4
MOVV a_len+8(FP), R1
MOVV b_len+24(FP), R2
MOVV $ret+32(FP), R9
JMP cmpbody<>(SB)
// On entry:
// R1 length of a
// R2 length of b
// R3 points to the start of a
// R4 points to the start of b
// R9 points to the return value (-1/0/1)
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
BEQ R3, R4, samebytes // same start of a and b
SGTU R1, R2, R7
BNE R0, R7, r2_lt_r1
MOVV R1, R10
JMP entry
r2_lt_r1:
MOVV R2, R10 // R10 is min(R1, R2)
entry:
ADDV R3, R10, R8 // R3 start of a, R8 end of a
BEQ R3, R8, samebytes // length is 0
SRLV $4, R10 // R10 is number of chunks
BEQ R0, R10, byte_loop
// make sure both a and b are aligned.
OR R3, R4, R11
AND $7, R11
BNE R0, R11, byte_loop
chunk16_loop:
BEQ R0, R10, byte_loop
MOVV (R3), R6
MOVV (R4), R7
BNE R6, R7, byte_loop
MOVV 8(R3), R13
MOVV 8(R4), R14
ADDV $16, R3
ADDV $16, R4
SUBVU $1, R10
BEQ R13, R14, chunk16_loop
SUBV $8, R3
SUBV $8, R4
byte_loop:
BEQ R3, R8, samebytes
MOVBU (R3), R6
ADDVU $1, R3
MOVBU (R4), R7
ADDVU $1, R4
BEQ R6, R7, byte_loop
byte_cmp:
SGTU R6, R7, R8 // R8 = 1 if (R6 > R7)
BNE R0, R8, ret
MOVV $-1, R8
JMP ret
samebytes:
SGTU R1, R2, R6
SGTU R2, R1, R7
SUBV R7, R6, R8
ret:
MOVV R8, (R9)
RET

View File

@@ -0,0 +1,72 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build mips || mipsle
#include "go_asm.h"
#include "textflag.h"
TEXT ·Compare(SB),NOSPLIT,$0-28
MOVW a_base+0(FP), R3
MOVW b_base+12(FP), R4
MOVW a_len+4(FP), R1
MOVW b_len+16(FP), R2
BEQ R3, R4, samebytes
SGTU R1, R2, R7
MOVW R1, R8
CMOVN R7, R2, R8 // R8 is min(R1, R2)
ADDU R3, R8 // R3 is current byte in a, R8 is last byte in a to compare
loop:
BEQ R3, R8, samebytes
MOVBU (R3), R6
ADDU $1, R3
MOVBU (R4), R7
ADDU $1, R4
BEQ R6, R7 , loop
SGTU R6, R7, R8
MOVW $-1, R6
CMOVZ R8, R6, R8
JMP cmp_ret
samebytes:
SGTU R1, R2, R6
SGTU R2, R1, R7
SUBU R7, R6, R8
cmp_ret:
MOVW R8, ret+24(FP)
RET
TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
MOVW a_base+0(FP), R3
MOVW a_len+4(FP), R1
MOVW b_base+8(FP), R4
MOVW b_len+12(FP), R2
BEQ R3, R4, samebytes
SGTU R1, R2, R7
MOVW R1, R8
CMOVN R7, R2, R8 // R8 is min(R1, R2)
ADDU R3, R8 // R3 is current byte in a, R8 is last byte in a to compare
loop:
BEQ R3, R8, samebytes // all compared bytes were the same; compare lengths
MOVBU (R3), R6
ADDU $1, R3
MOVBU (R4), R7
ADDU $1, R4
BEQ R6, R7 , loop
// bytes differed
SGTU R6, R7, R8
MOVW $-1, R6
CMOVZ R8, R6, R8
JMP cmp_ret
samebytes:
SGTU R1, R2, R6
SGTU R2, R1, R7
SUBU R7, R6, R8
cmp_ret:
MOVW R8, ret+16(FP)
RET

View File

@@ -0,0 +1,23 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build 386 || amd64 || s390x || arm || arm64 || loong64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le || riscv64
package bytealg
import _ "unsafe" // For go:linkname
// Compare is implemented in per-architecture assembly in this
// package; it returns -1, 0, or +1 ordering a lexicographically
// against b.
//
//go:noescape
func Compare(a, b []byte) int

// CompareString orders two strings the way Compare orders byte
// slices, delegating to the runtime's assembly cmpstring.
func CompareString(a, b string) int {
	return abigen_runtime_cmpstring(a, b)
}

// The declaration below generates ABI wrappers for functions
// implemented in assembly in this package but declared in another
// package.
//
//go:linkname abigen_runtime_cmpstring runtime.cmpstring
func abigen_runtime_cmpstring(a, b string) int

View File

@@ -0,0 +1,342 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
#include "go_asm.h"
#include "textflag.h"
// Helper names for x-form loads in BE ordering.
#ifdef GOARCH_ppc64le
#define _LDBEX MOVDBR
#define _LWBEX MOVWBR
#define _LHBEX MOVHBR
#else
#define _LDBEX MOVD
#define _LWBEX MOVW
#define _LHBEX MOVH
#endif
#ifdef GOPPC64_power9
#define SETB_CR0(rout) SETB CR0, rout
#define SETB_CR1(rout) SETB CR1, rout
#define SETB_INIT()
#define SETB_CR0_NE(rout) SETB_CR0(rout)
#else
// A helper macro to emulate SETB on P8. This assumes
// -1 is in R20, and 1 is in R21. crxlt and crxeq must
// also be the same CR field.
#define _SETB(crxlt, crxeq, rout) \
ISEL crxeq,R0,R21,rout \
ISEL crxlt,R20,rout,rout
// A special case when it is know the comparison
// will always be not equal. The result must be -1 or 1.
#define SETB_CR0_NE(rout) \
ISEL CR0LT,R20,R21,rout
#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
#define SETB_INIT() \
MOVD $-1,R20 \
MOVD $1,R21
#endif
TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
// incoming:
// R3 a addr
// R4 a len
// R6 b addr
// R7 b len
//
// on entry to cmpbody:
// R3 return value if len(a) == len(b)
// R5 a addr
// R6 b addr
// R9 min(len(a),len(b))
SETB_INIT()
MOVD R3,R5
CMP R4,R7,CR0
CMP R3,R6,CR7
ISEL CR0LT,R4,R7,R9
SETB_CR0(R3)
BC $12,30,LR // beqlr cr7
BR cmpbody<>(SB)
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// incoming:
// R3 a addr -> R5
// R4 a len -> R3
// R5 b addr -> R6
// R6 b len -> R4
//
// on entry to cmpbody:
// R3 compare value if compared length is same.
// R5 a addr
// R6 b addr
// R9 min(len(a),len(b))
SETB_INIT()
CMP R4,R6,CR0
CMP R3,R5,CR7
ISEL CR0LT,R4,R6,R9
MOVD R5,R6
MOVD R3,R5
SETB_CR0(R3)
BC $12,30,LR // beqlr cr7
BR cmpbody<>(SB)
#ifdef GOARCH_ppc64le
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL byteswap<>+0(SB), RODATA, $16
#define SWAP V21
#endif
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
start:
CMP R9,$16,CR0
CMP R9,$32,CR1
CMP R9,$64,CR2
MOVD $16,R10
BLT cmp8
BLT CR1,cmp16
BLT CR2,cmp32
cmp64: // >= 64B
DCBT (R5) // optimize for size>=64
DCBT (R6) // cache hint
SRD $6,R9,R14 // There is at least one iteration.
MOVD R14,CTR
ANDCC $63,R9,R9
CMP R9,$16,CR1 // Do setup for tail check early on.
CMP R9,$32,CR2
CMP R9,$48,CR3
ADD $-16,R9,R9
MOVD $32,R11 // set offsets to load into vector
MOVD $48,R12 // set offsets to load into vector
PCALIGN $16
cmp64_loop:
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different // jump out if its different
LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R12),V3 // load bytes of A at offset 64 into vector
LXVD2X (R6)(R12),V4 // load bytes of B at offset 64 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
ADD $64,R5,R5 // increment to next 64 bytes of A
ADD $64,R6,R6 // increment to next 64 bytes of B
BDNZ cmp64_loop
BC $12,2,LR // beqlr
// Finish out tail with minimal overlapped checking.
// Note, 0 tail is handled by beqlr above.
BLE CR1,cmp64_tail_gt0
BLE CR2,cmp64_tail_gt16
BLE CR3,cmp64_tail_gt32
cmp64_tail_gt48: // 49 - 63 B
LXVD2X (R0)(R5),V3
LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R10),V3
LXVD2X (R6)(R10),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R11),V3
LXVD2X (R6)(R11),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
BR cmp64_tail_gt0
PCALIGN $16
cmp64_tail_gt32: // 33 - 48B
LXVD2X (R0)(R5),V3
LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R10),V3
LXVD2X (R6)(R10),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
BR cmp64_tail_gt0
PCALIGN $16
cmp64_tail_gt16: // 17 - 32B
LXVD2X (R0)(R5),V3
LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
BR cmp64_tail_gt0
PCALIGN $16
cmp64_tail_gt0: // 1 - 16B
LXVD2X (R5)(R9),V3
LXVD2X (R6)(R9),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
RET
PCALIGN $16
cmp32: // 32 - 63B
ANDCC $31,R9,R9
LXVD2X (R0)(R5),V3
LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R10)(R5),V3
LXVD2X (R10)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
BC $12,2,LR // beqlr
ADD R9,R10,R10
LXVD2X (R9)(R5),V3
LXVD2X (R9)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R10)(R5),V3
LXVD2X (R10)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
RET
PCALIGN $16
cmp16: // 16 - 31B
ANDCC $15,R9,R9
LXVD2X (R0)(R5),V3
LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
BC $12,2,LR // beqlr
LXVD2X (R9)(R5),V3
LXVD2X (R9)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
RET
PCALIGN $16
different:
#ifdef GOARCH_ppc64le
MOVD $byteswap<>+00(SB),R16
LXVD2X (R16)(R0),SWAP // Set up swap string
VPERM V3,V3,SWAP,V3
VPERM V4,V4,SWAP,V4
#endif
MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison
MFVSRD VS36,R10
CMPU R16,R10
BEQ lower
SETB_CR0_NE(R3)
RET
PCALIGN $16
lower:
VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPR for comparison
MFVSRD VS35,R16
VSLDOI $8,V4,V4,V4
MFVSRD VS36,R10
CMPU R16,R10
SETB_CR0_NE(R3)
RET
PCALIGN $16
cmp8: // 8 - 15B (0 - 15B if GOPPC64_power10)
#ifdef GOPPC64_power10
SLD $56,R9,R9
LXVLL R5,R9,V3 // Load bytes starting from MSB to LSB, unused are zero filled.
LXVLL R6,R9,V4
VCMPUQ V3,V4,CR0 // Compare as a 128b integer.
SETB_CR0(R6)
ISEL CR0EQ,R3,R6,R3 // If equal, length determines the return value.
RET
#else
CMP R9,$8
BLT cmp4
ANDCC $7,R9,R9
_LDBEX (R0)(R5),R10
_LDBEX (R0)(R6),R11
_LDBEX (R9)(R5),R12
_LDBEX (R9)(R6),R14
CMPU R10,R11,CR0
SETB_CR0(R5)
CMPU R12,R14,CR1
SETB_CR1(R6)
CRAND CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value.
ISEL CR0EQ,R6,R5,R4
ISEL CR1EQ,R3,R4,R3
RET
PCALIGN $16
cmp4: // 4 - 7B
CMP R9,$4
BLT cmp2
ANDCC $3,R9,R9
_LWBEX (R0)(R5),R10
_LWBEX (R0)(R6),R11
_LWBEX (R9)(R5),R12
_LWBEX (R9)(R6),R14
RLDIMI $32,R10,$0,R12
RLDIMI $32,R11,$0,R14
CMPU R12,R14
BR cmp0
PCALIGN $16
cmp2: // 2 - 3B
CMP R9,$2
BLT cmp1
ANDCC $1,R9,R9
_LHBEX (R0)(R5),R10
_LHBEX (R0)(R6),R11
_LHBEX (R9)(R5),R12
_LHBEX (R9)(R6),R14
RLDIMI $32,R10,$0,R12
RLDIMI $32,R11,$0,R14
CMPU R12,R14
BR cmp0
PCALIGN $16
cmp1:
CMP R9,$0
BEQ cmp0
MOVBZ (R5),R10
MOVBZ (R6),R11
CMPU R10,R11
cmp0:
SETB_CR0(R6)
ISEL CR0EQ,R3,R6,R3
RET
#endif

View File

@@ -0,0 +1,222 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
// X10 = a_base
// X11 = a_len
// X12 = a_cap (unused)
// X13 = b_base (want in X12)
// X14 = b_len (want in X13)
// X15 = b_cap (unused)
MOV X13, X12
MOV X14, X13
JMP compare<>(SB)
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// X10 = a_base
// X11 = a_len
// X12 = b_base
// X13 = b_len
JMP compare<>(SB)
// On entry:
// X10 points to start of a
// X11 length of a
// X12 points to start of b
// X13 length of b
// for non-regabi X14 points to the address to store the return value (-1/0/1)
// for regabi the return value in X10
TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
BEQ X10, X12, cmp_len
MOV X11, X5
BGE X13, X5, use_a_len // X5 = min(len(a), len(b))
MOV X13, X5
use_a_len:
BEQZ X5, cmp_len
MOV $32, X6
BLT X5, X6, check8_unaligned
// Check alignment - if alignment differs we have to do one byte at a time.
AND $7, X10, X7
AND $7, X12, X8
BNE X7, X8, check8_unaligned
BEQZ X7, compare32
// Check one byte at a time until we reach 8 byte alignment.
SUB X7, X0, X7
ADD $8, X7, X7
SUB X7, X5, X5
align:
SUB $1, X7
MOVBU 0(X10), X8
MOVBU 0(X12), X9
BNE X8, X9, cmp
ADD $1, X10
ADD $1, X12
BNEZ X7, align
check32:
// X6 contains $32
BLT X5, X6, compare16
compare32:
MOV 0(X10), X15
MOV 0(X12), X16
MOV 8(X10), X17
MOV 8(X12), X18
BNE X15, X16, cmp8a
BNE X17, X18, cmp8b
MOV 16(X10), X15
MOV 16(X12), X16
MOV 24(X10), X17
MOV 24(X12), X18
BNE X15, X16, cmp8a
BNE X17, X18, cmp8b
ADD $32, X10
ADD $32, X12
SUB $32, X5
BGE X5, X6, compare32
BEQZ X5, cmp_len
check16:
MOV $16, X6
BLT X5, X6, check8_unaligned
compare16:
MOV 0(X10), X15
MOV 0(X12), X16
MOV 8(X10), X17
MOV 8(X12), X18
BNE X15, X16, cmp8a
BNE X17, X18, cmp8b
ADD $16, X10
ADD $16, X12
SUB $16, X5
BEQZ X5, cmp_len
check8_unaligned:
MOV $8, X6
BLT X5, X6, check4_unaligned
compare8_unaligned:
MOVBU 0(X10), X8
MOVBU 1(X10), X15
MOVBU 2(X10), X17
MOVBU 3(X10), X19
MOVBU 4(X10), X21
MOVBU 5(X10), X23
MOVBU 6(X10), X25
MOVBU 7(X10), X29
MOVBU 0(X12), X9
MOVBU 1(X12), X16
MOVBU 2(X12), X18
MOVBU 3(X12), X20
MOVBU 4(X12), X22
MOVBU 5(X12), X24
MOVBU 6(X12), X28
MOVBU 7(X12), X30
BNE X8, X9, cmp1a
BNE X15, X16, cmp1b
BNE X17, X18, cmp1c
BNE X19, X20, cmp1d
BNE X21, X22, cmp1e
BNE X23, X24, cmp1f
BNE X25, X28, cmp1g
BNE X29, X30, cmp1h
ADD $8, X10
ADD $8, X12
SUB $8, X5
BGE X5, X6, compare8_unaligned
BEQZ X5, cmp_len
check4_unaligned:
MOV $4, X6
BLT X5, X6, compare1
compare4_unaligned:
MOVBU 0(X10), X8
MOVBU 1(X10), X15
MOVBU 2(X10), X17
MOVBU 3(X10), X19
MOVBU 0(X12), X9
MOVBU 1(X12), X16
MOVBU 2(X12), X18
MOVBU 3(X12), X20
BNE X8, X9, cmp1a
BNE X15, X16, cmp1b
BNE X17, X18, cmp1c
BNE X19, X20, cmp1d
ADD $4, X10
ADD $4, X12
SUB $4, X5
BGE X5, X6, compare4_unaligned
compare1:
BEQZ X5, cmp_len
MOVBU 0(X10), X8
MOVBU 0(X12), X9
BNE X8, X9, cmp
ADD $1, X10
ADD $1, X12
SUB $1, X5
JMP compare1
// Compare 8 bytes of memory in X15/X16 that are known to differ.
cmp8a:
MOV X15, X17
MOV X16, X18
// Compare 8 bytes of memory in X17/X18 that are known to differ.
cmp8b:
MOV $0xff, X19
cmp8_loop:
AND X17, X19, X8
AND X18, X19, X9
BNE X8, X9, cmp
SLLI $8, X19
JMP cmp8_loop
cmp1a:
SLTU X9, X8, X5
SLTU X8, X9, X6
JMP cmp_ret
cmp1b:
SLTU X16, X15, X5
SLTU X15, X16, X6
JMP cmp_ret
cmp1c:
SLTU X18, X17, X5
SLTU X17, X18, X6
JMP cmp_ret
cmp1d:
SLTU X20, X19, X5
SLTU X19, X20, X6
JMP cmp_ret
cmp1e:
SLTU X22, X21, X5
SLTU X21, X22, X6
JMP cmp_ret
cmp1f:
SLTU X24, X23, X5
SLTU X23, X24, X6
JMP cmp_ret
cmp1g:
SLTU X28, X25, X5
SLTU X25, X28, X6
JMP cmp_ret
cmp1h:
SLTU X30, X29, X5
SLTU X29, X30, X6
JMP cmp_ret
cmp_len:
MOV X11, X8
MOV X13, X9
cmp:
SLTU X9, X8, X5
SLTU X8, X9, X6
cmp_ret:
SUB X5, X6, X10
RET

View File

@@ -0,0 +1,69 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
// Load both slice headers; the capacities are not needed.
MOVD a_base+0(FP), R3
MOVD a_len+8(FP), R4
MOVD b_base+24(FP), R5
MOVD b_len+32(FP), R6
LA ret+48(FP), R7 // R7 = address of the result word
BR cmpbody<>(SB)
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
// Load both string headers.
MOVD a_base+0(FP), R3
MOVD a_len+8(FP), R4
MOVD b_base+16(FP), R5
MOVD b_len+24(FP), R6
LA ret+32(FP), R7 // R7 = address of the result word
BR cmpbody<>(SB)
// input:
// R3 = a
// R4 = alen
// R5 = b
// R6 = blen
// R7 = address of output word (stores -1/0/1 here)
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
CMPBEQ R3, R5, cmplengths // same pointers: only the lengths can differ
MOVD R4, R8
CMPBLE R4, R6, amin
MOVD R6, R8
amin:
// R8 = min(alen, blen)
CMPBEQ R8, $0, cmplengths
CMP R8, $256
BLE tail
loop:
// CLC compares 256 bytes per iteration, setting the condition code.
CLC $256, 0(R3), 0(R5)
BGT gt
BLT lt
SUB $256, R8
MOVD $256(R3), R3
MOVD $256(R5), R5
CMP R8, $256
BGT loop
tail:
// Execute the CLC at cmpbodyclc<> with its length field taken from
// R8; the instruction's length encoding is one less than the byte
// count, hence the SUB $1.
SUB $1, R8
EXRL $cmpbodyclc<>(SB), R8
BGT gt
BLT lt
cmplengths:
// Common prefix equal: order by length.
CMP R4, R6
BEQ eq
BLT lt
gt:
MOVD $1, 0(R7)
RET
lt:
MOVD $-1, 0(R7)
RET
eq:
MOVD $0, 0(R7)
RET
// Target of the EXRL in cmpbody's tail: the $1 length placeholder is
// replaced at execution time by the low byte of the EXRL register.
TEXT cmpbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
CLC $1, 0(R3), 0(R5)
RET

View File

@@ -0,0 +1,115 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Compare(SB), NOSPLIT, $0-56
Get SP
I64Load a_base+0(FP)
I64Load a_len+8(FP)
I64Load b_base+24(FP)
I64Load b_len+32(FP)
Call cmpbody<>(SB)
I64Store ret+48(FP)
RET
TEXT runtime·cmpstring(SB), NOSPLIT, $0-40
Get SP
I64Load a_base+0(FP)
I64Load a_len+8(FP)
I64Load b_base+16(FP)
I64Load b_len+24(FP)
Call cmpbody<>(SB)
I64Store ret+32(FP)
RET
// params: a, alen, b, blen
// ret: -1/0/1
TEXT cmpbody<>(SB), NOSPLIT, $0-0
// len = min(alen, blen)
Get R1
Get R3
Get R1
Get R3
I64LtU
Select
Set R4
Get R0
I32WrapI64
Get R2
I32WrapI64
Get R4
I32WrapI64
Call memcmp<>(SB)
I64ExtendI32S
Tee R5
I64Eqz
If
// check length
Get R1
Get R3
I64Sub
Set R5
End
I64Const $0
I64Const $-1
I64Const $1
Get R5
I64Const $0
I64LtS
Select
Get R5
I64Eqz
Select
Return
// compiled with emscripten
// params: a, b, len
// ret: <0/0/>0
TEXT memcmp<>(SB), NOSPLIT, $0-0
Get R2
If $1
Loop
Get R0
I32Load8S $0
Tee R3
Get R1
I32Load8S $0
Tee R4
I32Eq
If
Get R0
I32Const $1
I32Add
Set R0
Get R1
I32Const $1
I32Add
Set R1
I32Const $0
Get R2
I32Const $-1
I32Add
Tee R2
I32Eqz
BrIf $3
Drop
Br $1
End
End
Get R3
I32Const $255
I32And
Get R4
I32Const $255
I32And
I32Sub
Else
I32Const $0
End
Return

View File

@@ -0,0 +1,229 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"
TEXT ·Count(SB),NOSPLIT,$0-40
#ifndef hasPOPCNT
CMPB internalcpu·X86+const_offsetX86HasPOPCNT(SB), $1
JEQ 2(PC)
JMP ·countGeneric(SB)
#endif
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), BX
MOVB c+24(FP), AL
LEAQ ret+32(FP), R8
JMP countbody<>(SB)
TEXT ·CountString(SB),NOSPLIT,$0-32
#ifndef hasPOPCNT
CMPB internalcpu·X86+const_offsetX86HasPOPCNT(SB), $1
JEQ 2(PC)
JMP ·countGenericString(SB)
#endif
MOVQ s_base+0(FP), SI
MOVQ s_len+8(FP), BX
MOVB c+16(FP), AL
LEAQ ret+24(FP), R8
JMP countbody<>(SB)
// input:
// SI: data
// BX: data len
// AL: byte sought
// R8: address to put result
// This function requires the POPCNT instruction.
TEXT countbody<>(SB),NOSPLIT,$0
// Shuffle X0 around so that each byte contains
// the character we're looking for.
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
CMPQ BX, $16
JLT small
MOVQ $0, R12 // Accumulator
MOVQ SI, DI
CMPQ BX, $64
JAE avx2
sse:
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
JMP sseloopentry
PCALIGN $16
sseloop:
// Move the next 16-byte chunk of the data into X1.
MOVOU (DI), X1
// Compare bytes in X0 to X1.
PCMPEQB X0, X1
// Take the top bit of each byte in X1 and put the result in DX.
PMOVMSKB X1, DX
// Count number of matching bytes
POPCNTL DX, DX
// Accumulate into R12
ADDQ DX, R12
// Advance to next block.
ADDQ $16, DI
sseloopentry:
CMPQ DI, AX
JBE sseloop
// Get the number of bytes to consider in the last 16 bytes
ANDQ $15, BX
JZ end
// Create mask to ignore overlap between previous 16 byte block
// and the next.
MOVQ $16,CX
SUBQ BX, CX
MOVQ $0xFFFF, R10
SARQ CL, R10
SALQ CL, R10
// Process the last 16-byte chunk. This chunk may overlap with the
// chunks we've already searched so we need to mask part of it.
MOVOU (AX), X1
PCMPEQB X0, X1
PMOVMSKB X1, DX
// Apply mask
ANDQ R10, DX
POPCNTL DX, DX
ADDQ DX, R12
end:
MOVQ R12, (R8)
RET
// handle for lengths < 16
small:
TESTQ BX, BX
JEQ endzero
// Check if we'll load across a page boundary.
LEAQ 16(SI), AX
TESTW $0xff0, AX
JEQ endofpage
// We must ignore high bytes as they aren't part of our slice.
// Create mask.
MOVB BX, CX
MOVQ $1, R10
SALQ CL, R10
SUBQ $1, R10
// Load data
MOVOU (SI), X1
// Compare target byte with each byte in data.
PCMPEQB X0, X1
// Move result bits to integer register.
PMOVMSKB X1, DX
// Apply mask
ANDQ R10, DX
POPCNTL DX, DX
// Directly return DX, we don't need to accumulate
// since we have <16 bytes.
MOVQ DX, (R8)
RET
endzero:
MOVQ $0, (R8)
RET
endofpage:
// We must ignore low bytes as they aren't part of our slice.
MOVQ $16,CX
SUBQ BX, CX
MOVQ $0xFFFF, R10
SARQ CL, R10
SALQ CL, R10
// Load data into the high end of X1.
MOVOU -16(SI)(BX*1), X1
// Compare target byte with each byte in data.
PCMPEQB X0, X1
// Move result bits to integer register.
PMOVMSKB X1, DX
// Apply mask
ANDQ R10, DX
// Directly return DX, we don't need to accumulate
// since we have <16 bytes.
POPCNTL DX, DX
MOVQ DX, (R8)
RET
avx2:
#ifndef hasAVX2
CMPB internalcpu·X86+const_offsetX86HasAVX2(SB), $1
JNE sse
#endif
MOVD AX, X0
LEAQ -64(SI)(BX*1), R11
LEAQ (SI)(BX*1), R13
VPBROADCASTB X0, Y1
PCALIGN $32
avx2_loop:
VMOVDQU (DI), Y2
VMOVDQU 32(DI), Y4
VPCMPEQB Y1, Y2, Y3
VPCMPEQB Y1, Y4, Y5
VPMOVMSKB Y3, DX
VPMOVMSKB Y5, CX
POPCNTL DX, DX
POPCNTL CX, CX
ADDQ DX, R12
ADDQ CX, R12
ADDQ $64, DI
CMPQ DI, R11
JLE avx2_loop
// If last block is already processed,
// skip to the end.
//
// This check is NOT an optimization; if the input length is a
// multiple of 64, we must not go through the last leg of the
// function because the bit shift count passed to SALQ below would
// be 64, which is outside of the 0-63 range supported by those
// instructions.
//
// Tests in the bytes and strings packages with input lengths that
// are multiples of 64 will break if this condition were removed.
CMPQ DI, R13
JEQ endavx
// Load address of the last 64 bytes.
// There is an overlap with the previous block.
MOVQ R11, DI
VMOVDQU (DI), Y2
VMOVDQU 32(DI), Y4
VPCMPEQB Y1, Y2, Y3
VPCMPEQB Y1, Y4, Y5
VPMOVMSKB Y3, DX
VPMOVMSKB Y5, CX
// Exit AVX mode.
VZEROUPPER
SALQ $32, CX
ORQ CX, DX
// Create mask to ignore overlap between previous 64 byte block
// and the next.
ANDQ $63, BX
MOVQ $64, CX
SUBQ BX, CX
MOVQ $0xFFFFFFFFFFFFFFFF, R10
SALQ CL, R10
// Apply mask
ANDQ R10, DX
POPCNTQ DX, DX
ADDQ DX, R12
MOVQ R12, (R8)
RET
endavx:
// Exit AVX mode.
VZEROUPPER
MOVQ R12, (R8)
RET

View File

@@ -0,0 +1,43 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Count(SB),NOSPLIT,$0-20
// Load the slice header and the byte to count.
MOVW b_base+0(FP), R0
MOVW b_len+4(FP), R1
MOVBU c+12(FP), R2
MOVW $ret+16(FP), R7 // R7 = address of the int result
B countbytebody<>(SB)
TEXT ·CountString(SB),NOSPLIT,$0-16
// Load the string header and the byte to count.
MOVW s_base+0(FP), R0
MOVW s_len+4(FP), R1
MOVBU c+8(FP), R2
MOVW $ret+12(FP), R7 // R7 = address of the int result
B countbytebody<>(SB)
// Input:
// R0: data
// R1: data length
// R2: byte to find
// R7: address to put result
//
// On exit:
// R4 and R8 are clobbered
TEXT countbytebody<>(SB),NOSPLIT,$0
MOVW $0, R8 // R8 = count of byte to search
CMP $0, R1
B.EQ done // short path to handle 0-byte case
ADD R0, R1 // R1 is the end of the range
byte_loop:
MOVBU.P 1(R0), R4 // load next byte, post-incrementing R0
CMP R4, R2
ADD.EQ $1, R8 // bump the count only on a match
CMP R0, R1
B.NE byte_loop
done:
MOVW R8, (R7)
RET

View File

@@ -0,0 +1,92 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Count counts the occurrences of byte c in b.
// func Count(b []byte, c byte) int
TEXT ·Count(SB),NOSPLIT,$0-40
	MOVD	b_base+0(FP), R0
	MOVD	b_len+8(FP), R2
	MOVBU	c+24(FP), R1
	MOVD	$ret+32(FP), R8
	B	countbytebody<>(SB)

// CountString counts the occurrences of byte c in s.
// func CountString(s string, c byte) int
TEXT ·CountString(SB),NOSPLIT,$0-32
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	c+16(FP), R1
	MOVD	$ret+24(FP), R8
	B	countbytebody<>(SB)

// countbytebody counts bytes using SIMD on 32-byte aligned chunks,
// with scalar loops for the unaligned head and the short tail.
//
// input:
// R0: data
// R2: data len
// R1: byte to find
// R8: address to put result
TEXT countbytebody<>(SB),NOSPLIT,$0
	// R11 = count of byte to search
	MOVD	$0, R11
	// short path to handle 0-byte case
	CBZ	R2, done
	CMP	$0x20, R2
	// jump directly to tail if length < 32
	BLO	tail
	ANDS	$0x1f, R0, R9
	BEQ	chunk
	// Work with not 32-byte aligned head
	BIC	$0x1f, R0, R3
	ADD	$0x20, R3	// R3 = next 32-byte aligned address
	PCALIGN	$16
head_loop:
	MOVBU.P	1(R0), R5
	CMP	R5, R1
	CINC	EQ, R11, R11	// increment R11 on a match
	SUB	$1, R2, R2
	CMP	R0, R3
	BNE	head_loop
	// Work with 32-byte aligned chunks
chunk:
	BIC	$0x1f, R2, R9
	// The first chunk can also be the last
	CBZ	R9, tail
	// R3 = end of 32-byte chunks
	ADD	R0, R9, R3
	MOVD	$1, R5
	VMOV	R5, V5.B16
	// R2 = length of tail
	SUB	R9, R2, R2
	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
	VMOV	R1, V0.B16
	// Clear the low 64-bit element of V7 and V8
	VEOR	V7.B8, V7.B8, V7.B8
	VEOR	V8.B8, V8.B8, V8.B8
	PCALIGN	$16
	// Count the target byte in 32-byte chunk
chunk_loop:
	VLD1.P	(R0), [V1.B16, V2.B16]
	CMP	R0, R3
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// Clear the higher 7 bits
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	// Count lanes match the requested byte
	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
	VUADDLV	V6.B16, V7
	// Accumulate the count in low 64-bit element of V8 when inside the loop
	VADD	V7, V8
	BNE	chunk_loop
	VMOV	V8.D[0], R6
	ADD	R6, R11, R11
	CBZ	R2, done
tail:
	// Work with tail shorter than 32 bytes
	MOVBU.P	1(R0), R5
	SUB	$1, R2, R2
	CMP	R5, R1
	CINC	EQ, R11, R11
	CBNZ	R2, tail
done:
	MOVD	R11, (R8)	// store the count through the result pointer
	RET

View File

@@ -0,0 +1,27 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 && !arm && !arm64 && !ppc64le && !ppc64 && !riscv64 && !s390x
package bytealg
// Count returns the number of bytes in b equal to c.
func Count(b []byte, c byte) int {
	total := 0
	for i := 0; i < len(b); i++ {
		if b[i] == c {
			total++
		}
	}
	return total
}
// CountString returns the number of bytes in s equal to c.
func CountString(s string, c byte) int {
	total := 0
	// Range over the bytes of s; the compiler recognizes this
	// pattern and does not copy the string.
	for _, x := range []byte(s) {
		if x == c {
			total++
		}
	}
	return total
}

View File

@@ -0,0 +1,33 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 || arm || arm64 || ppc64le || ppc64 || riscv64 || s390x
package bytealg
// Count returns the number of bytes in b equal to c.
// It is implemented in assembly on this platform.
//
//go:noescape
func Count(b []byte, c byte) int

// CountString returns the number of bytes in s equal to c.
// It is implemented in assembly on this platform.
//
//go:noescape
func CountString(s string, c byte) int
// countGeneric is a pure-Go fallback used by the assembly
// implementations; it returns the number of bytes in b equal to c.
func countGeneric(b []byte, c byte) int {
	n := 0
	for i := range b {
		if b[i] == c {
			n++
		}
	}
	return n
}
// countGenericString is a pure-Go fallback used by the assembly
// implementations; it returns the number of bytes in s equal to c.
func countGenericString(s string, c byte) int {
	n := 0
	// Byte-wise iteration; the []byte conversion in a range clause
	// is recognized by the compiler and does not allocate.
	for _, x := range []byte(s) {
		if x == c {
			n++
		}
	}
	return n
}

View File

@@ -0,0 +1,154 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64le || ppc64
#include "go_asm.h"
#include "textflag.h"
// Count counts the occurrences of byte c in b.
// func Count(b []byte, c byte) int
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	// R6 = byte to count
	MTVRD	R6, V1		// move compare byte
	MOVD	R6, R5
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)

// CountString counts the occurrences of byte c in s.
// func CountString(s string, c byte) int
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
	// R3 = byte array pointer
	// R4 = length
	// R5 = byte to count
	MTVRD	R5, V1		// move compare byte
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)

// countbytebody counts 32 bytes per iteration with VSX vectors and
// accumulates match bits (8 bits per match, fixed up at the end).
//
// R3: addr of string
// R4: len of string
// R5: byte to count
// V1: byte to count, splatted.
// On exit:
// R3: return value
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD	$0, R18 // byte count
#ifndef GOPPC64_power10
	RLDIMI	$8, R5, $48, R5
	RLDIMI	$16, R5, $32, R5
	RLDIMI	$32, R5, $0, R5	// fill reg with the byte to count
#endif
	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
	BLT	tail		// Jump to the small string case
	SRD	$5, R4, R20	// R20 = number of 32-byte chunks
	MOVD	R20, CTR
	MOVD	$16, R21
	XXLXOR	V4, V4, V4
	XXLXOR	V5, V5, V5
	PCALIGN	$16
cmploop:
	LXVD2X	(R0)(R3), V0	// Count 32B per loop with two vector accumulators.
	LXVD2X	(R21)(R3), V2
	VCMPEQUB	V2, V1, V2
	VCMPEQUB	V0, V1, V0
	VPOPCNTD	V2, V2	// A match is 0xFF or 0. Count the bits into doubleword buckets.
	VPOPCNTD	V0, V0
	VADDUDM	V0, V4, V4	// Accumulate the popcounts. They are 8x the count.
	VADDUDM	V2, V5, V5	// The count will be fixed up afterwards.
	ADD	$32, R3
	BDNZ	cmploop
	VADDUDM	V4, V5, V5
	MFVSRD	V5, R18
	VSLDOI	$8, V5, V5, V5
	MFVSRD	V5, R21
	ADD	R21, R18, R18
	ANDCC	$31, R4, R4	// R4 = leftover tail length (0-31)
	// Skip the tail processing if no bytes remaining.
	BEQ	tail_0
#ifdef GOPPC64_power10
	SRD	$3, R18, R18	// Fix the vector loop count before counting the tail on P10.
tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLE	small_tail_p10
	LXV	0(R3), V0
	VCMPEQUB	V0, V1, V0
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ANDCC	$15, R4, R4
small_tail_p10:
	SLD	$56, R4, R6
	LXVLL	R3, R6, V0	// load-with-length: reads exactly R4 bytes
	VCMPEQUB	V0, V1, V0
	VCLRRB	V0, R4, V0	// If <16B being compared, clear matches of the 16-R4 bytes.
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R3
	RET
#else
tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLT	tail_8
	MOVD	(R3), R12
	MOVD	8(R3), R14
	CMPB	R12, R5, R12	// CMPB sets matching bytes to 0xFF
	CMPB	R14, R5, R14
	POPCNTD	R12, R12	// 8 bits set per matching byte
	POPCNTD	R14, R14
	ADD	R12, R18, R18
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ADD	$-16, R4, R4
tail_8:	// Count the remaining 0 - 15 bytes.
	CMP	R4, $8
	BLT	tail_4
	MOVD	(R3), R12
	CMPB	R12, R5, R12
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$8, R3, R3
	ADD	$-8, R4, R4
tail_4:	// Count the remaining 0 - 7 bytes.
	CMP	R4, $4
	BLT	tail_2
	MOVWZ	(R3), R12
	CMPB	R12, R5, R12
	SLD	$32, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$4, R3, R3
	ADD	$-4, R4, R4
tail_2:	// Count the remaining 0 - 3 bytes.
	CMP	R4, $2
	BLT	tail_1
	MOVHZ	(R3), R12
	CMPB	R12, R5, R12
	SLD	$48, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$2, R3, R3
	ADD	$-2, R4, R4
tail_1:	// Count the remaining 0 - 1 bytes.
	CMP	R4, $1
	BLT	tail_0
	MOVBZ	(R3), R12
	CMPB	R12, R5, R12
	ANDCC	$0x8, R12, R12	// keep a single bit of the match (worth 8 after fixup)
	ADD	R12, R18, R18
#endif
tail_0:	// No remaining tail to count.
	SRD	$3, R18, R3	// Fixup count, it is off by 8x.
	RET

View File

@@ -0,0 +1,49 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Count counts the occurrences of byte c in b
// using a simple byte-at-a-time loop.
// func Count(b []byte, c byte) int
TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
	// X10 = b_base
	// X11 = b_len
	// X12 = b_cap (unused)
	// X13 = byte to count (want in X12)
	AND	$0xff, X13, X12
	MOV	ZERO, X14	// count
	ADD	X10, X11	// end
	PCALIGN	$16
loop:
	BEQ	X10, X11, done
	MOVBU	(X10), X15
	ADD	$1, X10
	BNE	X12, X15, loop
	ADD	$1, X14
	JMP	loop
done:
	MOV	X14, X10	// result returned in X10
	RET

// CountString counts the occurrences of byte c in s.
// func CountString(s string, c byte) int
TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
	// X10 = s_base
	// X11 = s_len
	// X12 = byte to count
	AND	$0xff, X12
	MOV	ZERO, X14	// count
	ADD	X10, X11	// end
	PCALIGN	$16
loop:
	BEQ	X10, X11, done
	MOVBU	(X10), X15
	ADD	$1, X10
	BNE	X12, X15, loop
	ADD	$1, X14
	JMP	loop
done:
	MOV	X14, X10	// result returned in X10
	RET

View File

@@ -0,0 +1,169 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// condition code masks
// condition code masks
#define EQ 8
#define NE 7

// register assignments
#define R_ZERO R0
#define R_VAL R1
#define R_TMP R2
#define R_PTR R3
#define R_LEN R4
#define R_CHAR R5
#define R_RET R6
#define R_ITER R7
#define R_CNT R8
#define R_MPTR R9

// vector register assignments
#define V_ZERO V0
#define V_CHAR V1
#define V_MASK V2
#define V_VAL V3
#define V_CNT V4

// mask for trailing bytes in vector implementation
GLOBL countbytemask<>(SB), RODATA, $16
DATA countbytemask<>+0(SB)/8, $0x0101010101010101
DATA countbytemask<>+8(SB)/8, $0x0101010101010101

// func Count(b []byte, c byte) int
TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
	LMG	b+0(FP), R_PTR, R_LEN
	MOVBZ	c+24(FP), R_CHAR
	MOVD	$ret+32(FP), R_RET
	BR	countbytebody<>(SB)

// func CountString(s string, c byte) int
TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
	LMG	s+0(FP), R_PTR, R_LEN
	MOVBZ	c+16(FP), R_CHAR
	MOVD	$ret+24(FP), R_RET
	BR	countbytebody<>(SB)

// countbytebody counts 16 bytes per iteration when the vector
// facility is available, else falls back to a scalar loop.
//
// input:
// R_PTR = address of array of bytes
// R_LEN = number of bytes in array
// R_CHAR = byte value to count zero (extended to register width)
// R_RET = address of return value
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	// NOTE(review): symbol below may have lost the U+2215 package
	// separator (internal∕cpu) in extraction — confirm against original.
	MOVD	$internalcpu·S390X+const_offsetS390xHasVX(SB), R_TMP
	MOVD	$countbytemask<>(SB), R_MPTR
	CGIJ	$EQ, R_LEN, $0, ret0 // return if length is 0.
	SRD	$4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
	MOVBZ	(R_TMP), R_TMP       // load bool indicating support for vector facility
	CGIJ	$EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available

	// Start of vector code (have vector facility).
	//
	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
	// vector 'load with length' (VLL). It will be in the range [-1,14].
	// Also replicate c across a 16-byte vector and initialize V_ZERO.
	ANDW	$0xf, R_LEN
	VLVGB	$0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
	VZERO	V_ZERO             // V_ZERO = [1]uint128{0}
	ADDW	$-1, R_LEN
	VREPB	$0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}

	// Jump to loop if we have more than 15 bytes to process.
	CGIJ	$NE, R_ITER, $0, vxchunks

	// Load 1-15 bytes and corresponding mask.
	// Note: only the low 32-bits of R_LEN are used for the index.
	VLL	R_LEN, (R_PTR), V_VAL
	VLL	R_LEN, (R_MPTR), V_MASK

	// Compare each byte in input chunk against byte to be counted.
	// Each byte element will be set to either 0 (no match) or 1 (match).
	VCEQB	V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN	V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB	V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} -> [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF	V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} -> [1]uint128{x0+x1+x2+x3}

	// Return rightmost (lowest) 64-bit part of accumulator.
	VSTEG	$1, V_CNT, (R_RET)
	RET

vxchunks:
	// Load 0x01 into every byte element in the 16-byte mask vector.
	VREPIB	$1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
	VZERO	V_CNT      // initial uint128 count of 0

vxloop:
	// Load input bytes in 16-byte chunks.
	VL	(R_PTR), V_VAL

	// Compare each byte in input chunk against byte to be counted.
	// Each byte element will be set to either 0 (no match) or 1 (match).
	VCEQB	V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN	V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Increment input string address.
	MOVD	$16(R_PTR), R_PTR

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB	V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} -> [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF	V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} -> [1]uint128{x0+x1+x2+x3}
	VAQ	V_VAL, V_CNT, V_CNT  // accumulate

	// Repeat until all 16-byte chunks are done.
	BRCTG	R_ITER, vxloop

	// Skip to end if there are no trailing bytes.
	CIJ	$EQ, R_LEN, $-1, vxret

	// Load 1-15 bytes and corresponding mask.
	// Note: only the low 32-bits of R_LEN are used for the index.
	VLL	R_LEN, (R_PTR), V_VAL
	VLL	R_LEN, (R_MPTR), V_MASK

	// Compare each byte in input chunk against byte to be counted.
	// Each byte element will be set to either 0 (no match) or 1 (match).
	VCEQB	V_CHAR, V_VAL, V_VAL
	VN	V_MASK, V_VAL, V_VAL

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB	V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} -> [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF	V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} -> [1]uint128{x0+x1+x2+x3}
	VAQ	V_VAL, V_CNT, V_CNT  // accumulate

vxret:
	// Return rightmost (lowest) 64-bit part of accumulator.
	VSTEG	$1, V_CNT, (R_RET)
	RET

novx:
	// Start of non-vector code (the vector facility not available).
	//
	// Initialise counter and constant zero.
	MOVD	$0, R_CNT
	MOVD	$0, R_ZERO

loop:
	// Read 1-byte from input and compare.
	// Note: avoid putting LOCGR in critical path.
	MOVBZ	(R_PTR), R_VAL
	MOVD	$1, R_TMP
	MOVD	$1(R_PTR), R_PTR
	CMPW	R_VAL, R_CHAR
	LOCGR	$NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
	ADD	R_TMP, R_CNT       // accumulate 64-bit result

	// Repeat until all bytes have been checked.
	BRCTG	R_LEN, loop

ret:
	MOVD	R_CNT, (R_RET)
	RET

ret0:
	MOVD	$0, (R_RET)
	RET

View File

@@ -0,0 +1,130 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// memequal reports whether the size bytes at a and b are equal.
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT,$0-13
	MOVL	a+0(FP), SI
	MOVL	b+4(FP), DI
	CMPL	SI, DI
	JEQ	eq	// identical pointers are trivially equal
	MOVL	size+8(FP), BX
	LEAL	ret+12(FP), AX
	JMP	memeqbody<>(SB)
eq:
	MOVB	$1, ret+12(FP)
	RET

// memequal_varlen is like memequal but reads the size from the closure.
// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
	MOVL	a+0(FP), SI
	MOVL	b+4(FP), DI
	CMPL	SI, DI
	JEQ	eq
	MOVL	4(DX), BX	// compiler stores size at offset 4 in the closure
	LEAL	ret+8(FP), AX
	JMP	memeqbody<>(SB)
eq:
	MOVB	$1, ret+8(FP)
	RET

// memeqbody compares in 64-byte SSE chunks, then 4-byte words, with
// a page-boundary-safe path for inputs shorter than 4 bytes.
//
// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT memeqbody<>(SB),NOSPLIT,$0-0
	CMPL	BX, $4
	JB	small

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPL	BX, $64
	JB	bigloop
#ifdef GO386_softfloat
	JMP	bigloop	// no SSE registers in softfloat mode
#endif
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDL	$64, SI
	ADDL	$64, DI
	SUBL	$64, BX
	CMPL	DX, $0xffff	// all 16 lanes equal?
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 4 bytes at a time using 32-bit register
bigloop:
	CMPL	BX, $4
	JBE	leftover
	MOVL	(SI), CX
	MOVL	(DI), DX
	ADDL	$4, SI
	ADDL	$4, DI
	SUBL	$4, BX
	CMPL	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-4 bytes
leftover:
	MOVL	-4(SI)(BX*1), CX	// overlapping tail load
	MOVL	-4(DI)(BX*1), DX
	CMPL	CX, DX
	SETEQ	(AX)
	RET

small:
	CMPL	BX, $0
	JEQ	equal

	LEAL	0(BX*8), CX	// CX = size in bits
	NEGL	CX

	MOVL	SI, DX
	CMPB	DX, $0xfc
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVL	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 111111xx. Load up to bytes we want, move to correct position.
	MOVL	-4(SI)(BX*1), SI
	SHRL	CX, SI
si_finish:

	// same for DI.
	MOVL	DI, DX
	CMPB	DX, $0xfc
	JA	di_high
	MOVL	(DI), DI
	JMP	di_finish
di_high:
	MOVL	-4(DI)(BX*1), DI
	SHRL	CX, DI
di_finish:

	// discard the bytes beyond BX by shifting out and back
	SUBL	SI, DI
	SHLL	CX, DI
equal:
	SETEQ	(AX)
	RET

View File

@@ -0,0 +1,165 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"
// memequal reports whether the size bytes at a and b are equal.
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
	// AX = a (want in SI)
	// BX = b (want in DI)
	// CX = size (want in BX)
	CMPQ	AX, BX
	JNE	neq
	MOVQ	$1, AX	// return 1
	RET
neq:
	MOVQ	AX, SI
	MOVQ	BX, DI
	MOVQ	CX, BX
	JMP	memeqbody<>(SB)

// memequal_varlen is like memequal but reads the size from the closure.
// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
	// AX = a (want in SI)
	// BX = b (want in DI)
	// 8(DX) = size (want in BX)
	CMPQ	AX, BX
	JNE	neq
	MOVQ	$1, AX	// return 1
	RET
neq:
	MOVQ	AX, SI
	MOVQ	BX, DI
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	JMP	memeqbody<>(SB)

// memeqbody compares in 64-byte AVX2/SSE chunks, then 8-byte words,
// with a page-boundary-safe path for inputs shorter than 8 bytes.
//
// Input:
// a in SI
// b in DI
// count in BX
// Output:
// result in AX
TEXT memeqbody<>(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
#ifndef hasAVX2
	// NOTE(review): symbol below may have lost the U+2215 package
	// separator (internal∕cpu) in extraction — confirm against original.
	CMPB	internalcpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
	PCALIGN $16
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 lanes equal?
	JEQ	hugeloop
	XORQ	AX, AX	// return 0
	RET
#endif

	// 64 bytes at a time using ymm registers
	PCALIGN $16
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff	// all 32 lanes equal?
	JEQ	hugeloop_avx2
	VZEROUPPER
	XORQ	AX, AX	// return 0
	RET

bigloop_avx2:
	VZEROUPPER	// leave AVX mode before scalar tail

	// 8 bytes at a time using 64-bit register
	PCALIGN $16
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	XORQ	AX, AX	// return 0
	RET

	// remaining 0-8 bytes
leftover:
	MOVQ	-8(SI)(BX*1), CX	// overlapping tail load
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	AX
	RET

small:
	CMPQ	BX, $0
	JEQ	equal

	LEAQ	0(BX*8), CX	// CX = size in bits
	NEGQ	CX

	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// discard the bytes beyond BX by shifting out and back
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	AX
	RET

View File

@@ -0,0 +1,91 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// memequal reports whether the size bytes at a and b are equal.
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-13
	MOVW	a+0(FP), R0
	MOVW	b+4(FP), R2
	CMP	R0, R2
	B.EQ	eq	// identical pointers are trivially equal
	MOVW	size+8(FP), R1
	CMP	$0, R1
	B.EQ	eq	// short path to handle 0-byte case
	MOVW	$ret+12(FP), R7
	B	memeqbody<>(SB)
eq:
	MOVW	$1, R0
	MOVB	R0, ret+12(FP)
	RET

// memequal_varlen is like memequal but reads the size from the closure.
// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-9
	MOVW	a+0(FP), R0
	MOVW	b+4(FP), R2
	CMP	R0, R2
	B.EQ	eq
	MOVW	4(R7), R1	// compiler stores size at offset 4 in the closure
	CMP	$0, R1
	B.EQ	eq	// short path to handle 0-byte case
	MOVW	$ret+8(FP), R7
	B	memeqbody<>(SB)
eq:
	MOVW	$1, R0
	MOVB	R0, ret+8(FP)
	RET

// memeqbody compares word-at-a-time when both inputs are 4-byte
// aligned, else byte-at-a-time.
//
// Input:
// R0: data of a
// R1: length
// R2: data of b
// R7: points to return value
//
// On exit:
// R4, R5 and R6 are clobbered
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
	CMP	$1, R1
	B.EQ	one		// 1-byte special case for better performance
	CMP	$4, R1
	ADD	R0, R1		// R1 is the end of the range to compare
	B.LT	byte_loop	// length < 4
	AND	$3, R0, R6
	CMP	$0, R6
	B.NE	byte_loop	// unaligned a, use byte-wise compare (TODO: try to align a)
	AND	$3, R2, R6
	CMP	$0, R6
	B.NE	byte_loop	// unaligned b, use byte-wise compare
	AND	$0xfffffffc, R1, R6	// R6 = end of full 4-byte chunks
	// length >= 4
chunk4_loop:
	MOVW.P	4(R0), R4
	MOVW.P	4(R2), R5
	CMP	R4, R5
	B.NE	notequal
	CMP	R0, R6
	B.NE	chunk4_loop
	CMP	R0, R1
	B.EQ	equal		// reached the end
byte_loop:
	MOVBU.P	1(R0), R4
	MOVBU.P	1(R2), R5
	CMP	R4, R5
	B.NE	notequal
	CMP	R0, R1
	B.NE	byte_loop
equal:
	MOVW	$1, R0
	MOVB	R0, (R7)
	RET
one:
	MOVBU	(R0), R4
	MOVBU	(R2), R5
	CMP	R4, R5
	B.EQ	equal
notequal:
	MOVW	$0, R0
	MOVB	R0, (R7)
	RET

View File

@@ -0,0 +1,124 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// memequal reports whether the size bytes at a and b are equal.
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
	// short path to handle 0-byte case
	CBZ	R2, equal
	// short path to handle equal pointers
	CMP	R0, R1
	BEQ	equal
	B	memeqbody<>(SB)
equal:
	MOVD	$1, R0
	RET

// memequal_varlen is like memequal but reads the size from the closure.
// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
	CMP	R0, R1
	BEQ	eq
	MOVD	8(R26), R2	// compiler stores size at offset 8 in the closure
	CBZ	R2, eq
	B	memeqbody<>(SB)
eq:
	MOVD	$1, R0
	RET

// memeqbody compares in 64-byte SIMD chunks, then 16-byte GPR pairs,
// then an overlapping-load tail.
//
// input:
// R0: pointer a
// R1: pointer b
// R2: data len
// at return: result in R0
TEXT memeqbody<>(SB),NOSPLIT,$0
	CMP	$1, R2
	// handle 1-byte special case for better performance
	BEQ	one
	CMP	$16, R2
	// handle specially if length < 16
	BLO	tail
	BIC	$0x3f, R2, R3
	CBZ	R3, chunk16
	// work with 64-byte chunks
	ADD	R3, R0, R6	// end of chunks
chunk64_loop:
	VLD1.P	(R0), [V0.D2, V1.D2, V2.D2, V3.D2]
	VLD1.P	(R1), [V4.D2, V5.D2, V6.D2, V7.D2]
	VCMEQ	V0.D2, V4.D2, V8.D2
	VCMEQ	V1.D2, V5.D2, V9.D2
	VCMEQ	V2.D2, V6.D2, V10.D2
	VCMEQ	V3.D2, V7.D2, V11.D2
	VAND	V8.B16, V9.B16, V8.B16
	VAND	V8.B16, V10.B16, V8.B16
	VAND	V8.B16, V11.B16, V8.B16
	CMP	R0, R6
	VMOV	V8.D[0], R4
	VMOV	V8.D[1], R5
	CBZ	R4, not_equal	// a zero doubleword means a mismatched lane
	CBZ	R5, not_equal
	BNE	chunk64_loop
	AND	$0x3f, R2, R2
	CBZ	R2, equal
chunk16:
	// work with 16-byte chunks
	BIC	$0xf, R2, R3
	CBZ	R3, tail
	ADD	R3, R0, R6	// end of chunks
chunk16_loop:
	LDP.P	16(R0), (R4, R5)
	LDP.P	16(R1), (R7, R9)
	EOR	R4, R7
	CBNZ	R7, not_equal
	EOR	R5, R9
	CBNZ	R9, not_equal
	CMP	R0, R6
	BNE	chunk16_loop
	AND	$0xf, R2, R2
	CBZ	R2, equal
tail:
	// special compare of tail with length < 16
	TBZ	$3, R2, lt_8
	MOVD	(R0), R4
	MOVD	(R1), R5
	EOR	R4, R5
	CBNZ	R5, not_equal
	SUB	$8, R2, R6	// offset of the last 8 bytes
	MOVD	(R0)(R6), R4
	MOVD	(R1)(R6), R5
	EOR	R4, R5
	CBNZ	R5, not_equal
	B	equal
lt_8:
	TBZ	$2, R2, lt_4
	MOVWU	(R0), R4
	MOVWU	(R1), R5
	EOR	R4, R5
	CBNZ	R5, not_equal
	SUB	$4, R2, R6	// offset of the last 4 bytes
	MOVWU	(R0)(R6), R4
	MOVWU	(R1)(R6), R5
	EOR	R4, R5
	CBNZ	R5, not_equal
	B	equal
lt_4:
	TBZ	$1, R2, lt_2
	MOVHU.P	2(R0), R4
	MOVHU.P	2(R1), R5
	CMP	R4, R5
	BNE	not_equal
lt_2:
	TBZ	$0, R2, equal
one:
	MOVBU	(R0), R4
	MOVBU	(R1), R5
	CMP	R4, R5
	BNE	not_equal
equal:
	MOVD	$1, R0
	RET
not_equal:
	MOVB	ZR, R0
	RET

View File

@@ -0,0 +1,18 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
// Equal reports whether a and b
// are the same length and contain the same bytes.
// A nil argument is equivalent to an empty slice.
//
// Equal is equivalent to bytes.Equal.
// It is provided here for convenience,
// because some packages cannot depend on bytes.
func Equal(a, b []byte) bool {
	// Neither cmd/compile nor gccgo allocates for these string conversions.
	// There is a test for this in package bytes.
	// NOTE: keep this exact expression — the compilers recognize the
	// string(x) == string(y) pattern and compare without copying.
	return string(a) == string(b)
}

View File

@@ -0,0 +1,44 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
#define REGCTXT R29

// memequal reports whether the size bytes at a and b are equal,
// using a byte-at-a-time loop.
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
	BEQ	R4, R5, eq	// identical pointers are trivially equal
	ADDV	R4, R6, R7	// R7 = end of a
	PCALIGN	$16
loop:
	BNE	R4, R7, test
	MOVV	$1, R4	// reached the end: equal
	RET
test:
	MOVBU	(R4), R9
	ADDV	$1, R4
	MOVBU	(R5), R10
	ADDV	$1, R5
	BEQ	R9, R10, loop

	MOVB	R0, R4	// mismatch: return false
	RET
eq:
	MOVV	$1, R4
	RET

// memequal_varlen reads the size from the closure and tail-calls
// memequal through the stack (frame holds args and result).
// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$40-17
	BEQ	R4, R5, eq
	MOVV	8(REGCTXT), R6	// compiler stores size at offset 8 in the closure
	MOVV	R4, 8(R3)
	MOVV	R5, 16(R3)
	MOVV	R6, 24(R3)
	JAL	runtime·memequal(SB)
	MOVBU	32(R3), R4
	RET
eq:
	MOVV	$1, R4
	RET

View File

@@ -0,0 +1,118 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build mips64 || mips64le
#include "go_asm.h"
#include "textflag.h"
#define REGCTXT R22

// memequal reports whether the size bytes at a and b are equal.
// Uses 16-byte chunks when both pointers are 8-byte aligned,
// else falls back to a byte loop.
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
	MOVV	a+0(FP), R1
	MOVV	b+8(FP), R2
	BEQ	R1, R2, eq	// identical pointers are trivially equal
	MOVV	size+16(FP), R3
	ADDV	R1, R3, R4	// R4 = end of a

	// chunk size is 16
	SGTU	$16, R3, R8
	BEQ	R0, R8, chunk_entry

byte_loop:
	BNE	R1, R4, byte_test
	MOVV	$1, R1	// reached the end: equal
	MOVB	R1, ret+24(FP)
	RET
byte_test:
	MOVBU	(R1), R6
	ADDV	$1, R1
	MOVBU	(R2), R7
	ADDV	$1, R2
	BEQ	R6, R7, byte_loop
	JMP	not_eq

chunk_entry:
	// make sure both a and b are aligned
	OR	R1, R2, R9
	AND	$0x7, R9
	BNE	R0, R9, byte_loop
	JMP	chunk_loop_1

chunk_loop:
	// chunk size is 16
	SGTU	$16, R3, R8
	BNE	R0, R8, chunk_tail_8
chunk_loop_1:
	MOVV	(R1), R6
	MOVV	(R2), R7
	BNE	R6, R7, not_eq
	MOVV	8(R1), R12
	MOVV	8(R2), R13
	ADDV	$16, R1
	ADDV	$16, R2
	SUBV	$16, R3
	BEQ	R12, R13, chunk_loop
	JMP	not_eq

chunk_tail_8:
	AND	$8, R3, R14
	BEQ	R0, R14, chunk_tail_4
	MOVV	(R1), R6
	MOVV	(R2), R7
	BNE	R6, R7, not_eq
	ADDV	$8, R1
	ADDV	$8, R2

chunk_tail_4:
	AND	$4, R3, R14
	BEQ	R0, R14, chunk_tail_2
	MOVWU	(R1), R6
	MOVWU	(R2), R7
	BNE	R6, R7, not_eq
	ADDV	$4, R1
	ADDV	$4, R2

chunk_tail_2:
	AND	$2, R3, R14
	BEQ	R0, R14, chunk_tail_1
	MOVHU	(R1), R6
	MOVHU	(R2), R7
	BNE	R6, R7, not_eq
	ADDV	$2, R1
	ADDV	$2, R2

chunk_tail_1:
	AND	$1, R3, R14
	BEQ	R0, R14, eq
	MOVBU	(R1), R6
	MOVBU	(R2), R7
	BEQ	R6, R7, eq

not_eq:
	MOVB	R0, ret+24(FP)
	RET
eq:
	MOVV	$1, R1
	MOVB	R1, ret+24(FP)
	RET

// memequal_varlen reads the size from the closure and calls
// memequal through the stack (R29 is SP).
// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
	MOVV	a+0(FP), R1
	MOVV	b+8(FP), R2
	BEQ	R1, R2, eq
	MOVV	8(REGCTXT), R3	// compiler stores size at offset 8 in the closure
	MOVV	R1, 8(R29)
	MOVV	R2, 16(R29)
	MOVV	R3, 24(R29)
	JAL	runtime·memequal(SB)
	MOVBU	32(R29), R1
	MOVB	R1, ret+16(FP)
	RET
eq:
	MOVV	$1, R1
	MOVB	R1, ret+16(FP)
	RET

View File

@@ -0,0 +1,62 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build mips || mipsle
#include "go_asm.h"
#include "textflag.h"
#define REGCTXT R22

// memequal reports whether the size bytes at a and b are equal,
// using a byte-at-a-time loop.
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT,$0-13
	MOVW	a+0(FP), R1
	MOVW	b+4(FP), R2
	BEQ	R1, R2, eq	// identical pointers are trivially equal
	MOVW	size+8(FP), R3
	ADDU	R1, R3, R4	// R4 = end of a
loop:
	BNE	R1, R4, test
	MOVW	$1, R1	// reached the end: equal
	MOVB	R1, ret+12(FP)
	RET
test:
	MOVBU	(R1), R6
	ADDU	$1, R1
	MOVBU	(R2), R7
	ADDU	$1, R2
	BEQ	R6, R7, loop

	MOVB	R0, ret+12(FP)	// mismatch: return false
	RET
eq:
	MOVW	$1, R1
	MOVB	R1, ret+12(FP)
	RET

// memequal_varlen is like memequal but reads the size from the closure.
// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
	MOVW	a+0(FP), R1
	MOVW	b+4(FP), R2
	BEQ	R1, R2, eq
	MOVW	4(REGCTXT), R3	// compiler stores size at offset 4 in the closure
	ADDU	R1, R3, R4	// R4 = end of a
loop:
	BNE	R1, R4, test
	MOVW	$1, R1
	MOVB	R1, ret+8(FP)
	RET
test:
	MOVBU	(R1), R6
	ADDU	$1, R1
	MOVBU	(R2), R7
	ADDU	$1, R2
	BEQ	R6, R7, loop

	MOVB	R0, ret+8(FP)
	RET
eq:
	MOVW	$1, R1
	MOVB	R1, ret+8(FP)
	RET

View File

@@ -0,0 +1,21 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
import "unsafe"
// The declarations below generate ABI wrappers for functions
// implemented in assembly in this package but declared in another
// package.

// The compiler generates calls to runtime.memequal and runtime.memequal_varlen.
// In addition, the runtime calls runtime.memequal explicitly.
// Those functions are implemented in this package.

// abigen_runtime_memequal is the linkname-generated wrapper for
// runtime.memequal (implemented in assembly in this package).
//
//go:linkname abigen_runtime_memequal runtime.memequal
func abigen_runtime_memequal(a, b unsafe.Pointer, size uintptr) bool

// abigen_runtime_memequal_varlen is the linkname-generated wrapper for
// runtime.memequal_varlen (implemented in assembly in this package).
//
//go:linkname abigen_runtime_memequal_varlen runtime.memequal_varlen
func abigen_runtime_memequal_varlen(a, b unsafe.Pointer) bool

View File

@@ -0,0 +1,207 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
#include "go_asm.h"
#include "textflag.h"
// 4K (smallest case) page size offset mask for PPC64.
#define PAGE_OFFSET 4095

// Likewise, the BC opcode is hard to read, and no extended
// mnemonics are offered for these forms.
#define BGELR_CR6 BC 4, CR6LT, (LR)
#define BEQLR BC 12, CR0EQ, (LR)

// memequal reports whether the size bytes at a and b are equal.
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
	// R3 = a
	// R4 = b
	// R5 = size
	BR	memeqbody<>(SB)

// memequal_varlen is like memequal but reads the size from the closure.
// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
	// R3 = a
	// R4 = b
	CMP	R3, R4
	BEQ	eq
	MOVD	8(R11), R5	// compiler stores size at offset 8 in the closure
	BR	memeqbody<>(SB)
eq:
	MOVD	$1, R3
	RET

// Do an efficient memequal for ppc64
// R3 = s1
// R4 = s2
// R5 = len
// On exit:
// R3 = return value
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3, R8		// Move s1 into R8
	ADD	R5, R3, R9	// &s1[len(s1)]
	ADD	R5, R4, R10	// &s2[len(s2)]
	MOVD	$1, R11
	CMP	R5, $16		// Use GPR checks for check for len <= 16
	BLE	check0_16
	MOVD	$0, R3		// Assume no-match in case BGELR CR6 returns
	CMP	R5, $32		// Use overlapping VSX loads for len <= 32
	BLE	check17_32	// Do a pair of overlapping VSR compares
	CMP	R5, $64
	BLE	check33_64	// Hybrid check + overlap compare.

setup64:
	SRD	$6, R5, R6	// number of 64 byte chunks to compare
	MOVD	R6, CTR
	MOVD	$16, R14	// index for VSX loads and stores
	MOVD	$32, R15
	MOVD	$48, R16
	ANDCC	$0x3F, R5, R5	// len%64==0?

	PCALIGN	$16
loop64:
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2	// compare, setting CR6
	BGELR_CR6			// mismatch: return with R3 == 0
	LXVD2X	(R8+R14), V0
	LXVD2X	(R4+R14), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R15), V0
	LXVD2X	(R4+R15), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R16), V0
	LXVD2X	(R4+R16), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	ADD	$64,R8		// bump up to next 64
	ADD	$64,R4
	BDNZ	loop64

	ISEL	CR0EQ, R11, R3, R3	// If no tail, return 1, otherwise R3 remains 0.
	BEQLR				// return if no tail.

	// Compare the final (overlapping) 64 bytes ending at each buffer's end.
	ADD	$-64, R9, R8
	ADD	$-64, R10, R4
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R14), V0
	LXVD2X	(R4+R14), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R15), V0
	LXVD2X	(R4+R15), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R16), V0
	LXVD2X	(R4+R16), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	CR6LT, R11, R0, R3
	RET

check33_64:
	// Bytes 0-15
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	ADD	$16, R8
	ADD	$16, R4

	// Bytes 16-31
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6

	// A little tricky, but point R4,R8 to &sx[len-32],
	// and reuse check17_32 to check the next 1-31 bytes (with some overlap)
	ADD	$-32, R9, R8
	ADD	$-32, R10, R4
	// Fallthrough

check17_32:
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	CR6LT, R11, R0, R5

	// Load sX[len(sX)-16:len(sX)] and compare.
	ADD	$-16, R9
	ADD	$-16, R10
	LXVD2X	(R9+R0), V0
	LXVD2X	(R10+R0), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	CR6LT, R5, R0, R3	// result = both halves matched
	RET

check0_16:
#ifdef GOPPC64_power10
	SLD	$56, R5, R7
	LXVL	R8, R7, V0	// load-with-length: reads exactly R5 bytes
	LXVL	R4, R7, V1
	VCMPEQUDCC	V0, V1, V2
	ISEL	CR6LT, R11, R0, R3
	RET
#else
	CMP	R5, $8
	BLT	check0_7
	// Load sX[0:7] and compare.
	MOVD	(R8), R6
	MOVD	(R4), R7
	CMP	R6, R7
	ISEL	CR0EQ, R11, R0, R5
	// Load sX[len(sX)-8:len(sX)] and compare.
	MOVD	-8(R9), R6
	MOVD	-8(R10), R7
	CMP	R6, R7
	ISEL	CR0EQ, R5, R0, R3
	RET

check0_7:
	CMP	R5,$0
	MOVD	$1, R3
	BEQLR		// return if len == 0

	// Check < 8B loads with a single compare, but select the load address
	// such that it cannot cross a page boundary. Load a few bytes from the
	// lower address if that does not cross the lower page. Or, load a few
	// extra bytes from the higher addresses. And align those values
	// consistently in register as either address may have differing
	// alignment requirements.
	ANDCC	$PAGE_OFFSET, R8, R6	// &sX & PAGE_OFFSET
	ANDCC	$PAGE_OFFSET, R4, R9
	SUBC	R5, $8, R12		// 8-len
	SLD	$3, R12, R14		// (8-len)*8
	CMPU	R6, R12, CR1		// Enough bytes lower in the page to load lower?
	CMPU	R9, R12, CR0
	SUB	R12, R8, R6		// compute lower load address
	SUB	R12, R4, R9
	ISEL	CR1LT, R8, R6, R8	// R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
	ISEL	CR0LT, R4, R9, R4	// Similar for s2
	MOVD	(R8), R15
	MOVD	(R4), R16
	SLD	R14, R15, R7
	SLD	R14, R16, R17
	SRD	R14, R7, R7		// Clear the upper (8-len) bytes (with 2 shifts)
	SRD	R14, R17, R17
	SRD	R14, R15, R6		// Clear the lower (8-len) bytes
	SRD	R14, R16, R9
#ifdef GOARCH_ppc64le
	ISEL	CR1LT, R7, R6, R8	// Choose the correct len bytes to compare based on alignment
	ISEL	CR0LT, R17, R9, R4
#else
	ISEL	CR1LT, R6, R7, R8
	ISEL	CR0LT, R9, R17, R4
#endif
	CMP	R4, R8
	ISEL	CR0EQ, R11, R0, R3
	RET
#endif	// tail processing if !defined(GOPPC64_power10)

View File

@@ -0,0 +1,126 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
#define CTXT S10
// func memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
// X10 = a_base
// X11 = b_base
// X12 = size
JMP memequal<>(SB)
// func memequal_varlen(a, b unsafe.Pointer) bool
// Closure-called variant of memequal: the compare length is not an
// argument but is read from the calling closure.
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
MOV 8(CTXT), X12 // compiler stores size at offset 8 in the closure
// X10 = a_base
// X11 = b_base
JMP memequal<>(SB)
// On entry X10 and X11 contain pointers, X12 contains length.
// For non-regabi X13 contains address for return value.
// For regabi return value in X10.
//
// Strategy: pointers that are equal or lengths < 32 go straight to the
// small loops. Otherwise, if both pointers share the same 8-byte
// misalignment, compare byte-wise up to an 8-byte boundary, then run
// 32-byte and 16-byte word-compare loops, finishing with 4-byte and
// 1-byte loops for the remainder.
TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0
BEQ X10, X11, eq
MOV $32, X23
BLT X12, X23, loop4_check
// Check alignment - if alignment differs we have to do one byte at a time.
AND $7, X10, X9
AND $7, X11, X19
BNE X9, X19, loop4_check
BEQZ X9, loop32_check
// Check one byte at a time until we reach 8 byte alignment.
SUB X9, X0, X9 // X9 = -(a & 7); X0 is the zero register
ADD $8, X9, X9 // X9 = 8 - (a & 7): bytes needed to reach alignment
SUB X9, X12, X12 // account for the alignment bytes in the length
align:
SUB $1, X9
MOVBU 0(X10), X19
MOVBU 0(X11), X20
BNE X19, X20, not_eq
ADD $1, X10
ADD $1, X11
BNEZ X9, align
loop32_check:
MOV $32, X9
BLT X12, X9, loop16_check
loop32:
MOV 0(X10), X19
MOV 0(X11), X20
MOV 8(X10), X21
MOV 8(X11), X22
BNE X19, X20, not_eq
BNE X21, X22, not_eq
MOV 16(X10), X14
MOV 16(X11), X15
MOV 24(X10), X16
MOV 24(X11), X17
BNE X14, X15, not_eq
BNE X16, X17, not_eq
ADD $32, X10
ADD $32, X11
SUB $32, X12
BGE X12, X9, loop32
BEQZ X12, eq
loop16_check:
MOV $16, X23
BLT X12, X23, loop4_check
loop16:
MOV 0(X10), X19
MOV 0(X11), X20
MOV 8(X10), X21
MOV 8(X11), X22
BNE X19, X20, not_eq
BNE X21, X22, not_eq
ADD $16, X10
ADD $16, X11
SUB $16, X12
BGE X12, X23, loop16
BEQZ X12, eq
loop4_check:
MOV $4, X23
BLT X12, X23, loop1
loop4:
MOVBU 0(X10), X19
MOVBU 0(X11), X20
MOVBU 1(X10), X21
MOVBU 1(X11), X22
BNE X19, X20, not_eq
BNE X21, X22, not_eq
MOVBU 2(X10), X14
MOVBU 2(X11), X15
MOVBU 3(X10), X16
MOVBU 3(X11), X17
BNE X14, X15, not_eq
BNE X16, X17, not_eq
ADD $4, X10
ADD $4, X11
SUB $4, X12
BGE X12, X23, loop4
loop1:
BEQZ X12, eq
MOVBU 0(X10), X19
MOVBU 0(X11), X20
BNE X19, X20, not_eq
ADD $1, X10
ADD $1, X11
SUB $1, X12
JMP loop1
not_eq:
MOVB ZERO, X10 // return false
RET
eq:
MOV $1, X10 // return true
RET

View File

@@ -0,0 +1,92 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// memequal(a, b unsafe.Pointer, size uintptr) bool
// Stack-based ABI: loads arguments from the frame and passes the
// address of the result byte to the shared body in R7.
TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
MOVD a+0(FP), R3
MOVD b+8(FP), R5
MOVD size+16(FP), R6
LA ret+24(FP), R7
BR memeqbody<>(SB)
// memequal_varlen(a, b unsafe.Pointer) bool
// Closure-called variant: the compare length comes from the closure
// context (R12 is the closure/context register on s390x).
TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
MOVD a+0(FP), R3
MOVD b+8(FP), R5
MOVD 8(R12), R6 // compiler stores size at offset 8 in the closure
LA ret+16(FP), R7
BR memeqbody<>(SB)
// input:
// R3 = a
// R5 = b
// R6 = len
// R7 = address of output byte (stores 0 or 1 here)
// a and b have the same length
//
// Large inputs are compared 256 bytes at a time with CLC. A tail of
// 32..255 bytes is compared with a single length-patched CLC executed
// via EXRL. Inputs under 32 bytes use plain register compares, with
// R2 tracking the current byte offset.
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
CMPBEQ R3, R5, equal // same pointer: trivially equal
loop:
CMPBEQ R6, $0, equal
CMPBLT R6, $32, tiny
CMP R6, $256
BLT tail
CLC $256, 0(R3), 0(R5) // compare a 256-byte chunk in one instruction
BNE notequal
SUB $256, R6
LA 256(R3), R3
LA 256(R5), R5
BR loop
tail:
SUB $1, R6, R8 // CLC length field encodes n-1 bytes
EXRL $memeqbodyclc<>(SB), R8 // execute the CLC template with length R8
BEQ equal
notequal:
MOVB $0, 0(R7)
RET
equal:
MOVB $1, 0(R7)
RET
tiny:
MOVD $0, R2 // R2 = running byte offset into a and b
CMPBLT R6, $16, lt16
MOVD 0(R3), R8
MOVD 0(R5), R9
CMPBNE R8, R9, notequal
MOVD 8(R3), R8
MOVD 8(R5), R9
CMPBNE R8, R9, notequal
LA 16(R2), R2
SUB $16, R6
lt16:
CMPBLT R6, $8, lt8
MOVD 0(R3)(R2*1), R8
MOVD 0(R5)(R2*1), R9
CMPBNE R8, R9, notequal
LA 8(R2), R2
SUB $8, R6
lt8:
CMPBLT R6, $4, lt4
MOVWZ 0(R3)(R2*1), R8
MOVWZ 0(R5)(R2*1), R9
CMPBNE R8, R9, notequal
LA 4(R2), R2
SUB $4, R6
lt4:
// Compare the final 0..3 bytes one at a time, exiting as soon as
// the remaining length is exhausted.
#define CHECK(n) \
CMPBEQ R6, $n, equal \
MOVB n(R3)(R2*1), R8 \
MOVB n(R5)(R2*1), R9 \
CMPBNE R8, R9, notequal
CHECK(0)
CHECK(1)
CHECK(2)
CHECK(3)
BR equal
// CLC template for EXRL in memeqbody<>: the executing EXRL replaces
// the length field, so the $1 here is a placeholder, never used as-is.
TEXT memeqbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
CLC $1, 0(R3), 0(R5)
RET

View File

@@ -0,0 +1,77 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// memequal(p, q unsafe.Pointer, size uintptr) bool
// Pushes the three arguments for memeqbody and stores its i64 result
// (0 or 1) into the result slot as a single byte.
TEXT runtime·memequal(SB), NOSPLIT, $0-25
Get SP
I64Load a+0(FP)
I64Load b+8(FP)
I64Load size+16(FP)
Call memeqbody<>(SB)
I64Store8 ret+24(FP)
RET
// memequal_varlen(a, b unsafe.Pointer) bool
// Closure-called variant: the compare length is read from the closure
// context rather than passed as an argument.
TEXT runtime·memequal_varlen(SB), NOSPLIT, $0-17
Get SP
I64Load a+0(FP)
I64Load b+8(FP)
I64Load 8(CTXT) // compiler stores size at offset 8 in the closure
Call memeqbody<>(SB)
I64Store8 ret+16(FP)
RET
// params: a, b, len
// ret: 0/1
//
// R0 = a, R1 = b, R2 = remaining length. Wasm has no vector compare
// here, so this is a straightforward byte-at-a-time loop: equal
// pointers short-circuit to 1, the first differing byte returns 0,
// and exhausting the length returns 1.
TEXT memeqbody<>(SB), NOSPLIT, $0-0
Get R0
Get R1
I64Eq
If
I64Const $1
Return
End
loop:
Loop
Get R2
I64Eqz // all bytes consumed without a mismatch
If
I64Const $1
Return
End
Get R0
I32WrapI64
I64Load8U $0
Get R1
I32WrapI64
I64Load8U $0
I64Ne
If
I64Const $0
Return
End
Get R0
I64Const $1
I64Add
Set R0
Get R1
I64Const $1
I64Add
Set R1
Get R2
I64Const $1
I64Sub
Set R2
Br loop
End
UNDEF // unreachable: the loop always returns

View File

@@ -0,0 +1,26 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
import "internal/cpu"
const MaxBruteForce = 64
func init() {
if cpu.X86.HasAVX2 {
MaxLen = 63
} else {
MaxLen = 31
}
}
// Cutover reports the number of failures of IndexByte we should tolerate
// before switching over to Index.
// n is the number of bytes processed so far.
// See the bytes.Index implementation for details.
func Cutover(n int) int {
	// Tolerate one failure per 8 bytes scanned, plus two failures of
	// slop to start with: (n+16)/8 == n/8 + 2 for n >= 0.
	const slop = 16
	budget := n + slop
	return budget / 8
}

View File

@@ -0,0 +1,278 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func Index(a, b []byte) int
// Loads the slice headers into the registers indexbody expects and
// tail-calls the shared body. R10 keeps the original text start so the
// body can convert a match pointer back into an index; R11 is where
// the result is stored.
TEXT ·Index(SB),NOSPLIT,$0-56
MOVQ a_base+0(FP), DI
MOVQ a_len+8(FP), DX
MOVQ b_base+24(FP), R8
MOVQ b_len+32(FP), AX
MOVQ DI, R10
LEAQ ret+48(FP), R11
JMP indexbody<>(SB)
// func IndexString(a, b string) int
// Same register setup as ·Index, adjusted for string headers (no cap
// word, so b starts at offset 16 instead of 24).
TEXT ·IndexString(SB),NOSPLIT,$0-40
MOVQ a_base+0(FP), DI
MOVQ a_len+8(FP), DX
MOVQ b_base+16(FP), R8
MOVQ b_len+24(FP), AX
MOVQ DI, R10
LEAQ ret+32(FP), R11
JMP indexbody<>(SB)
// AX: length of string, that we are searching for
// DX: length of string, in which we are searching
// DI: pointer to string, in which we are searching
// R8: pointer to string, that we are searching for
// R11: address, where to put return value
// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
//
// Strategy: dispatch on len(sep). Each size class preloads the
// separator into registers (or XMM/YMM) once, then slides a window
// over the text one byte at a time; sizes that are not a power of two
// use two overlapping loads (prefix + suffix). Text of >= 16 bytes
// with sep of >= 12 bytes instead uses PCMPESTRI, advancing
// 16-len(sep) bytes per compare. The result stored at (R11) is the
// match offset, or -1.
TEXT indexbody<>(SB),NOSPLIT,$0
CMPQ AX, DX
JA fail // separator longer than the text: cannot match
CMPQ DX, $16
JAE sse42 // text >= 16 bytes: consider the PCMPESTRI path
no_sse42:
CMPQ AX, $2
JA _3_or_more
MOVW (R8), R8 // R8 = the 2-byte separator value
LEAQ -1(DI)(DX*1), DX // DX = one past the last valid start address
PCALIGN $16
loop2:
MOVW (DI), SI
CMPW SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop2
JMP fail
_3_or_more:
CMPQ AX, $3
JA _4_or_more
MOVW 1(R8), BX // BX = last 2 bytes of sep (overlaps first 2)
MOVW (R8), R8
LEAQ -2(DI)(DX*1), DX
loop3:
MOVW (DI), SI
CMPW SI,R8
JZ partial_success3
ADDQ $1,DI
CMPQ DI,DX
JB loop3
JMP fail
partial_success3:
MOVW 1(DI), SI
CMPW SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop3
JMP fail
_4_or_more:
CMPQ AX, $4
JA _5_or_more
MOVL (R8), R8
LEAQ -3(DI)(DX*1), DX
loop4:
MOVL (DI), SI
CMPL SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop4
JMP fail
_5_or_more:
CMPQ AX, $7
JA _8_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVL -4(R8)(AX*1), BX // BX = last 4 bytes of sep (overlapping load)
MOVL (R8), R8
loop5to7:
MOVL (DI), SI
CMPL SI,R8
JZ partial_success5to7
ADDQ $1,DI
CMPQ DI,DX
JB loop5to7
JMP fail
partial_success5to7:
MOVL -4(AX)(DI*1), SI
CMPL SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop5to7
JMP fail
_8_or_more:
CMPQ AX, $8
JA _9_or_more
MOVQ (R8), R8
LEAQ -7(DI)(DX*1), DX
loop8:
MOVQ (DI), SI
CMPQ SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop8
JMP fail
_9_or_more:
CMPQ AX, $15
JA _16_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVQ -8(R8)(AX*1), BX // BX = last 8 bytes of sep (overlapping load)
MOVQ (R8), R8
loop9to15:
MOVQ (DI), SI
CMPQ SI,R8
JZ partial_success9to15
ADDQ $1,DI
CMPQ DI,DX
JB loop9to15
JMP fail
partial_success9to15:
MOVQ -8(AX)(DI*1), SI
CMPQ SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop9to15
JMP fail
_16_or_more:
CMPQ AX, $16
JA _17_or_more
MOVOU (R8), X1
LEAQ -15(DI)(DX*1), DX
loop16:
MOVOU (DI), X2
PCMPEQB X1, X2
PMOVMSKB X2, SI
CMPQ SI, $0xffff // all 16 byte lanes equal?
JE success
ADDQ $1,DI
CMPQ DI,DX
JB loop16
JMP fail
_17_or_more:
CMPQ AX, $31
JA _32_or_more
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVOU -16(R8)(AX*1), X0 // X0 = last 16 bytes of sep (overlapping load)
MOVOU (R8), X1
loop17to31:
MOVOU (DI), X2
PCMPEQB X1,X2
PMOVMSKB X2, SI
CMPQ SI, $0xffff
JE partial_success17to31
ADDQ $1,DI
CMPQ DI,DX
JB loop17to31
JMP fail
partial_success17to31:
MOVOU -16(AX)(DI*1), X3
PCMPEQB X0, X3
PMOVMSKB X3, SI
CMPQ SI, $0xffff
JE success
ADDQ $1,DI
CMPQ DI,DX
JB loop17to31
JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
CMPQ AX, $32
JA _33_to_63
VMOVDQU (R8), Y1
LEAQ -31(DI)(DX*1), DX
loop32:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, SI
CMPL SI, $0xffffffff // all 32 byte lanes equal?
JE success_avx2
ADDQ $1,DI
CMPQ DI,DX
JB loop32
JMP fail_avx2
_33_to_63:
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
VMOVDQU -32(R8)(AX*1), Y0 // Y0 = last 32 bytes of sep (overlapping load)
VMOVDQU (R8), Y1
loop33to63:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, SI
CMPL SI, $0xffffffff
JE partial_success33to63
ADDQ $1,DI
CMPQ DI,DX
JB loop33to63
JMP fail_avx2
partial_success33to63:
VMOVDQU -32(AX)(DI*1), Y3
VPCMPEQB Y0, Y3, Y4
VPMOVMSKB Y4, SI
CMPL SI, $0xffffffff
JE success_avx2
ADDQ $1,DI
CMPQ DI,DX
JB loop33to63
fail_avx2:
VZEROUPPER // clear upper YMM state before returning to SSE code
fail:
MOVQ $-1, (R11)
RET
success_avx2:
VZEROUPPER
JMP success
sse42:
#ifndef hasSSE42
CMPB internalcpu·X86+const_offsetX86HasSSE42(SB), $1
JNE no_sse42
#endif
CMPQ AX, $12
// PCMPESTRI is slower than normal compare,
// so using it makes sense only if we advance 4+ bytes per compare
// This value was determined experimentally and is the ~same
// on Nehalem (first with SSE42) and Haswell.
JAE _9_or_more
LEAQ 16(R8), SI
TESTW $0xff0, SI
// If R8+16 lands within the first 16 bytes of a 4K page, the 16-byte
// load of sep below could cross into an unmapped page; take the
// scalar path instead.
JEQ no_sse42
MOVOU (R8), X1
LEAQ -15(DI)(DX*1), SI
MOVQ $16, R9
SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
PCALIGN $16
loop_sse42:
// 0x0c means: unsigned byte compare (bits 0,1 are 00)
// for equality (bits 2,3 are 11)
// result is not masked or inverted (bits 4,5 are 00)
// and corresponds to first matching byte (bit 6 is 0)
PCMPESTRI $0x0c, (DI), X1
// CX == 16 means no match,
// CX > R9 means partial match at the end of the string,
// otherwise sep is at offset CX from X1 start
CMPQ CX, R9
JBE sse42_success
ADDQ R9, DI
CMPQ DI, SI
JB loop_sse42
PCMPESTRI $0x0c, -1(SI), X1 // final compare on the last full window
CMPQ CX, R9
JA fail
LEAQ -1(SI), DI
sse42_success:
ADDQ CX, DI
success:
SUBQ R10, DI // convert match pointer to index from text start
MOVQ DI, (R11)
RET
View File

@@ -0,0 +1,23 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
// Empirical data shows that using Index can get better
// performance when len(s) <= 16.
const MaxBruteForce = 16
// init caps MaxLen at 32 bytes — the longest separator handled by the
// arm64 assembly Index implementation.
func init() {
	const maxSepLen = 32
	MaxLen = maxSepLen
}
// Cutover reports the number of failures of IndexByte we should tolerate
// before switching over to Index.
// n is the number of bytes processed so far.
// See the bytes.Index implementation for details.
func Cutover(n int) int {
	// Tolerate one failure per 16 bytes scanned, starting with a
	// slop of 4.
	failures := n >> 4
	return failures + 4
}

View File

@@ -0,0 +1,206 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func Index(a, b []byte) int
// Loads the slice headers into the registers indexbody expects and
// branches to the shared body; R9 holds the result address.
TEXT ·Index(SB),NOSPLIT,$0-56
MOVD a_base+0(FP), R0
MOVD a_len+8(FP), R1
MOVD b_base+24(FP), R2
MOVD b_len+32(FP), R3
MOVD $ret+48(FP), R9
B indexbody<>(SB)
// func IndexString(a, b string) int
// Same as ·Index but with string headers (no cap word, so b starts at
// offset 16 instead of 24).
TEXT ·IndexString(SB),NOSPLIT,$0-40
MOVD a_base+0(FP), R0
MOVD a_len+8(FP), R1
MOVD b_base+16(FP), R2
MOVD b_len+24(FP), R3
MOVD $ret+32(FP), R9
B indexbody<>(SB)
// input:
// R0: haystack
// R1: length of haystack
// R2: needle
// R3: length of needle (2 <= len <= 32)
// R9: address to put result
//
// The needle is preloaded into registers once per size class; the
// loops then advance R0 one byte at a time with post-increment loads
// and compare against the preloaded pieces. R4 holds the last valid
// start address; moving past it means not found.
TEXT indexbody<>(SB),NOSPLIT,$0-56
// main idea is to load 'sep' into separate register(s)
// to avoid repeatedly re-load it again and again
// for subsequent substring comparisons
SUB R3, R1, R4
// R4 contains the start of last substring for comparison
ADD R0, R4, R4
ADD $1, R0, R8 // R8 = &s[0]+1, used to compute the index in found
CMP $8, R3
BHI greater_8
TBZ $3, R3, len_2_7 // dispatch on the bits of len(sep)
len_8:
// R5 contains 8-byte of sep
MOVD (R2), R5
loop_8:
// R6 contains substring for comparison
CMP R4, R0
BHI not_found
MOVD.P 1(R0), R6
CMP R5, R6
BNE loop_8
B found
len_2_7:
TBZ $2, R3, len_2_3
TBZ $1, R3, len_4_5
TBZ $0, R3, len_6
len_7:
// R5 and R6 contain 7-byte of sep
MOVWU (R2), R5
// 1-byte overlap with R5
MOVWU 3(R2), R6
loop_7:
CMP R4, R0
BHI not_found
MOVWU.P 1(R0), R3
CMP R5, R3
BNE loop_7
MOVWU 2(R0), R3
CMP R6, R3
BNE loop_7
B found
len_6:
// R5 and R6 contain 6-byte of sep
MOVWU (R2), R5
MOVHU 4(R2), R6
loop_6:
CMP R4, R0
BHI not_found
MOVWU.P 1(R0), R3
CMP R5, R3
BNE loop_6
MOVHU 3(R0), R3
CMP R6, R3
BNE loop_6
B found
len_4_5:
TBZ $0, R3, len_4
len_5:
// R5 and R7 contain 5-byte of sep
MOVWU (R2), R5
MOVBU 4(R2), R7
loop_5:
CMP R4, R0
BHI not_found
MOVWU.P 1(R0), R3
CMP R5, R3
BNE loop_5
MOVBU 3(R0), R3
CMP R7, R3
BNE loop_5
B found
len_4:
// R5 contains 4-byte of sep
MOVWU (R2), R5
loop_4:
CMP R4, R0
BHI not_found
MOVWU.P 1(R0), R6
CMP R5, R6
BNE loop_4
B found
len_2_3:
TBZ $0, R3, len_2
len_3:
// R6 and R7 contain 3-byte of sep
MOVHU (R2), R6
MOVBU 2(R2), R7
loop_3:
CMP R4, R0
BHI not_found
MOVHU.P 1(R0), R3
CMP R6, R3
BNE loop_3
MOVBU 1(R0), R3
CMP R7, R3
BNE loop_3
B found
len_2:
// R5 contains 2-byte of sep
MOVHU (R2), R5
loop_2:
CMP R4, R0
BHI not_found
MOVHU.P 1(R0), R6
CMP R5, R6
BNE loop_2
found:
// R0 was post-incremented past the match start and R8 = &s[0]+1,
// so R0-R8 is the match index.
SUB R8, R0, R0
MOVD R0, (R9)
RET
not_found:
MOVD $-1, R0
MOVD R0, (R9)
RET
greater_8:
SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes
CMP $16, R3
BHI greater_16
len_9_16:
MOVD.P 8(R2), R5 // R5 contains the first 8-byte of sep
SUB $16, R3, R7 // len(sep) - 16, offset of R2 for last 8 bytes
MOVD (R2)(R7), R6 // R6 contains the last 8-byte of sep
loop_9_16:
// search the first 8 bytes first
CMP R4, R0
BHI not_found
MOVD.P 1(R0), R7
CMP R5, R7
BNE loop_9_16
MOVD (R0)(R11), R7
CMP R6, R7 // compare the last 8 bytes
BNE loop_9_16
B found
greater_16:
CMP $24, R3
BHI len_25_32
len_17_24:
LDP.P 16(R2), (R5, R6) // R5 and R6 contain the first 16-byte of sep
SUB $24, R3, R10 // len(sep) - 24
MOVD (R2)(R10), R7 // R7 contains the last 8-byte of sep
loop_17_24:
// search the first 16 bytes first
CMP R4, R0
BHI not_found
MOVD.P 1(R0), R10
CMP R5, R10
BNE loop_17_24
MOVD 7(R0), R10
CMP R6, R10
BNE loop_17_24
MOVD (R0)(R11), R10
CMP R7, R10 // compare the last 8 bytes
BNE loop_17_24
B found
len_25_32:
LDP.P 16(R2), (R5, R6)
MOVD.P 8(R2), R7 // R5, R6 and R7 contain the first 24-byte of sep
SUB $32, R3, R12 // len(sep) - 32
MOVD (R2)(R12), R10 // R10 contains the last 8-byte of sep
loop_25_32:
// search the first 24 bytes first
CMP R4, R0
BHI not_found
MOVD.P 1(R0), R12
CMP R5, R12
BNE loop_25_32
MOVD 7(R0), R12
CMP R6, R12
BNE loop_25_32
MOVD 15(R0), R12
CMP R7, R12
BNE loop_25_32
MOVD (R0)(R11), R12
CMP R10, R12 // compare the last 8 bytes
BNE loop_25_32
B found

View File

@@ -0,0 +1,29 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64
package bytealg
const MaxBruteForce = 0
// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
// Requires 2 <= len(b) <= MaxLen.
//
// NOTE(review): this build has no assembly Index and no init raising
// MaxLen, so callers gated on MaxLen are presumably never routed here
// — confirm against bytes/strings before relying on the panic.
func Index(a, b []byte) int {
panic("unimplemented")
}
// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
// Requires 2 <= len(b) <= MaxLen.
//
// NOTE(review): like Index above, presumably unreachable on these
// platforms because MaxLen is never raised here — confirm at callers.
func IndexString(a, b string) int {
panic("unimplemented")
}
// Cutover reports the number of failures of IndexByte we should tolerate
// before switching over to Index.
// n is the number of bytes processed so far.
// See the bytes.Index implementation for details.
//
// NOTE(review): only meaningful when Index is implemented; on these
// platforms it panics like Index/IndexString above.
func Cutover(n int) int {
panic("unimplemented")
}

View File

@@ -0,0 +1,19 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 || arm64 || s390x || ppc64le || ppc64
package bytealg
// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
// Requires 2 <= len(b) <= MaxLen.
//
// Bodyless declaration: implemented in per-architecture assembly for
// the platforms listed in this file's build constraint.
//
//go:noescape
func Index(a, b []byte) int
// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
// Requires 2 <= len(b) <= MaxLen.
//
// Bodyless declaration: implemented in per-architecture assembly for
// the platforms listed in this file's build constraint.
//
//go:noescape
func IndexString(a, b string) int

View File

@@ -0,0 +1,26 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
package bytealg
import "internal/cpu"
const MaxBruteForce = 16
var SupportsPower9 = cpu.PPC64.IsPOWER9
// init caps MaxLen at 32 bytes — the longest separator handled by the
// ppc64x assembly Index implementation.
func init() {
	const maxSepLen = 32
	MaxLen = maxSepLen
}
// Cutover reports the number of failures of IndexByte we should tolerate
// before switching over to Index.
// n is the number of bytes processed so far.
// See the bytes.Index implementation for details.
func Cutover(n int) int {
	// One tolerated failure per 8 bytes scanned, plus a couple of
	// failures of slop to start.
	const (
		slop           = 16
		bytesPerMiss   = 8
	)
	return (n + slop) / bytesPerMiss
}

View File

@@ -0,0 +1,841 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This is an implementation based on the s390x
// implementation.
// Find a separator with 2 <= len <= 32 within a string.
// Separators with lengths of 2, 3 or 4 are handled
// specially.
// This works on power8 and above. The loads and
// compares are done in big endian order
// since that allows the used of VCLZD, and allows
// the same implementation to work on big and little
// endian platforms with minimal conditional changes.
// NOTE: There is a power9 implementation that
// improves performance by 10-15% on little
// endian for some of the benchmarks.
// Unrolled index2to16 loop by 4 on ppc64le/power9
// Work is still needed for a big endian
// implementation on power9.
//go:build ppc64 || ppc64le
#include "go_asm.h"
#include "textflag.h"
// Needed to swap LXVD2X loads to the correct
// byte order to work on POWER8.
#ifdef GOARCH_ppc64
DATA byteswap<>+0(SB)/8, $0x0001020304050607
DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
#else
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
#endif
// Load bytes in big endian order. Address
// alignment does not need checking.
#define VLOADSWAP(base, index, vreg, vsreg) \
LXVD2X (base)(index), vsreg; \
VPERM vreg, vreg, SWAP, vreg
GLOBL byteswap<>+0(SB), RODATA, $16
// func Index(a, b []byte) int
// Shuffles the ABIInternal slice-header registers into the layout the
// shared bodies expect, then dispatches: on ppc64le with POWER9
// detected at run time, use the POWER9-tuned body; otherwise the
// POWER8 body.
TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
// R3 = byte array pointer
// R4 = length
MOVD R6, R5 // R5 = separator pointer
MOVD R7, R6 // R6 = separator length
#ifdef GOARCH_ppc64le
MOVBZ internalcpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
CMP R7, $1
BNE power8
BR indexbodyp9<>(SB)
#endif
power8:
BR indexbody<>(SB)
// func IndexString(a, b string) int
// String headers already arrive in the registers the shared bodies
// expect; only the POWER9/POWER8 dispatch is needed.
TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// R3 = string
// R4 = length
// R5 = separator pointer
// R6 = separator length
#ifdef GOARCH_ppc64le
MOVBZ internalcpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
CMP R7, $1
BNE power8
BR indexbodyp9<>(SB)
#endif
power8:
BR indexbody<>(SB)
// s: string we are searching
// sep: string to search for
// R3=&s[0], R4=len(s)
// R5=&sep[0], R6=len(sep)
// R14=&ret (index where sep found)
// R7=working addr of string
// R16=index value 16
// R17=index value 17
// R18=index value 18
// R19=index value 1
// R26=LASTBYTE of string
// R27=LASTSTR last start byte to compare with sep
// R8, R9 scratch
// V0=sep left justified zero fill
// CR4=sep length >= 16
#define SEPMASK V17
#define LASTBYTE R26
#define LASTSTR R27
#define ONES V20
#define SWAP V21
#define SWAP_ VS53
// POWER8 search body. The separator is loaded once into V0 (masked to
// its length via SEPMASK); separators of 2, 3 and 4 bytes get
// dedicated splat-and-compare loops, 5..16 bytes use a generic
// masked-vector compare, and 17..32 bytes compare two vectors per
// candidate position. Loads go through VLOADSWAP so the comparisons
// run in big-endian order on both byte orders.
TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
CMP R6, R4 // Compare lengths
BGT notfound // If sep len is > string, notfound
ADD R4, R3, LASTBYTE // find last byte addr
SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
CMP R6, $0 // Check sep len
BEQ notfound // sep len 0 -- not found
MOVD R3, R7 // Copy of string addr
MOVD $16, R16 // Index value 16
MOVD $17, R17 // Index value 17
MOVD $18, R18 // Index value 18
MOVD $1, R19 // Index value 1
MOVD $byteswap<>+00(SB), R8
VSPLTISB $0xFF, ONES // splat all 1s
LXVD2X (R8)(R0), SWAP_ // Set up swap string
CMP R6, $16, CR4 // CR4 for len(sep) >= 16
VOR ONES, ONES, SEPMASK // Set up full SEPMASK
BGE CR4, loadge16 // Load for len(sep) >= 16
SUB R6, R16, R9 // 16-len of sep
SLD $3, R9 // Set up for VSLO
MTVSRD R9, V9 // Set up for VSLO
VSLDOI $8, V9, V9, V9 // Set up for VSLO
VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
loadge16:
ANDCC $15, R5, R9 // Find byte offset of sep
ADD R9, R6, R10 // Add sep len
CMP R10, $16 // Check if sep len+offset > 16
BGT sepcross16 // Sep crosses 16 byte boundary
RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
VLOADSWAP(R8, R0, V0, V0) // Load 16 bytes @R8 into V0
SLD $3, R9 // Set up shift count for VSLO
MTVSRD R9, V8 // Set up shift count for VSLO
VSLDOI $8, V8, V8, V8
VSLO V0, V8, V0 // Shift by start byte
VAND V0, SEPMASK, V0 // Mask separator (< 16)
BR index2plus
sepcross16:
VLOADSWAP(R5, R0, V0, V0) // Load 16 bytes @R5 into V0
VAND V0, SEPMASK, V0 // mask out separator
BLE CR4, index2to16
BR index17plus // Handle sep > 16
index2plus:
CMP R6, $2 // Check length of sep
BNE index3plus // If not 2, check for 3
ADD $16, R7, R9 // Check if next 16 bytes past last
CMP R9, LASTBYTE // compare with last
BGE index2to16 // 2 <= len(string) <= 16
MOVD $0xff00, R21 // Mask for later
MTVSRD R21, V25 // Move to Vreg
VSPLTH $3, V25, V31 // Splat mask
VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep
VSPLTISB $0, V10 // Clear V10
// First case: 2 byte separator
// V1: 2 byte separator splatted
// V2: 16 bytes at addr
// V4: 16 bytes at addr+1
// Compare 2 byte separator at start
// and at start+1. Use VSEL to combine
// those results to find the first
// matching start byte, returning
// that value when found. Loop as
// long as len(string) > 16
index2loop2:
VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3
index2loop:
VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
VCMPEQUH V1, V2, V5 // Search for sep
VCMPEQUH V1, V3, V6 // Search for sep offset by 1
VSEL V6, V5, V31, V7 // merge even and odd indices
VCLZD V7, V18 // find index of first match
MFVSRD V18, R25 // get first value
CMP R25, $64 // Found if < 64
BLT foundR25 // Return byte index where found
VSLDOI $8, V18, V18, V18 // Adjust 2nd value
MFVSRD V18, R25 // get second value
CMP R25, $64 // Found if < 64
ADD $64, R25 // Update byte offset
BLT foundR25 // Return value
ADD $16, R7 // R7+=16 Update string pointer
ADD $17, R7, R9 // R9=F7+17 since loop unrolled
CMP R9, LASTBYTE // Compare addr+17 against last byte
BLT index2loop2 // If < last, continue loop
CMP R7, LASTBYTE // Compare addr+16 against last byte
BLT index2to16 // If < 16 handle specially
VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3
VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
BR index2loop
index3plus:
CMP R6, $3 // Check if sep == 3
BNE index4plus // If not check larger
ADD $19, R7, R9 // Find bytes for use in this loop
CMP R9, LASTBYTE // Compare against last byte
BGE index2to16 // Remaining string 2<=len<=16
MOVD $0xff00, R21 // Set up mask for upcoming loop
MTVSRD R21, V25 // Move mask to Vreg
VSPLTH $3, V25, V31 // Splat mask
VSPLTH $0, V0, V1 // Splat 1st two bytes of sep
VSPLTB $2, V0, V8 // Splat 3rd byte of sep
// Loop to process 3 byte separator.
// string[0:16] is in V2
// string[2:18] is in V3
// sep[0:2] splatted in V1
// sec[3] splatted in v8
// Load vectors at string, string+1
// and string+2. Compare string, string+1
// against first 2 bytes of separator
// splatted, and string+2 against 3rd
// byte splatted. Merge the results with
// VSEL to find the first byte of a match.
// Special handling for last 16 bytes if the
// string fits in 16 byte multiple.
index3loop2:
MOVD $2, R21 // Set up index for 2
VSPLTISB $0, V10 // Clear V10
VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3
VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
index3loop:
VLOADSWAP(R7, R0, V2, V2) // Load with correct order
VSLDOI $1, V2, V3, V4 // string[1:17]
VSLDOI $2, V2, V3, V9 // string[2:18]
VCMPEQUH V1, V2, V5 // compare hw even indices
VCMPEQUH V1, V4, V6 // compare hw odd indices
VCMPEQUB V8, V9, V10 // compare 3rd to last byte
VSEL V6, V5, V31, V7 // Find 1st matching byte using mask
VAND V7, V10, V7 // AND matched bytes with matched 3rd byte
VCLZD V7, V18 // Find first nonzero indexes
MFVSRD V18, R25 // Move 1st doubleword
CMP R25, $64 // If < 64 found
BLT foundR25 // Return matching index
VSLDOI $8, V18, V18, V18 // Move value
MFVSRD V18, R25 // Move 2nd doubleword
CMP R25, $64 // If < 64 found
ADD $64, R25 // Update byte index
BLT foundR25 // Return matching index
ADD $16, R7 // R7+=16 string ptr
ADD $19, R7, R9 // Number of string bytes for loop
CMP R9, LASTBYTE // Compare against last byte of string
BLT index3loop2 // If within, continue this loop
CMP R7, LASTSTR // Compare against last start byte
BLT index2to16 // Process remainder
VSPLTISB $0, V3 // Special case for last 16 bytes
BR index3loop // Continue this loop
// Loop to process 4 byte separator
// string[0:16] in V2
// string[3:16] in V3
// sep[0:4] splatted in V1
// Set up vectors with strings at offsets
// 0, 1, 2, 3 and compare against the 4 byte
// separator also splatted. Use VSEL with the
// compare results to find the first byte where
// a separator match is found.
index4plus:
CMP R6, $4 // Check if 4 byte separator
BNE index5plus // If not next higher
ADD $20, R7, R9 // Check string size to load
CMP R9, LASTBYTE // Verify string length
BGE index2to16 // If not large enough, process remaining
MOVD $2, R15 // Set up index
// Set up masks for use with VSEL
MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
SLD $24, R21
MTVSRD R21, V10
VSPLTW $1, V10, V29
VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
MOVD $0xffff, R21
SLD $16, R21
MTVSRD R21, V10
VSPLTW $1, V10, V31 // Mask 0xffff0000ffff0000...
VSPLTW $0, V0, V1 // Splat 1st word of separator
index4loop:
VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
next4:
VSPLTISB $0, V10 // Clear
MOVD $3, R9 // Number of bytes beyond 16
VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+3 into V3
VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
VSLDOI $3, V2, V3, V10 // V10=(V2:v3)<<3
VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep
VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep
VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep
VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep
VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask
VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
VSEL V14, V13, V31, V7 // final merge
VCLZD V7, V18 // Find first index for each half
MFVSRD V18, R25 // Isolate value
CMP R25, $64 // If < 64, found
BLT foundR25 // Return found index
VSLDOI $8, V18, V18, V18 // Move for MFVSRD
MFVSRD V18, R25 // Isolate other value
CMP R25, $64 // If < 64, found
ADD $64, R25 // Update index for high doubleword
BLT foundR25 // Return found index
ADD $16, R7 // R7+=16 for next string
ADD $20, R7, R9 // R+20 for all bytes to load
CMP R9, LASTBYTE // Past end? Maybe check for extra?
BLT index4loop // If not, continue loop
CMP R7, LASTSTR // Check remainder
BLE index2to16 // Process remainder
BR notfound // Not found
index5plus:
CMP R6, $16 // Check for sep > 16
BGT index17plus // Handle large sep
// Assumption is that the separator is smaller than the string at this point
index2to16:
CMP R7, LASTSTR // Compare last start byte
BGT notfound // last takes len(sep) into account
ADD $16, R7, R9 // Check for last byte of string
CMP R9, LASTBYTE
BGT index2to16tail
// At least 16 bytes of string left
// Mask the number of bytes in sep
index2to16loop:
VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
compare:
VAND V1, SEPMASK, V2 // Mask out sep size
VCMPEQUBCC V0, V2, V3 // Compare masked string
BLT CR6, found // All equal
ADD $1, R7 // Update ptr to next byte
CMP R7, LASTSTR // Still less than last start byte
BGT notfound // Not found
ADD $16, R7, R9 // Verify remaining bytes
CMP R9, LASTBYTE // At least 16
BLT index2to16loop // Try again
// Less than 16 bytes remaining in string
// Separator >= 2
index2to16tail:
ADD R3, R4, R9 // End of string
SUB R7, R9, R9 // Number of bytes left
ANDCC $15, R7, R10 // 16 byte offset
ADD R10, R9, R11 // offset + len
CMP R11, $16 // >= 16?
BLE short // Does not cross 16 bytes
VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
BR index2to16next // Continue on
short:
RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1
SLD $3, R10 // Set up shift
MTVSRD R10, V8 // Set up shift
VSLDOI $8, V8, V8, V8
VSLO V1, V8, V1 // Shift by start byte
VSPLTISB $0, V25 // Clear for later use
index2to16next:
VAND V1, SEPMASK, V2 // Just compare size of sep
VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
BLT CR6, found // Found
ADD $1, R7 // Not found, try next partial string
CMP R7, LASTSTR // Check for end of string
BGT notfound // If at end, then not found
VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
BR index2to16next // Check the next partial string
index17plus:
CMP R6, $32 // Check if 17 < len(sep) <= 32
BGT index33plus
SUB $16, R6, R9 // Extra > 16
SLD $56, R9, R10 // Shift to use in VSLO
MTVSRD R10, V9 // Set up for VSLO
VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1
VSLO V1, V9, V1 // Shift left
VSPLTISB $0xff, V7 // Splat 1s
VSPLTISB $0, V27 // Splat 0
index17to32loop:
VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
next17:
VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+R9 into V3
VSLO V3, V9, V3 // Shift left
VCMPEQUB V0, V2, V4 // Compare first 16 bytes
VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
VAND V4, V5, V6 // Check if both equal
VCMPEQUBCC V6, V7, V8 // All equal?
BLT CR6, found // Yes
ADD $1, R7 // On to next byte
CMP R7, LASTSTR // Check if last start byte
BGT notfound // If too high, not found
BR index17to32loop // Continue
notfound:
MOVD $-1, R3 // Return -1 if not found
RET
index33plus:
MOVD $0, (R0) // Case not implemented
RET // Crash before return
foundR25:
SRD $3, R25 // Convert from bits to bytes
ADD R25, R7 // Add to current string address
SUB R3, R7 // Subtract from start of string
MOVD R7, R3 // Return byte where found
RET
found:
SUB R3, R7 // Return byte where found
MOVD R7, R3
RET
// indexbodyp9<>: inner body of Index for POWER9+.
// Register contract (established by the uses below):
//   R3 = string address, R4 = string length
//   R5 = separator address, R6 = separator length
// On exit R3 holds the byte index of the first match, or -1 if not found.
// LASTBYTE/LASTSTR/ONES/SEPMASK are register aliases #defined earlier in this file.
TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
	CMP	R6, R4		// Compare lengths
	BGT	notfound	// If sep len is > string, notfound
	ADD	R4, R3, LASTBYTE	// find last byte addr
	SUB	R6, LASTBYTE, LASTSTR	// LAST=&s[len(s)-len(sep)] (last valid start index)
	CMP	R6, $0		// Check sep len
	BEQ	notfound	// sep len 0 -- not found
	MOVD	R3, R7		// Copy of string addr
#ifndef GOPPC64_power10
	MOVD	$16, R16	// Index value 16
	MOVD	$17, R17	// Index value 17
	MOVD	$18, R18	// Index value 18
	VSPLTISB	$0xFF, ONES	// splat all 1s
	VOR	ONES, ONES, SEPMASK	// Set up full SEPMASK
#else
	SLD	$56, R6, R14	// Set up separator length for LXVLL
#endif
	MOVD	$1, R19		// Index value 1
	CMP	R6, $16, CR4	// CR4 for len(sep) >= 16
	BGE	CR4, loadge16	// Load for len(sep) >= 16
#ifndef GOPPC64_power10
	SUB	R6, R16, R9	// 16-len of sep
	SLD	$3, R9		// Set up for VSLO
	MTVSRD	R9, V9		// Set up for VSLO
	VSLDOI	$8, V9, V9, V9	// Set up for VSLO
	VSLO	ONES, V9, SEPMASK	// Mask for separator len(sep) < 16
#endif
loadge16:
	ANDCC	$15, R5, R9	// Find byte offset of sep
	ADD	R9, R6, R10	// Add sep len
	CMP	R10, $16	// Check if sep len+offset > 16
	BGT	sepcross16	// Sep crosses 16 byte boundary
#ifdef GOPPC64_power10
	LXVLL	R5, R14, V0	// Load separator
#else
	RLDICR	$0, R5, $59, R8	// Adjust addr to 16 byte container
	LXVB16X	(R8)(R0), V0	// Load 16 bytes @R8 into V0
	SLD	$3, R9		// Set up shift count for VSLO
	MTVSRD	R9, V8		// Set up shift count for VSLO
	VSLDOI	$8, V8, V8, V8
	VSLO	V0, V8, V0	// Shift by start byte
	VAND	V0, SEPMASK, V0	// Mask separator (< 16)
#endif
	BR	index2plus
sepcross16:
#ifdef GOPPC64_power10
	LXVLL	R5, R14, V0	// Load separator
#else
	LXVB16X	(R5)(R0), V0	// Load 16 bytes @R5 into V0
	VAND	V0, SEPMASK, V0	// mask out separator
#endif
	BLE	CR4, index2to16
	BR	index17plus	// Handle sep > 16
index2plus:
	CMP	R6, $2		// Check length of sep
	BNE	index3plus	// If not 2, check for 3
	ADD	$16, R7, R9	// Check if next 16 bytes past last
	CMP	R9, LASTBYTE	// compare with last
	BGE	index2to16	// 2 <= len(string) <= 16
	MOVD	$0xff00, R21	// Mask for later
	MTVSRD	R21, V25	// Move to Vreg
	VSPLTH	$3, V25, V31	// Splat mask
	VSPLTH	$0, V0, V1	// Splat 1st 2 bytes of sep
	VSPLTISB	$0, V10	// Clear V10

	// First case: 2 byte separator
	// V1: 2 byte separator splatted
	// V2: 16 bytes at addr
	// V4: 16 bytes at addr+1
	// Compare 2 byte separator at start
	// and at start+1. Use VSEL to combine
	// those results to find the first
	// matching start byte, returning
	// that value when found. Loop as
	// long as len(string) > 16
index2loop2:
	LXVB16X	(R7)(R19), V3	// Load 16 bytes @R7+1 into V3
index2loop:
	LXVB16X	(R7)(R0), V2	// Load 16 bytes @R7 into V2
	VCMPEQUH	V1, V2, V5	// Search for sep
	VCMPEQUH	V1, V3, V6	// Search for sep offset by 1
	VSEL	V6, V5, V31, V7	// merge even and odd indices
	VCLZD	V7, V18		// find index of first match
	MFVSRD	V18, R25	// get first value
	CMP	R25, $64	// Found if < 64
	BLT	foundR25	// Return byte index where found
	MFVSRLD	V18, R25	// get second value
	CMP	R25, $64	// Found if < 64
	ADD	$64, R25	// Update byte offset
	BLT	foundR25	// Return value
	ADD	$16, R7		// R7+=16 Update string pointer
	ADD	$17, R7, R9	// R9=R7+17 since loop unrolled
	CMP	R9, LASTBYTE	// Compare addr+17 against last byte
	BLT	index2loop2	// If < last, continue loop
	CMP	R7, LASTBYTE	// Compare addr+16 against last byte
	BLT	index2to16	// If < 16 handle specially
	LXVB16X	(R7)(R0), V3	// Load 16 bytes @R7 into V3
	VSLDOI	$1, V3, V10, V3	// Shift left by 1 byte
	BR	index2loop
index3plus:
	CMP	R6, $3		// Check if sep == 3
	BNE	index4plus	// If not check larger
	ADD	$19, R7, R9	// Find bytes for use in this loop
	CMP	R9, LASTBYTE	// Compare against last byte
	BGE	index2to16	// Remaining string 2<=len<=16
	MOVD	$0xff00, R21	// Set up mask for upcoming loop
	MTVSRD	R21, V25	// Move mask to Vreg
	VSPLTH	$3, V25, V31	// Splat mask
	VSPLTH	$0, V0, V1	// Splat 1st two bytes of sep
	VSPLTB	$2, V0, V8	// Splat 3rd byte of sep

	// Loop to process 3 byte separator.
	// string[0:16] is in V2
	// string[2:18] is in V3
	// sep[0:2] splatted in V1
	// sep[3] splatted in V8
	// Load vectors at string, string+1
	// and string+2. Compare string, string+1
	// against first 2 bytes of separator
	// splatted, and string+2 against 3rd
	// byte splatted. Merge the results with
	// VSEL to find the first byte of a match.
	// Special handling for last 16 bytes if the
	// string fits in 16 byte multiple.
index3loop2:
	MOVD	$2, R21		// Set up index for 2
	VSPLTISB	$0, V10	// Clear V10
	LXVB16X	(R7)(R21), V3	// Load 16 bytes @R7+2 into V3
	VSLDOI	$14, V3, V10, V3	// Left justify next 2 bytes
index3loop:
	LXVB16X	(R7)(R0), V2	// Load 16 bytes @R7
	VSLDOI	$1, V2, V3, V4	// string[1:17]
	VSLDOI	$2, V2, V3, V9	// string[2:18]
	VCMPEQUH	V1, V2, V5	// compare hw even indices
	VCMPEQUH	V1, V4, V6	// compare hw odd indices
	VCMPEQUB	V8, V9, V10	// compare 3rd to last byte
	VSEL	V6, V5, V31, V7	// Find 1st matching byte using mask
	VAND	V7, V10, V7	// AND matched bytes with matched 3rd byte
	VCLZD	V7, V18		// Find first nonzero indexes
	MFVSRD	V18, R25	// Move 1st doubleword
	CMP	R25, $64	// If < 64 found
	BLT	foundR25	// Return matching index
	MFVSRLD	V18, R25	// Move 2nd doubleword
	CMP	R25, $64	// If < 64 found
	ADD	$64, R25	// Update byte index
	BLT	foundR25	// Return matching index
	ADD	$16, R7		// R7+=16 string ptr
	ADD	$19, R7, R9	// Number of string bytes for loop
	CMP	R9, LASTBYTE	// Compare against last byte of string
	BLT	index3loop2	// If within, continue this loop
	CMP	R7, LASTSTR	// Compare against last start byte
	BLT	index2to16	// Process remainder
	VSPLTISB	$0, V3	// Special case for last 16 bytes
	BR	index3loop	// Continue this loop

	// Loop to process 4 byte separator
	// string[0:16] in V2
	// string[3:16] in V3
	// sep[0:4] splatted in V1
	// Set up vectors with strings at offsets
	// 0, 1, 2, 3 and compare against the 4 byte
	// separator also splatted. Use VSEL with the
	// compare results to find the first byte where
	// a separator match is found.
index4plus:
	CMP	R6, $4		// Check if 4 byte separator
	BNE	index5plus	// If not next higher
	ADD	$20, R7, R9	// Check string size to load
	CMP	R9, LASTBYTE	// Verify string length
	BGE	index2to16	// If not large enough, process remaining

	// Set up masks for use with VSEL
	MOVD	$0xff, R21	// Set up mask 0xff000000ff000000...
	SLD	$24, R21
	MTVSRWS	R21, V29
	VSLDOI	$2, V29, V29, V30	// Mask 0x0000ff000000ff00...
	MOVD	$0xffff, R21
	SLD	$16, R21
	MTVSRWS	R21, V31
	VSPLTW	$0, V0, V1	// Splat 1st word of separator
index4loop:
	LXVB16X	(R7)(R0), V2	// Load 16 bytes @R7 into V2
next4:
	VSPLTISB	$0, V10	// Clear
	MOVD	$3, R9		// Number of bytes beyond 16
	LXVB16X	(R7)(R9), V3	// Load 16 bytes @R7 into V3
	VSLDOI	$13, V3, V10, V3	// Shift left last 3 bytes
	VSLDOI	$1, V2, V3, V4	// V4=(V2:V3)<<1
	VSLDOI	$2, V2, V3, V9	// V9=(V2:V3)<<2
	VSLDOI	$3, V2, V3, V10	// V10=(V2:v3)<<3
	VCMPEQUW	V1, V2, V5	// compare index 0, 4, ... with sep
	VCMPEQUW	V1, V4, V6	// compare index 1, 5, ... with sep
	VCMPEQUW	V1, V9, V11	// compare index 2, 6, ... with sep
	VCMPEQUW	V1, V10, V12	// compare index 3, 7, ... with sep
	VSEL	V6, V5, V29, V13	// merge index 0, 1, 4, 5, using mask
	VSEL	V12, V11, V30, V14	// merge index 2, 3, 6, 7, using mask
	VSEL	V14, V13, V31, V7	// final merge
	VCLZD	V7, V18		// Find first index for each half
	MFVSRD	V18, R25	// Isolate value
	CMP	R25, $64	// If < 64, found
	BLT	foundR25	// Return found index
	MFVSRLD	V18, R25	// Isolate other value
	CMP	R25, $64	// If < 64, found
	ADD	$64, R25	// Update index for high doubleword
	BLT	foundR25	// Return found index
	ADD	$16, R7		// R7+=16 for next string
	ADD	$20, R7, R9	// R+20 for all bytes to load
	CMP	R9, LASTBYTE	// Past end? Maybe check for extra?
	BLT	index4loop	// If not, continue loop
	CMP	R7, LASTSTR	// Check remainder
	BLE	index2to16	// Process remainder
	BR	notfound	// Not found
index5plus:
	CMP	R6, $16		// Check for sep > 16
	BGT	index17plus	// Handle large sep

	// Assumption is that the separator is smaller than the string at this point
index2to16:
	CMP	R7, LASTSTR	// Compare last start byte
	BGT	notfound	// last takes len(sep) into account
	ADD	$19, R7, R9	// To check 4 indices per iteration, need at least 16+3 bytes
	CMP	R9, LASTBYTE

	// At least 16 bytes of string left
	// Mask the number of bytes in sep
	VSPLTISB	$0, V10	// Clear
	BGT	index2to16tail
#ifdef GOPPC64_power10
	ADD	$3,R7, R17	// Base+3
	ADD	$2,R7, R8	// Base+2
	ADD	$1,R7, R10	// Base+1
#else
	MOVD	$3, R17		// Number of bytes beyond 16
#endif
	PCALIGN	$16
index2to16loop:
#ifdef GOPPC64_power10
	LXVLL	R7, R14, V8	// Load next 16 bytes of string from Base
	LXVLL	R10, R14, V9	// Load next 16 bytes of string from Base+1
	LXVLL	R8, R14, V11	// Load next 16 bytes of string from Base+2
	LXVLL	R17,R14, V12	// Load next 16 bytes of string from Base+3
#else
	LXVB16X	(R7)(R0), V1	// Load next 16 bytes of string into V1 from R7
	LXVB16X	(R7)(R17), V5	// Load next 16 bytes of string into V5 from R7+3
	VSLDOI	$13, V5, V10, V2	// Shift left last 3 bytes
	VSLDOI	$1, V1, V2, V3	// V3=(V1:V2)<<1
	VSLDOI	$2, V1, V2, V4	// V4=(V1:V2)<<2
	VAND	V1, SEPMASK, V8	// Mask out sep size 0th index
	VAND	V3, SEPMASK, V9	// Mask out sep size 1st index
	VAND	V4, SEPMASK, V11	// Mask out sep size 2nd index
	VAND	V5, SEPMASK, V12	// Mask out sep size 3rd index
#endif
	VCMPEQUBCC	V0, V8, V8	// compare masked string
	BLT	CR6, found	// All equal while comparing 0th index
	VCMPEQUBCC	V0, V9, V9	// compare masked string
	BLT	CR6, found2	// All equal while comparing 1st index
	VCMPEQUBCC	V0, V11, V11	// compare masked string
	BLT	CR6, found3	// All equal while comparing 2nd index
	VCMPEQUBCC	V0, V12, V12	// compare masked string
	BLT	CR6, found4	// All equal while comparing 3rd index
	ADD	$4, R7		// Update ptr to next 4 bytes
#ifdef GOPPC64_power10
	ADD	$4, R17		// Update ptr to next 4 bytes
	ADD	$4, R8		// Update ptr to next 4 bytes
	ADD	$4, R10		// Update ptr to next 4 bytes
#endif
	CMP	R7, LASTSTR	// Still less than last start byte
	BGT	notfound	// Not found
	ADD	$19, R7, R9	// Verify remaining bytes
	CMP	R9, LASTBYTE	// length of string at least 19
	BLE	index2to16loop	// Try again, else do post processing and jump to index2to16next
	PCALIGN	$32

	// <19 bytes left, post process the remaining string
index2to16tail:
#ifdef GOPPC64_power10
index2to16next_p10:
	LXVLL	R7,R14, V1	// Load 16 bytes @R7 into V1
	VCMPEQUBCC	V1, V0, V3	// Compare sep and partial string
	BLT	CR6, found	// Found
	ADD	$1, R7		// Not found, try next partial string
	CMP	R7, LASTSTR	// Check for end of string
	BLE	index2to16next_p10	// If at end, then not found
	BR	notfound	// go to remainder loop
#else
	ADD	R3, R4, R9	// End of string
	SUB	R7, R9, R9	// Number of bytes left
	ANDCC	$15, R7, R10	// 16 byte offset
	ADD	R10, R9, R11	// offset + len
	CMP	R11, $16	// >= 16?
	BLE	short		// Does not cross 16 bytes
	LXVB16X	(R7)(R0), V1	// Load 16 bytes @R7 into V1
	CMP	R9, $16		// Post-processing of unrolled loop
	BLE	index2to16next	// continue to index2to16next if <= 16 bytes
	SUB	R16, R9, R10	// R9 should be 18 or 17 hence R10 is 1 or 2
	LXVB16X	(R7)(R10), V9
	CMP	R10, $1		// string length is 17, compare 1 more byte
	BNE	extra2		// string length is 18, compare 2 more bytes
	VSLDOI	$15, V9, V10, V25
	VAND	V1, SEPMASK, V2	// Just compare size of sep
	VCMPEQUBCC	V0, V2, V3	// Compare sep and partial string
	BLT	CR6, found	// Found
	ADD	$1, R7		// Not found, try next partial string
	CMP	R7, LASTSTR	// Check for end of string
	BGT	notfound	// If at end, then not found
	VSLDOI	$1, V1, V25, V1	// Shift string left by 1 byte
	BR	index2to16next	// go to remainder loop
extra2:
	VSLDOI	$14, V9, V10, V25
	VAND	V1, SEPMASK, V2	// Just compare size of sep
	VCMPEQUBCC	V0, V2, V3	// Compare sep and partial string
	BLT	CR6, found	// Found
	ADD	$1, R7		// Not found, try next partial string
	CMP	R7, LASTSTR	// Check for end of string
	BGT	notfound	// If at end, then not found
	VOR	V1, V1, V4	// save remaining string
	VSLDOI	$1, V1, V25, V1	// Shift string left by 1 byte for 17th byte
	VAND	V1, SEPMASK, V2	// Just compare size of sep
	VCMPEQUBCC	V0, V2, V3	// Compare sep and partial string
	BLT	CR6, found	// Found
	ADD	$1, R7		// Not found, try next partial string
	CMP	R7, LASTSTR	// Check for end of string
	BGT	notfound	// If at end, then not found
	VSLDOI	$2, V4, V25, V1	// Shift saved string left by 2 bytes for 18th byte
	BR	index2to16next	// Check the remaining partial string in index2to16next
short:
	RLDICR	$0, R7, $59, R9	// Adjust addr to 16 byte container
	LXVB16X	(R9)(R0), V1	// Load 16 bytes @R9 into V1
	SLD	$3, R10		// Set up shift
	MTVSRD	R10, V8		// Set up shift
	VSLDOI	$8, V8, V8, V8
	VSLO	V1, V8, V1	// Shift by start byte
	PCALIGN	$16
index2to16next:
	VAND	V1, SEPMASK, V2	// Just compare size of sep
	VCMPEQUBCC	V0, V2, V3	// Compare sep and partial string
	BLT	CR6, found	// Found
	ADD	$1, R7		// Not found, try next partial string
	CMP	R7, LASTSTR	// Check for end of string
	BGT	notfound	// If at end, then not found
	VSLDOI	$1, V1, V10, V1	// Shift string left by 1 byte
	BR	index2to16next	// Check the next partial string
#endif // Tail processing if GOPPC64!=power10
index17plus:
	CMP	R6, $32		// Check if 17 < len(sep) <= 32
	BGT	index33plus
	SUB	$16, R6, R9	// Extra > 16
	SLD	$56, R9, R10	// Shift to use in VSLO
	MTVSRD	R10, V9		// Set up for VSLO
	LXVB16X	(R5)(R9), V1	// Load 16 bytes @R5+R9 into V1
	VSLO	V1, V9, V1	// Shift left
	VSPLTISB	$0xff, V7	// Splat 1s
	VSPLTISB	$0, V27	// Splat 0
index17to32loop:
	LXVB16X	(R7)(R0), V2	// Load 16 bytes @R7 into V2
next17:
	LXVB16X	(R7)(R9), V3	// Load 16 bytes @R7+R9 into V3
	VSLO	V3, V9, V3	// Shift left
	VCMPEQUB	V0, V2, V4	// Compare first 16 bytes
	VCMPEQUB	V1, V3, V5	// Compare extra over 16 bytes
	VAND	V4, V5, V6	// Check if both equal
	VCMPEQUBCC	V6, V7, V8	// All equal?
	BLT	CR6, found	// Yes
	ADD	$1, R7		// On to next byte
	CMP	R7, LASTSTR	// Check if last start byte
	BGT	notfound	// If too high, not found
	BR	index17to32loop	// Continue
notfound:
	MOVD	$-1, R3		// Return -1 if not found
	RET
index33plus:
	MOVD	$0, (R0)	// Case not implemented
	RET			// Crash before return
foundR25:
	SRD	$3, R25		// Convert from bits to bytes
	ADD	R25, R7		// Add to current string address
	SUB	R3, R7		// Subtract from start of string
	MOVD	R7, R3		// Return byte where found
	RET
found4:
	ADD	$1, R7		// found from unrolled loop at index 3
found3:
	ADD	$1, R7		// found from unrolled loop at index 2
found2:
	ADD	$1, R7		// found from unrolled loop at index 1
found:				// found at index 0
	SUB	R3, R7		// Return byte where found
	MOVD	R7, R3
	RET

View File

@@ -0,0 +1,31 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
import "internal/cpu"
const MaxBruteForce = 64
// init raises MaxLen when the z/Architecture vector facility (VX) is
// available. 64 matches the limit of the assembly Index implementation,
// which crashes deliberately for separators longer than 63 bytes
// (see index65plus in index_s390x.s).
func init() {
	// Note: we're kind of lucky that this flag is available at this point.
	// The runtime sets HasVX when processing auxv records, and that happens
	// to happen *before* running the init functions of packages that
	// the runtime depends on.
	// TODO: it would really be nicer for internal/cpu to figure out this
	// flag by itself. Then we wouldn't need to depend on quirks of
	// early startup initialization order.
	if cpu.S390X.HasVX {
		MaxLen = 64
	}
}
// Cutover reports the number of failures of IndexByte we should tolerate
// before switching over to Index.
// n is the number of bytes processed so far.
// See the bytes.Index implementation for details.
// Cutover reports the number of failures of IndexByte we should tolerate
// before switching over to Index.
// n is the number of bytes processed so far.
// See the bytes.Index implementation for details.
func Cutover(n int) int {
	// Tolerate one error per errPeriod characters, plus a small amount
	// of slop to start.
	const (
		slop      = 16
		errPeriod = 8
	)
	return (n + slop) / errPeriod
}

View File

@@ -0,0 +1,216 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Caller must confirm availability of vx facility before calling.
// Index([]byte, []byte) int: unpack the two slice arguments into the
// registers indexbody<> expects and tail-call it.
// Caller must confirm availability of vx facility before calling.
TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
	LMG	a_base+0(FP), R1, R2	// R1=&s[0], R2=len(s)
	LMG	b_base+24(FP), R3, R4	// R3=&sep[0], R4=len(sep)
	MOVD	$ret+48(FP), R5		// R5 = address of the result slot
	BR	indexbody<>(SB)
// Caller must confirm availability of vx facility before calling.
// IndexString(string, string) int: same as Index, but string headers are
// 16 bytes apart instead of 24, hence the different FP offsets.
// Caller must confirm availability of vx facility before calling.
TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
	LMG	a_base+0(FP), R1, R2	// R1=&s[0], R2=len(s)
	LMG	b_base+16(FP), R3, R4	// R3=&sep[0], R4=len(sep)
	MOVD	$ret+32(FP), R5		// R5 = address of the result slot
	BR	indexbody<>(SB)
// s: string we are searching
// sep: string to search for
// R1=&s[0], R2=len(s)
// R3=&sep[0], R4=len(sep)
// R5=&ret (int)
// Caller must confirm availability of vx facility before calling.
// indexbody<>: s390x vector implementation of Index.
// Specialized loops handle sep lengths 2, 3, 4, 5-16, 17-32, 33-48 and
// 49-64; longer separators crash deliberately (index65plus).
// The found index (or -1) is stored through R5.
TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
	CMPBGT	R4, R2, notfound
	ADD	R1, R2
	SUB	R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
	CMPBEQ	R4, $0, notfound
	SUB	$1, R4 // R4=len(sep)-1 for use as VLL index
	VLL	R4, (R3), V0 // contains first 16 bytes of sep
	MOVD	R1, R7
index2plus:
	CMPBNE	R4, $1, index3plus
	MOVD	$15(R7), R9
	CMPBGE	R9, R2, index2to16
	VGBM	$0xaaaa, V31 // 0xff00ff00ff00ff00...
	VONE	V16
	VREPH	$0, V0, V1
	CMPBGE	R9, R2, index2to16 // NOTE(review): duplicates the check three lines up; harmless but likely redundant
index2loop:
	VL	0(R7), V2 // 16 bytes, even indices
	VL	1(R7), V4 // 16 bytes, odd indices
	VCEQH	V1, V2, V5 // compare even indices
	VCEQH	V1, V4, V6 // compare odd indices
	VSEL	V5, V6, V31, V7 // merge even and odd indices
	VFEEBS	V16, V7, V17 // find leftmost index, set condition to 1 if found
	BLT	foundV17
	MOVD	$16(R7), R7 // R7+=16
	ADD	$15, R7, R9
	CMPBLE	R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
	CMPBLE	R7, R2, index2to16
	BR	notfound
index3plus:
	CMPBNE	R4, $2, index4plus
	ADD	$15, R7, R9
	CMPBGE	R9, R2, index2to16
	MOVD	$1, R0
	VGBM	$0xaaaa, V31 // 0xff00ff00ff00ff00...
	VONE	V16
	VREPH	$0, V0, V1
	VREPB	$2, V0, V8
index3loop:
	VL	(R7), V2 // load 16-bytes into V2
	VLL	R0, 16(R7), V3 // load 2-bytes into V3
	VSLDB	$1, V2, V3, V4 // V4=(V2:V3)<<1
	VSLDB	$2, V2, V3, V9 // V9=(V2:V3)<<2
	VCEQH	V1, V2, V5 // compare 2-byte even indices
	VCEQH	V1, V4, V6 // compare 2-byte odd indices
	VCEQB	V8, V9, V10 // compare last bytes
	VSEL	V5, V6, V31, V7 // merge even and odd indices
	VN	V7, V10, V7 // AND indices with last byte
	VFEEBS	V16, V7, V17 // find leftmost index, set condition to 1 if found
	BLT	foundV17
	MOVD	$16(R7), R7 // R7+=16
	ADD	$15, R7, R9
	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
	CMPBLE	R7, R2, index2to16
	BR	notfound
index4plus:
	CMPBNE	R4, $3, index5plus
	ADD	$15, R7, R9
	CMPBGE	R9, R2, index2to16
	MOVD	$2, R0
	VGBM	$0x8888, V29 // 0xff000000ff000000...
	VGBM	$0x2222, V30 // 0x0000ff000000ff00...
	VGBM	$0xcccc, V31 // 0xffff0000ffff0000...
	VONE	V16
	VREPF	$0, V0, V1
index4loop:
	VL	(R7), V2 // load 16-bytes into V2
	VLL	R0, 16(R7), V3 // load 3-bytes into V3
	VSLDB	$1, V2, V3, V4 // V4=(V2:V3)<<1
	VSLDB	$2, V2, V3, V9 // V9=(V2:V3)<<2
	VSLDB	$3, V2, V3, V10 // V10=(V2:V3)<<3
	VCEQF	V1, V2, V5 // compare index 0, 4, ...
	VCEQF	V1, V4, V6 // compare index 1, 5, ...
	VCEQF	V1, V9, V11 // compare index 2, 6, ...
	VCEQF	V1, V10, V12 // compare index 3, 7, ...
	VSEL	V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
	VSEL	V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
	VSEL	V13, V14, V31, V7 // final merge
	VFEEBS	V16, V7, V17 // find leftmost index, set condition to 1 if found
	BLT	foundV17
	MOVD	$16(R7), R7 // R7+=16
	ADD	$15, R7, R9
	CMPBLE	R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
	CMPBLE	R7, R2, index2to16
	BR	notfound
index5plus:
	CMPBGT	R4, $15, index17plus
index2to16:
	CMPBGT	R7, R2, notfound
	MOVD	$1(R7), R8
	CMPBGT	R8, R2, index2to16tail
index2to16loop:
	// unrolled 2x
	VLL	R4, (R7), V1
	VLL	R4, 1(R7), V2
	VCEQGS	V0, V1, V3
	BEQ	found
	MOVD	$1(R7), R7
	VCEQGS	V0, V2, V4
	BEQ	found
	MOVD	$1(R7), R7
	CMPBLT	R7, R2, index2to16loop
	CMPBGT	R7, R2, notfound
index2to16tail:
	VLL	R4, (R7), V1
	VCEQGS	V0, V1, V2
	BEQ	found
	BR	notfound
index17plus:
	CMPBGT	R4, $31, index33plus
	SUB	$16, R4, R0
	VLL	R0, 16(R3), V1 // second fragment of sep (bytes 16..len-1)
	VONE	V7
index17to32loop:
	VL	(R7), V2
	VLL	R0, 16(R7), V3
	VCEQG	V0, V2, V4
	VCEQG	V1, V3, V5
	VN	V4, V5, V6
	VCEQGS	V6, V7, V8
	BEQ	found
	MOVD	$1(R7), R7
	CMPBLE	R7, R2, index17to32loop
	BR	notfound
index33plus:
	CMPBGT	R4, $47, index49plus
	SUB	$32, R4, R0
	VL	16(R3), V1
	VLL	R0, 32(R3), V2
	VONE	V11
index33to48loop:
	VL	(R7), V3
	VL	16(R7), V4
	VLL	R0, 32(R7), V5
	VCEQG	V0, V3, V6
	VCEQG	V1, V4, V7
	VCEQG	V2, V5, V8
	VN	V6, V7, V9
	VN	V8, V9, V10
	VCEQGS	V10, V11, V12
	BEQ	found
	MOVD	$1(R7), R7
	CMPBLE	R7, R2, index33to48loop
	BR	notfound
index49plus:
	CMPBGT	R4, $63, index65plus
	SUB	$48, R4, R0
	VL	16(R3), V1
	VL	32(R3), V2
	VLL	R0, 48(R3), V3
	VONE	V15
index49to64loop:
	VL	(R7), V4
	VL	16(R7), V5
	VL	32(R7), V6
	VLL	R0, 48(R7), V7
	VCEQG	V0, V4, V8
	VCEQG	V1, V5, V9
	VCEQG	V2, V6, V10
	VCEQG	V3, V7, V11
	VN	V8, V9, V12
	VN	V10, V11, V13
	VN	V12, V13, V14
	VCEQGS	V14, V15, V16
	BEQ	found
	MOVD	$1(R7), R7
	CMPBLE	R7, R2, index49to64loop
notfound:
	MOVD	$-1, (R5)
	RET
index65plus:
	// not implemented: deliberate fault keeps unsupported lengths from
	// silently returning a wrong answer (MaxLen caps callers at 64).
	MOVD	$0, (R0)
	RET
foundV17: // index is in doubleword V17[0]
	VLGVG	$0, V17, R8
	ADD	R8, R7
found:
	SUB	R1, R7
	MOVD	R7, (R5)
	RET

View File

@@ -0,0 +1,34 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// IndexByte([]byte, byte) int for 386, using REPN SCASB to scan
// CX bytes at DI for the byte in AL.
TEXT ·IndexByte(SB),NOSPLIT,$0-20
	MOVL	b_base+0(FP), SI
	MOVL	b_len+4(FP), CX
	MOVB	c+12(FP), AL
	MOVL	SI, DI
	CLD; REPN; SCASB	// scan forward until AL matches or CX is exhausted
	JZ	3(PC)		// ZF set => match; skip the not-found return
	MOVL	$-1, ret+16(FP)
	RET
	SUBL	SI, DI		// DI stopped one past the match
	SUBL	$1, DI
	MOVL	DI, ret+16(FP)
	RET
// IndexByteString(string, byte) int for 386; identical scan to
// IndexByte but with string-header argument offsets.
TEXT ·IndexByteString(SB),NOSPLIT,$0-16
	MOVL	s_base+0(FP), SI
	MOVL	s_len+4(FP), CX
	MOVB	c+8(FP), AL
	MOVL	SI, DI
	CLD; REPN; SCASB	// scan forward until AL matches or CX is exhausted
	JZ	3(PC)		// ZF set => match; skip the not-found return
	MOVL	$-1, ret+12(FP)
	RET
	SUBL	SI, DI		// DI stopped one past the match
	SUBL	$1, DI
	MOVL	DI, ret+12(FP)
	RET

View File

@@ -0,0 +1,154 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !plan9
#include "go_asm.h"
#include "textflag.h"
// IndexByte([]byte, byte) int for amd64: load args and tail-jump to the
// shared body.
TEXT ·IndexByte(SB), NOSPLIT, $0-40
	MOVQ b_base+0(FP), SI
	MOVQ b_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP  indexbytebody<>(SB)
// IndexByteString(string, byte) int for amd64: same body, string-header
// argument offsets.
TEXT ·IndexByteString(SB), NOSPLIT, $0-32
	MOVQ s_base+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP  indexbytebody<>(SB)
// input:
// SI: data
// BX: data len
// AL: byte sought
// R8: address to put result
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// Strategy: <16 bytes uses a single (possibly page-safe) SSE compare;
// 16-32 bytes uses the SSE loop; >32 bytes uses AVX2 when available.
TEXT indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ SI, DI

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	PCALIGN $16
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI	// Compute offset of chunk within data.
	ADDQ DX, DI	// Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX	// low 12 bits all zero => load would touch the next page
	JEQ	endofpage

	MOVOU	(SI), X1 // Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
#ifndef hasAVX2
	// NOTE(review): symbol rendered as "internalcpu·X86" here — upstream
	// uses internal∕cpu·X86 (U+2215 package separator); verify against
	// the original source if this fails to assemble.
	CMPB   internalcpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE sse
#endif
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11
	VPBROADCASTB  X0, Y1

	PCALIGN $32
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	ADDQ $32, DI
	CMPQ DI, R11
	JLT avx2_loop
	MOVQ R11, DI
	VMOVDQU (DI), Y2	// final (possibly overlapping) 32-byte chunk
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	VZEROUPPER
	MOVQ $-1, (R8)
	RET

avx2success:
	VPMOVMSKB Y3, DX
	BSFL DX, DX
	SUBQ SI, DI
	ADDQ DI, DX
	MOVQ DX, (R8)
	VZEROUPPER
	RET

View File

@@ -0,0 +1,46 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// IndexByte([]byte, byte) int for arm: load args and branch to the
// shared body.
TEXT ·IndexByte(SB),NOSPLIT,$0-20
	MOVW	b_base+0(FP), R0
	MOVW	b_len+4(FP), R1
	MOVBU	c+12(FP), R2	// byte to find
	MOVW	$ret+16(FP), R5
	B	indexbytebody<>(SB)
// IndexByteString(string, byte) int for arm: same body, string-header
// argument offsets.
TEXT ·IndexByteString(SB),NOSPLIT,$0-16
	MOVW	s_base+0(FP), R0
	MOVW	s_len+4(FP), R1
	MOVBU	c+8(FP), R2	// byte to find
	MOVW	$ret+12(FP), R5
	B	indexbytebody<>(SB)
// input:
// R0: data
// R1: data length
// R2: byte to find
// R5: address to put result
// input:
//   R0: data
//   R1: data length
//   R2: byte to find
//   R5: address to put result
// Simple byte-at-a-time scan using post-increment loads.
TEXT indexbytebody<>(SB),NOSPLIT,$0-0
	MOVW	R0, R4		// store base for later
	ADD	R0, R1		// end
loop:
	CMP	R0, R1
	B.EQ	notfound
	MOVBU.P	1(R0), R3	// load byte, then R0++
	CMP	R2, R3
	B.NE	loop

	SUB	$1, R0		// R0 will be one beyond the position we want
	SUB	R4, R0		// remove base
	MOVW	R0, (R5)
	RET

notfound:
	MOVW	$-1, R0
	MOVW	R0, (R5)
	RET

View File

@@ -0,0 +1,126 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// IndexByte([]byte, byte) int for arm64: load args and branch to the
// shared body.
TEXT ·IndexByte(SB),NOSPLIT,$0-40
	MOVD	b_base+0(FP), R0
	MOVD	b_len+8(FP), R2
	MOVBU	c+24(FP), R1
	MOVD	$ret+32(FP), R8
	B	indexbytebody<>(SB)
// IndexByteString(string, byte) int for arm64: same body, string-header
// argument offsets.
TEXT ·IndexByteString(SB),NOSPLIT,$0-32
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	c+16(FP), R1
	MOVD	$ret+24(FP), R8
	B	indexbytebody<>(SB)
// input:
// R0: data
// R1: byte to search
// R2: data len
// R8: address to put result
// input:
//   R0: data
//   R1: byte to search
//   R2: data len
//   R8: address to put result
TEXT indexbytebody<>(SB),NOSPLIT,$0
	// Core algorithm:
	// For each 32-byte chunk we calculate a 64-bit syndrome value,
	// with two bits per byte. For each tuple, bit 0 is set if the
	// relevant byte matched the requested character and bit 1 is
	// not used (faster than using a 32bit syndrome). Since the bits
	// in the syndrome reflect exactly the order in which things occur
	// in the original string, counting trailing zeros allows to
	// identify exactly which byte has matched.

	CBZ	R2, fail
	MOVD	R0, R11
	// Magic constant 0x40100401 allows us to identify
	// which lane matches the requested byte.
	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
	MOVD	$0x40100401, R5
	VMOV	R1, V0.B16
	// Work with aligned 32-byte chunks
	BIC	$0x1f, R0, R3
	VMOV	R5, V5.S4
	ANDS	$0x1f, R0, R9
	AND	$0x1f, R2, R10
	BEQ	loop

	// Input string is not 32-byte aligned. We calculate the
	// syndrome value for the aligned 32 bytes block containing
	// the first bytes and mask off the irrelevant part.
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUB	$0x20, R9, R4
	ADDS	R4, R2, R2
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16	// 256->128
	VADDP	V6.B16, V6.B16, V6.B16	// 128->64
	VMOV	V6.D[0], R6
	// Clear the irrelevant lower bits
	LSL	$1, R9, R4
	LSR	R4, R6, R6
	LSL	R4, R6, R6
	// The first block can also be the last
	BLS	masklast
	// Have we found something already?
	CBNZ	R6, tail

loop:
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUBS	$0x20, R2, R2
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// If we're out of data we finish regardless of the result
	BLS	end
	// Use a fast check for the termination condition
	VORR	V4.B16, V3.B16, V6.B16
	VADDP	V6.D2, V6.D2, V6.D2
	VMOV	V6.D[0], R6
	// We're not out of data, loop if we haven't found the character
	CBZ	R6, loop

end:
	// Termination condition found, let's calculate the syndrome value
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16
	VADDP	V6.B16, V6.B16, V6.B16
	VMOV	V6.D[0], R6
	// Only do the clear for the last possible block with less than 32 bytes
	// Condition flags come from SUBS in the loop
	BHS	tail

masklast:
	// Clear the irrelevant upper bits
	ADD	R9, R10, R4
	AND	$0x1f, R4, R4
	SUB	$0x20, R4, R4
	NEG	R4<<1, R4
	LSL	R4, R6, R6
	LSR	R4, R6, R6

tail:
	// Check that we have found a character
	CBZ	R6, fail
	// Count the trailing zeros using bit reversing
	RBIT	R6, R6
	// Compensate the last post-increment
	SUB	$0x20, R3, R3
	// And count the leading zeros
	CLZ	R6, R6
	// R6 is twice the offset into the fragment
	ADD	R6>>1, R3, R0
	// Compute the offset result
	SUB	R11, R0, R0
	MOVD	R0, (R8)
	RET

fail:
	MOVD	$-1, R0
	MOVD	R0, (R8)
	RET

View File

@@ -0,0 +1,29 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Avoid IndexByte and IndexByteString on Plan 9 because it uses
// SSE instructions on x86 machines, and those are classified as
// floating point instructions, which are illegal in a note handler.
//go:build !386 && (!amd64 || plan9) && !s390x && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !mips && !mipsle && !mips64 && !mips64le && !riscv64 && !wasm
package bytealg
// IndexByte returns the index of the first instance of c in b,
// or -1 if c is not present in b.
func IndexByte(b []byte, c byte) int {
	for i := 0; i < len(b); i++ {
		if b[i] == c {
			return i
		}
	}
	return -1
}
// IndexByteString returns the index of the first instance of c in s,
// or -1 if c is not present in s.
func IndexByteString(s string, c byte) int {
	i := 0
	for i < len(s) {
		if s[i] == c {
			return i
		}
		i++
	}
	return -1
}

View File

@@ -0,0 +1,52 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// IndexByte([]byte, byte) int for loong64 under the internal ABI.
// Predecrements R4 so the loop can increment first and compare, keeping
// the loop body to four instructions.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
	// R4 = b_base
	// R5 = b_len
	// R6 = b_cap (unused)
	// R7 = byte to find
	AND	$0xff, R7	// keep only the low byte of c
	MOVV	R4, R6		// store base for later
	ADDV	R4, R5		// end
	ADDV	$-1, R4

	PCALIGN	$16
loop:
	ADDV	$1, R4
	BEQ	R4, R5, notfound
	MOVBU	(R4), R8
	BNE	R7, R8, loop

	SUBV	R6, R4		// remove base
	RET

notfound:
	MOVV	$-1, R4
	RET
// IndexByteString(string, byte) int for loong64 under the internal ABI;
// same scan as IndexByte, with the byte argument arriving in R6.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
	// R4 = s_base
	// R5 = s_len
	// R6 = byte to find
	MOVV	R4, R7		// store base for later
	ADDV	R4, R5		// end
	ADDV	$-1, R4

	PCALIGN	$16
loop:
	ADDV	$1, R4
	BEQ	R4, R5, notfound
	MOVBU	(R4), R8
	BNE	R6, R8, loop

	SUBV	R7, R4		// remove base
	RET

notfound:
	MOVV	$-1, R4
	RET

View File

@@ -0,0 +1,54 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build mips64 || mips64le
#include "go_asm.h"
#include "textflag.h"
// IndexByte([]byte, byte) int for mips64: byte-at-a-time scan with a
// predecremented pointer.
TEXT ·IndexByte(SB),NOSPLIT,$0-40
	MOVV	b_base+0(FP), R1
	MOVV	b_len+8(FP), R2
	MOVBU	c+24(FP), R3	// byte to find
	MOVV	R1, R4		// store base for later
	ADDV	R1, R2		// end
	ADDV	$-1, R1

loop:
	ADDV	$1, R1
	BEQ	R1, R2, notfound
	MOVBU	(R1), R5
	BNE	R3, R5, loop

	SUBV	R4, R1		// remove base
	MOVV	R1, ret+32(FP)
	RET

notfound:
	MOVV	$-1, R1
	MOVV	R1, ret+32(FP)
	RET
// IndexByteString(string, byte) int for mips64; same scan as IndexByte
// with string-header argument offsets.
TEXT ·IndexByteString(SB),NOSPLIT,$0-32
	MOVV	s_base+0(FP), R1
	MOVV	s_len+8(FP), R2
	MOVBU	c+16(FP), R3	// byte to find
	MOVV	R1, R4		// store base for later
	ADDV	R1, R2		// end
	ADDV	$-1, R1

loop:
	ADDV	$1, R1
	BEQ	R1, R2, notfound
	MOVBU	(R1), R5
	BNE	R3, R5, loop

	SUBV	R4, R1		// remove base
	MOVV	R1, ret+24(FP)
	RET

notfound:
	MOVV	$-1, R1
	MOVV	R1, ret+24(FP)
	RET

View File

@@ -0,0 +1,52 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build mips || mipsle
#include "go_asm.h"
#include "textflag.h"
// IndexByte([]byte, byte) int for mips: scan bytes; R4 holds base+1 so
// the post-increment overshoot cancels when computing the index.
TEXT ·IndexByte(SB),NOSPLIT,$0-20
	MOVW	b_base+0(FP), R1
	MOVW	b_len+4(FP), R2
	MOVBU	c+12(FP), R3	// byte to find
	ADDU	$1, R1, R4	// store base+1 for later
	ADDU	R1, R2		// end

loop:
	BEQ	R1, R2, notfound
	MOVBU	(R1), R5
	ADDU	$1, R1
	BNE	R3, R5, loop

	SUBU	R4, R1	// R1 will be one beyond the position we want so remove (base+1)
	MOVW	R1, ret+16(FP)
	RET

notfound:
	MOVW	$-1, R1
	MOVW	R1, ret+16(FP)
	RET
// IndexByteString(string, byte) int for mips; same base+1 trick as
// IndexByte, with string-header argument offsets.
TEXT ·IndexByteString(SB),NOSPLIT,$0-16
	MOVW	s_base+0(FP), R1
	MOVW	s_len+4(FP), R2
	MOVBU	c+8(FP), R3	// byte to find
	ADDU	$1, R1, R4	// store base+1 for later
	ADDU	R1, R2		// end

loop:
	BEQ	R1, R2, notfound
	MOVBU	(R1), R5
	ADDU	$1, R1
	BNE	R3, R5, loop

	SUBU	R4, R1	// remove (base+1)
	MOVW	R1, ret+12(FP)
	RET

notfound:
	MOVW	$-1, R1
	MOVW	R1, ret+12(FP)
	RET

View File

@@ -0,0 +1,13 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build 386 || (amd64 && !plan9) || s390x || arm || arm64 || loong64 || ppc64 || ppc64le || mips || mipsle || mips64 || mips64le || riscv64 || wasm
package bytealg
// IndexByte returns the index of the first instance of c in b,
// or -1 if c is not present in b. Implemented in assembly.
//
//go:noescape
func IndexByte(b []byte, c byte) int
// IndexByteString returns the index of the first instance of c in s,
// or -1 if c is not present in s. Implemented in assembly.
//
//go:noescape
func IndexByteString(s string, c byte) int

View File

@@ -0,0 +1,314 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
#include "go_asm.h"
#include "textflag.h"
// IndexByte([]byte, byte) int for ppc64x under the internal ABI.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	// NOTE(review): the byte argument arrives in R6 under the internal ABI
	// (R5 carries the slice capacity); indexbytebody expects it in R5.
	MOVD	R6, R5		// R5 = byte
	BR	indexbytebody<>(SB)
// IndexByteString(string, byte) int for ppc64x under the internal ABI.
// Arguments already sit in the registers indexbytebody expects, so this
// is a bare tail-branch.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = string
	// R4 = length
	// R5 = byte
	BR	indexbytebody<>(SB)
#ifndef GOPPC64_power9
// Pre-power9 path: constant used with VBPERMQ to reduce a byte-compare
// result into an ordered bit mask (presumably selecting one bit per
// byte; layout differs by endianness — verify against VBPERMQ usage in
// indexbytebody below).
#ifdef GOARCH_ppc64le
DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
#else
DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
#endif
GLOBL indexbytevbperm<>+0(SB), RODATA, $16
#endif

// Some operations are endian specific, choose the correct opcode based on GOARCH.
// Note, _VCZBEBB is only available on power9 and newer.
#ifdef GOARCH_ppc64le
#define _LDBEX	MOVDBR
#define _LWBEX	MOVWBR
#define _LHBEX	MOVHBR
#define _VCZBEBB VCTZLSBB
#else
#define _LDBEX	MOVD
#define _LWBEX	MOVW
#define _LHBEX	MOVH
#define _VCZBEBB VCLZLSBB
#endif
// indexbytebody is the shared implementation of IndexByte and
// IndexByteString for ppc64/ppc64le. Strategy: 64-byte vector loop
// for long inputs, vector paths for lengths 16-63, and overlapping
// scalar big-endian loads for lengths < 16.
//
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// On exit:
// R3 = return value
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	CMPU R4,$32
#ifndef GOPPC64_power9
	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
	MOVD $indexbytevbperm<>+00(SB),R16
	LXVD2X (R16),V0 // Set up swap string
#endif
	MTVRD R5,V1
	VSPLTB $7,V1,V1 // Replicate byte across V1
	BLT cmp16 // Jump to the small string case if it's <32 bytes.
	CMP R4,$64,CR1
	MOVD $16,R11
	MOVD R3,R8
	BLT CR1,cmp32 // Special case for length 32 - 63
	MOVD $32,R12
	MOVD $48,R6
	RLDICR $0,R4,$63-6,R9 // R9 = len &^ 63
	ADD R3,R9,R9 // R9 = &s[len &^ 63]
	ANDCC $63,R4 // (len &= 63) cmp 0.
	PCALIGN $16
loop64:
	LXVD2X (R0)(R8),V2 // Scan 64 bytes at a time, starting at &s[0]
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0 // Match found at R8, jump out
	LXVD2X (R11)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat1 // Match found at R8+16 bytes, jump out
	LXVD2X (R12)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat2 // Match found at R8+32 bytes, jump out
	LXVD2X (R6)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat3 // Match found at R8+48 bytes, jump out
	ADD $64,R8
	CMPU R8,R9,CR1
	BNE CR1,loop64 // R8 != &s[len &^ 63]?
	PCALIGN $32
	BEQ notfound // Is tail length 0? CR0 is set before entering loop64.
	CMP R4,$32 // Tail length >= 32, use cmp32 path.
	CMP R4,$16,CR1
	BGE cmp32
	// Tail is 1-31 bytes; the final 16-byte check overlaps the
	// already-scanned region, which is safe since no match was found.
	ADD R8,R4,R9
	ADD $-16,R9
	BLE CR1,cmp64_tail_gt0
cmp64_tail_gt16: // Tail length 17 - 32
	LXVD2X (R0)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0
cmp64_tail_gt0: // Tail length 1 - 16
	MOVD R9,R8
	LXVD2X (R0)(R9),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0
	BR notfound
cmp32: // Length 32 - 63
	// Bytes 0 - 15
	LXVD2X (R0)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0
	// Bytes 16 - 31
	LXVD2X (R8)(R11),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat1 // Match found at R8+16 bytes, jump out
	BEQ notfound // Is length <= 32? (CR0 holds this comparison on entry to cmp32)
	CMP R4,$48
	ADD R4,R8,R9 // Compute &s[len(s)-16]
	ADD $32,R8,R8
	ADD $-16,R9,R9
	ISEL CR0GT,R8,R9,R8 // R8 = len(s) <= 48 ? R9 : R8
	// Bytes 33 - 47
	LXVD2X (R0)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0 // match found at R8+32 bytes, jump out
	BLE notfound
	// Bytes 48 - 63
	MOVD R9,R8 // R9 holds the final check.
	LXVD2X (R0)(R9),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0 // Match found at R8+48 bytes, jump out
	BR notfound

// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
#ifndef GOPPC64_power9
#define ADJUST_FOR_CNTLZW -16
#else
#define ADJUST_FOR_CNTLZW 0
#endif

// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
foundat3:
	SUB R3,R8,R3
	ADD $48+ADJUST_FOR_CNTLZW,R3
	BR vfound
foundat2:
	SUB R3,R8,R3
	ADD $32+ADJUST_FOR_CNTLZW,R3
	BR vfound
foundat1:
	SUB R3,R8,R3
	ADD $16+ADJUST_FOR_CNTLZW,R3
	BR vfound
foundat0:
	SUB R3,R8,R3
	ADD $0+ADJUST_FOR_CNTLZW,R3
vfound:
	// Map equal values into a 16 bit value with earlier matches setting higher bits.
#ifndef GOPPC64_power9
	VBPERMQ V6,V0,V6
	MFVRD V6,R4
	CNTLZW R4,R4
#else
#ifdef GOARCH_ppc64le
	// Put the value back into LE ordering by swapping doublewords.
	XXPERMDI V6,V6,$2,V6
#endif
	_VCZBEBB V6,R4
#endif
	ADD R3,R4,R3
	RET
cmp16: // Length 16 - 31
	CMPU R4,$16
	ADD R4,R3,R9
	BLT cmp8
	ADD $-16,R9,R9 // &s[len(s)-16]
	// Bytes 0 - 15
	LXVD2X (R0)(R3),V2
	VCMPEQUBCC V2,V1,V6
	MOVD R3,R8
	BNE CR6,foundat0 // Match found at R8+32 bytes, jump out
	BEQ notfound
	// Bytes 16 - 30
	MOVD R9,R8 // R9 holds the final check.
	LXVD2X (R0)(R9),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0 // Match found at R8+48 bytes, jump out
	BR notfound
cmp8: // Length 8 - 15
#ifdef GOPPC64_power10
	// Load all the bytes into a single VSR in BE order.
	SLD $56,R4,R5
	LXVLL R3,R5,V2
	// Compare and count the number which don't match.
	VCMPEQUB V2,V1,V6
	VCLZLSBB V6,R3
	// If count is the number of bytes, or more. No matches are found.
	CMPU R3,R4
	MOVD $-1,R5
	// Otherwise, the count is the index of the first match.
	ISEL CR0LT,R3,R5,R3
	RET
#else
	RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
	RLDIMI $16,R5,$32,R5
	RLDIMI $32,R5,$0,R5
	CMPU R4,$8
	BLT cmp4
	// Check the first and last 8 bytes (they may overlap) with CMPB,
	// which sets matching bytes to 0xFF.
	MOVD $-8,R11
	ADD $-8,R4,R4
	_LDBEX (R0)(R3),R10
	_LDBEX (R11)(R9),R11
	CMPB R10,R5,R10
	CMPB R11,R5,R11
	CMPU R10,$0
	CMPU R11,$0,CR1
	CNTLZD R10,R10
	CNTLZD R11,R11
	SRD $3,R10,R3
	SRD $3,R11,R11
	BNE found
	ADD R4,R11,R4
	MOVD $-1,R3
	ISEL CR1EQ,R3,R4,R3
	RET
cmp4: // Length 4 - 7
	CMPU R4,$4
	BLT cmp2
	MOVD $-4,R11
	ADD $-4,R4,R4
	_LWBEX (R0)(R3),R10
	_LWBEX (R11)(R9),R11
	CMPB R10,R5,R10
	CMPB R11,R5,R11
	CNTLZW R10,R10
	CNTLZW R11,R11
	CMPU R10,$32
	CMPU R11,$32,CR1
	SRD $3,R10,R3
	SRD $3,R11,R11
	BNE found
	ADD R4,R11,R4
	MOVD $-1,R3
	ISEL CR1EQ,R3,R4,R3
	RET
cmp2: // Length 2 - 3
	CMPU R4,$2
	BLT cmp1
	_LHBEX (R0)(R3),R10
	CMPB R10,R5,R10
	SLDCC $48,R10,R10
	CNTLZD R10,R10
	SRD $3,R10,R3
	BNE found
cmp1: // Length 1
	MOVD $-1,R3
	ANDCC $1,R4,R31
	BEQ found
	MOVBZ -1(R9),R10
	CMPB R10,R5,R10
	ANDCC $1,R10
	ADD $-1,R4
	ISEL CR0EQ,R3,R4,R3
found:
	RET
#endif
notfound:
	MOVD $-1,R3
	RET

View File

@@ -0,0 +1,51 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Simple byte-at-a-time scan.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
	// X10 = b_base
	// X11 = b_len
	// X12 = b_cap (unused)
	// X13 = byte to find
	AND $0xff, X13 // mask argument to a single byte
	MOV X10, X12 // store base for later
	ADD X10, X11 // end
	SUB $1, X10 // pre-decrement so the loop advances first
loop:
	ADD $1, X10
	BEQ X10, X11, notfound
	MOVBU (X10), X14
	BNE X13, X14, loop
	SUB X12, X10 // remove base
	RET
notfound:
	MOV $-1, X10
	RET
// func IndexByteString(s string, c byte) int
// Simple byte-at-a-time scan.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
	// X10 = s_base
	// X11 = s_len
	// X12 = byte to find
	AND $0xff, X12 // mask argument to a single byte
	MOV X10, X13 // store base for later
	ADD X10, X11 // end
	SUB $1, X10 // pre-decrement so the loop advances first
loop:
	ADD $1, X10
	BEQ X10, X11, notfound
	MOVBU (X10), X14
	BNE X12, X14, loop
	SUB X13, X10 // remove base
	RET
notfound:
	MOV $-1, X10
	RET

View File

@@ -0,0 +1,108 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Loads the arguments and tail-jumps to the shared body,
// which writes the result through the &ret pointer in R2.
TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
	MOVD b_base+0(FP), R3// b_base => R3
	MOVD b_len+8(FP), R4 // b_len => R4
	MOVBZ c+24(FP), R5 // c => R5
	MOVD $ret+32(FP), R2 // &ret => R2
	BR indexbytebody<>(SB)
// func IndexByteString(s string, c byte) int
// Loads the arguments and tail-jumps to the shared body,
// which writes the result through the &ret pointer in R2.
TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
	MOVD s_base+0(FP), R3// s_base => R3
	MOVD s_len+8(FP), R4 // s_len => R4
	MOVBZ c+16(FP), R5 // c => R5
	MOVD $ret+24(FP), R2 // &ret => R2
	BR indexbytebody<>(SB)
// indexbytebody is the shared implementation of IndexByte and
// IndexByteString for s390x. Short inputs use a byte loop; longer
// inputs use the vector facility when available, otherwise the
// SRST (search string) instruction.
//
// input:
// R3: s
// R4: s_len
// R5: c -- byte sought
// R2: &ret -- address to put index into
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0
	CMPBEQ R4, $0, notfound // empty input: no match possible
	MOVD R3, R6 // store base for later
	ADD R3, R4, R8 // the address after the end of the string
	//if the length is small, use loop; otherwise, use vector or srst search
	CMPBGE R4, $16, large
residual:
	CMPBEQ R3, R8, notfound
	MOVBZ 0(R3), R7
	LA 1(R3), R3
	CMPBNE R7, R5, residual
found:
	// R3 was advanced past the match; back up one after removing base.
	SUB R6, R3
	SUB $1, R3
	MOVD R3, 0(R2)
	RET
notfound:
	MOVD $-1, 0(R2)
	RET
large:
	// NOTE: the symbol below names internal/cpu.S390X; the package
	// path separator is the Unicode division slash required by the
	// Go assembler for "/" in symbol names.
	MOVBZ internal∕cpu·S390X+const_offsetS390xHasVX(SB), R1
	CMPBNE R1, $0, vectorimpl
srstimpl: // no vector facility
	MOVBZ R5, R0 // c needs to be in R0, leave until last minute as currently R0 is expected to be 0
srstloop:
	WORD $0xB25E0083 // srst %r8, %r3 (search the range [R3, R8))
	BVS srstloop // interrupted - continue
	BGT notfoundr0
foundr0:
	XOR R0, R0 // reset R0
	SUB R6, R8 // remove base
	MOVD R8, 0(R2)
	RET
notfoundr0:
	XOR R0, R0 // reset R0
	MOVD $-1, 0(R2)
	RET
vectorimpl:
	//if the address is not 16byte aligned, use loop for the header
	MOVD R3, R8
	AND $15, R8
	CMPBGT R8, $0, notaligned
aligned:
	ADD R6, R4, R8
	MOVD R8, R7
	AND $-16, R7
	// replicate c across V17
	VLVGB $0, R5, V19
	VREPB $0, V19, V17
vectorloop:
	CMPBGE R3, R7, residual
	VL 0(R3), V16 // load string to be searched into V16
	ADD $16, R3
	VFEEBS V16, V17, V18 // search V17 in V16 and set conditional code accordingly
	BVS vectorloop
	// when vector search found c in the string
	VLGVB $7, V18, R7 // load 7th element of V18 containing index into R7
	SUB $16, R3
	SUB R6, R3
	ADD R3, R7
	MOVD R7, 0(R2)
	RET
notaligned:
	MOVD R3, R8
	AND $-16, R8
	ADD $16, R8
notalignedloop:
	CMPBEQ R3, R8, aligned
	MOVBZ 0(R3), R7
	LA 1(R3), R3
	CMPBNE R7, R5, notalignedloop
	BR found

View File

@@ -0,0 +1,195 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
TEXT ·IndexByte(SB), NOSPLIT, $0-40
	// Call memchr(b_base, c, b_len); it returns a pointer to the
	// first match or 0 if there is none.
	I64Load b_base+0(FP)
	I32WrapI64
	I32Load8U c+24(FP)
	I64Load b_len+8(FP)
	I32WrapI64
	Call memchr<>(SB)
	I64ExtendI32U
	Set R0

	// ret = (R0 == 0) ? -1 : R0 - b_base
	Get SP
	I64Const $-1
	Get R0
	I64Load b_base+0(FP)
	I64Sub
	Get R0
	I64Eqz $0
	Select
	I64Store ret+32(FP)
	RET
// func IndexByteString(s string, c byte) int
TEXT ·IndexByteString(SB), NOSPLIT, $0-32
	// Call memchr(s_base, c, s_len); it returns a pointer to the
	// first match or 0 if there is none.
	Get SP
	I64Load s_base+0(FP)
	I32WrapI64
	I32Load8U c+16(FP)
	I64Load s_len+8(FP)
	I32WrapI64
	Call memchr<>(SB)
	I64ExtendI32U
	Set R0

	// ret = (R0 == 0) ? -1 : R0 - s_base
	I64Const $-1
	Get R0
	I64Load s_base+0(FP)
	I64Sub
	Get R0
	I64Eqz $0
	Select
	I64Store ret+24(FP)
	RET
// memchr scans the R2 bytes starting at address R0 for the byte
// value R1. It returns a pointer to the first occurrence, or 0
// if the byte is not present.
//
// initially compiled with emscripten and then modified over time.
// params:
// R0: s
// R1: c
// R2: len
// ret: index
TEXT memchr<>(SB), NOSPLIT, $0
	Get R1
	Set R4
	Block
	Block
	// Head: scan one byte at a time until R0 is 4-byte aligned,
	// the length is exhausted, or a match is found.
	Get R2
	I32Const $0
	I32Ne
	Tee R3
	Get R0
	I32Const $3
	I32And
	I32Const $0
	I32Ne
	I32And
	If
	Loop
	Get R0
	I32Load8U $0
	Get R1
	I32Eq
	BrIf $2
	Get R2
	I32Const $-1
	I32Add
	Tee R2
	I32Const $0
	I32Ne
	Tee R3
	Get R0
	I32Const $1
	I32Add
	Tee R0
	I32Const $3
	I32And
	I32Const $0
	I32Ne
	I32And
	BrIf $0
	End
	End
	Get R3
	BrIf $0
	I32Const $0
	Set R1
	Br $1
	End
	Get R0
	I32Load8U $0
	Get R4
	Tee R3
	I32Eq
	If
	Get R2
	Set R1
	Else
	// Replicate c into all four bytes of R4 (c * 0x01010101).
	Get R4
	I32Const $16843009
	I32Mul
	Set R4
	Block
	Block
	Get R2
	I32Const $3
	I32GtU
	If
	Get R2
	Set R1
	// Word-at-a-time scan. For x = word ^ repeated-c, the SWAR test
	// ((x & 0x80808080) ^ 0x80808080) & (x - 0x01010101) is nonzero
	// iff some byte of x is zero, i.e. some byte matched c.
	Loop
	Get R0
	I32Load $0
	Get R4
	I32Xor
	Tee R2
	I32Const $-2139062144
	I32And
	I32Const $-2139062144
	I32Xor
	Get R2
	I32Const $-16843009
	I32Add
	I32And
	I32Eqz
	If
	Get R0
	I32Const $4
	I32Add
	Set R0
	Get R1
	I32Const $-4
	I32Add
	Tee R1
	I32Const $3
	I32GtU
	BrIf $1
	Br $3
	End
	End
	Else
	Get R2
	Set R1
	Br $1
	End
	Br $1
	End
	Get R1
	I32Eqz
	If
	I32Const $0
	Set R1
	Br $3
	End
	End
	// Tail: the matching word (or the last <4 bytes) is located
	// byte by byte.
	Loop
	Get R0
	I32Load8U $0
	Get R3
	I32Eq
	BrIf $2
	Get R0
	I32Const $1
	I32Add
	Set R0
	Get R1
	I32Const $-1
	I32Add
	Tee R1
	BrIf $0
	I32Const $0
	Set R1
	End
	End
	End
	// Return R0 (the match address) if the scan stopped on a match
	// (R1 != 0), otherwise 0.
	Get R0
	I32Const $0
	Get R1
	Select
	Return

View File

@@ -0,0 +1,23 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytealg
// LastIndexByte returns the index of the last occurrence of c in s,
// or -1 if c does not appear in s.
func LastIndexByte(s []byte, c byte) int {
	for i := len(s); i > 0; {
		i--
		if s[i] == c {
			return i
		}
	}
	return -1
}
// LastIndexByteString returns the index of the last occurrence of c
// in s, or -1 if c does not appear in s.
func LastIndexByteString(s string, c byte) int {
	for i := len(s); i > 0; {
		i--
		if s[i] == c {
			return i
		}
	}
	return -1
}

View File

@@ -0,0 +1,149 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package byteorder provides functions for decoding and encoding
// little and big endian integer types from/to byte slices.
package byteorder
// LeUint16 decodes the first two bytes of b as a little-endian uint16.
func LeUint16(b []byte) uint16 {
	_ = b[1] // bounds check hint to compiler; see golang.org/issue/14808
	lo, hi := uint16(b[0]), uint16(b[1])
	return hi<<8 | lo
}
// LePutUint16 stores v into b[0:2] in little-endian order.
func LePutUint16(b []byte, v uint16) {
	_ = b[1] // early bounds check to guarantee safety of writes below
	b[1] = byte(v >> 8)
	b[0] = byte(v)
}
// LeAppendUint16 appends the little-endian encoding of v to b
// and returns the extended slice.
func LeAppendUint16(b []byte, v uint16) []byte {
	b = append(b, byte(v))
	return append(b, byte(v>>8))
}
// LeUint32 decodes the first four bytes of b as a little-endian uint32.
func LeUint32(b []byte) uint32 {
	_ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
	var x uint32
	for i := 3; i >= 0; i-- {
		x = x<<8 | uint32(b[i])
	}
	return x
}
// LePutUint32 stores v into b[0:4] in little-endian order.
func LePutUint32(b []byte, v uint32) {
	_ = b[3] // early bounds check to guarantee safety of writes below
	for i := 0; i < 4; i++ {
		b[i] = byte(v >> (8 * i))
	}
}
// LeAppendUint32 appends the little-endian encoding of v to b
// and returns the extended slice.
func LeAppendUint32(b []byte, v uint32) []byte {
	for shift := 0; shift < 32; shift += 8 {
		b = append(b, byte(v>>shift))
	}
	return b
}
// LeUint64 decodes the first eight bytes of b as a little-endian uint64.
func LeUint64(b []byte) uint64 {
	_ = b[7] // bounds check hint to compiler; see golang.org/issue/14808
	var x uint64
	for i := 7; i >= 0; i-- {
		x = x<<8 | uint64(b[i])
	}
	return x
}
// LePutUint64 stores v into b[0:8] in little-endian order.
func LePutUint64(b []byte, v uint64) {
	_ = b[7] // early bounds check to guarantee safety of writes below
	for i := 0; i < 8; i++ {
		b[i] = byte(v >> (8 * i))
	}
}
// LeAppendUint64 appends the little-endian encoding of v to b
// and returns the extended slice.
func LeAppendUint64(b []byte, v uint64) []byte {
	for shift := 0; shift < 64; shift += 8 {
		b = append(b, byte(v>>shift))
	}
	return b
}
// BeUint16 decodes the first two bytes of b as a big-endian uint16.
func BeUint16(b []byte) uint16 {
	_ = b[1] // bounds check hint to compiler; see golang.org/issue/14808
	hi, lo := uint16(b[0]), uint16(b[1])
	return hi<<8 | lo
}
// BePutUint16 stores v into b[0:2] in big-endian order.
func BePutUint16(b []byte, v uint16) {
	_ = b[1] // early bounds check to guarantee safety of writes below
	b[1] = byte(v)
	b[0] = byte(v >> 8)
}
// BeAppendUint16 appends the big-endian encoding of v to b
// and returns the extended slice.
func BeAppendUint16(b []byte, v uint16) []byte {
	b = append(b, byte(v>>8))
	return append(b, byte(v))
}
// BeUint32 decodes the first four bytes of b as a big-endian uint32.
func BeUint32(b []byte) uint32 {
	_ = b[3] // bounds check hint to compiler; see golang.org/issue/14808
	var x uint32
	for _, c := range b[:4] {
		x = x<<8 | uint32(c)
	}
	return x
}
// BePutUint32 stores v into b[0:4] in big-endian order.
func BePutUint32(b []byte, v uint32) {
	_ = b[3] // early bounds check to guarantee safety of writes below
	for i := 0; i < 4; i++ {
		b[i] = byte(v >> (8 * (3 - i)))
	}
}
// BeAppendUint32 appends the big-endian encoding of v to b
// and returns the extended slice.
func BeAppendUint32(b []byte, v uint32) []byte {
	for shift := 24; shift >= 0; shift -= 8 {
		b = append(b, byte(v>>shift))
	}
	return b
}
// BeUint64 decodes the first eight bytes of b as a big-endian uint64.
func BeUint64(b []byte) uint64 {
	_ = b[7] // bounds check hint to compiler; see golang.org/issue/14808
	var x uint64
	for _, c := range b[:8] {
		x = x<<8 | uint64(c)
	}
	return x
}
// BePutUint64 stores v into b[0:8] in big-endian order.
func BePutUint64(b []byte, v uint64) {
	_ = b[7] // early bounds check to guarantee safety of writes below
	for i := 0; i < 8; i++ {
		b[i] = byte(v >> (8 * (7 - i)))
	}
}
// BeAppendUint64 appends the big-endian encoding of v to b
// and returns the extended slice.
func BeAppendUint64(b []byte, v uint64) []byte {
	for shift := 56; shift >= 0; shift -= 8 {
		b = append(b, byte(v>>shift))
	}
	return b
}

72
src/internal/cfg/cfg.go Normal file
View File

@@ -0,0 +1,72 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package cfg holds configuration shared by the Go command and internal/testenv.
// Definitions that don't need to be exposed outside of cmd/go should be in
// cmd/go/internal/cfg instead of this package.
package cfg
// KnownEnv is a list of environment variables that affect the operation
// of the Go command.
// The names are listed one per line, in sorted order
// (new entries should preserve the ordering).
const KnownEnv = `
AR
CC
CGO_CFLAGS
CGO_CFLAGS_ALLOW
CGO_CFLAGS_DISALLOW
CGO_CPPFLAGS
CGO_CPPFLAGS_ALLOW
CGO_CPPFLAGS_DISALLOW
CGO_CXXFLAGS
CGO_CXXFLAGS_ALLOW
CGO_CXXFLAGS_DISALLOW
CGO_ENABLED
CGO_FFLAGS
CGO_FFLAGS_ALLOW
CGO_FFLAGS_DISALLOW
CGO_LDFLAGS
CGO_LDFLAGS_ALLOW
CGO_LDFLAGS_DISALLOW
CXX
FC
GCCGO
GO111MODULE
GO386
GOAMD64
GOARCH
GOARM
GOARM64
GOBIN
GOCACHE
GOCACHEPROG
GOENV
GOEXE
GOEXPERIMENT
GOFLAGS
GOGCCFLAGS
GOHOSTARCH
GOHOSTOS
GOINSECURE
GOMIPS
GOMIPS64
GOMODCACHE
GONOPROXY
GONOSUMDB
GOOS
GOPATH
GOPPC64
GOPRIVATE
GOPROXY
GORISCV64
GOROOT
GOSUMDB
GOTMPDIR
GOTOOLCHAIN
GOTOOLDIR
GOVCS
GOWASM
GOWORK
GO_EXTLINK_ENABLED
PKG_CONFIG
`

View File

@@ -0,0 +1,160 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package chacha8rand implements a pseudorandom generator
// based on ChaCha8. It is used by both runtime and math/rand/v2
// and must have minimal dependencies.
package chacha8rand
import "internal/byteorder"
const (
	ctrInc = 4  // increment counter by 4 between block calls
	ctrMax = 16 // reseed when counter reaches 16
	chunk  = 32 // each chunk produced by block is 32 uint64s
	reseed = 4  // reseed with 4 words
)

// block is the chacha8rand block function. It fills blocks with
// output derived from seed and counter. It has no Go body here;
// the implementation is provided elsewhere (per-architecture
// assembly, or block_generic on systems without one).
func block(seed *[4]uint64, blocks *[32]uint64, counter uint32)
// A State holds the state for a single random generator.
// It must be used from one goroutine at a time.
// If used by multiple goroutines at a time, the goroutines
// may see the same random values, but the code will not
// crash or cause out-of-bounds memory accesses.
type State struct {
	buf  [32]uint64 // current block of generated values
	seed [4]uint64  // seed that produced buf
	i    uint32     // index in buf of next value to hand out
	n    uint32     // number of values in buf available for output
	c    uint32     // counter passed to block when buf was computed
}
// Next returns the next random value, along with a boolean
// indicating whether one was available.
// If one is not available, the caller should call Refill
// and then repeat the call to Next.
//
// Next is //go:nosplit to allow its use in the runtime
// with per-m data without holding the per-m lock.
//
//go:nosplit
func (s *State) Next() (uint64, bool) {
	i := s.i
	if i >= s.n {
		// Buffer exhausted; caller must Refill.
		return 0, false
	}
	s.i = i + 1
	return s.buf[i&31], true // i&31 eliminates bounds check
}
// Init seeds the State with the given seed value,
// interpreting the 32 bytes as four little-endian uint64 words.
func (s *State) Init(seed [32]byte) {
	s.Init64([4]uint64{
		byteorder.LeUint64(seed[0*8:]),
		byteorder.LeUint64(seed[1*8:]),
		byteorder.LeUint64(seed[2*8:]),
		byteorder.LeUint64(seed[3*8:]),
	})
}
// Init64 seeds the state with the given seed value.
// It computes the first block of output immediately,
// so a subsequent Next call will succeed.
func (s *State) Init64(seed [4]uint64) {
	s.seed = seed
	block(&s.seed, &s.buf, 0) // counter restarts at 0 for a fresh seed
	s.c = 0
	s.i = 0
	s.n = chunk
}
// Refill refills the state with more random values.
// After a call to Refill, an immediate call to Next will succeed
// (unless multiple goroutines are incorrectly sharing a state).
func (s *State) Refill() {
	s.c += ctrInc
	if s.c == ctrMax {
		// Reseed with generated uint64s for forward secrecy.
		// Normally this is done immediately after computing a block,
		// but we do it immediately before computing the next block,
		// to allow a much smaller serialized state (just the seed plus offset).
		// This gives a delayed benefit for the forward secrecy
		// (you can reconstruct the recent past given a memory dump),
		// which we deem acceptable in exchange for the reduced size.
		s.seed[0] = s.buf[len(s.buf)-reseed+0]
		s.seed[1] = s.buf[len(s.buf)-reseed+1]
		s.seed[2] = s.buf[len(s.buf)-reseed+2]
		s.seed[3] = s.buf[len(s.buf)-reseed+3]
		s.c = 0
	}
	block(&s.seed, &s.buf, s.c)
	s.i = 0
	s.n = uint32(len(s.buf))
	if s.c == ctrMax-ctrInc {
		// Last block before a reseed: hold back the final reseed
		// words so they are never handed out as output (they become
		// the next seed, per the assignment above).
		s.n = uint32(len(s.buf)) - reseed
	}
}
// Reseed reseeds the state with new random values.
// After a call to Reseed, any previously returned random values
// have been erased from the memory of the state and cannot be
// recovered.
func (s *State) Reseed() {
	var next [4]uint64
	for i := 0; i < len(next); i++ {
		x, ok := s.Next()
		for !ok {
			s.Refill()
			x, ok = s.Next()
		}
		next[i] = x
	}
	s.Init64(next)
}
// Marshal marshals the state into a byte slice.
// Marshal and Unmarshal are functions, not methods,
// so that they will not be linked into the runtime
// when it uses the State struct, since the runtime
// does not need these.
//
// The encoding is 48 bytes: the 8-byte magic "chacha8:",
// a big-endian count of values consumed so far, and the
// four seed words in little-endian order.
func Marshal(s *State) []byte {
	data := make([]byte, 6*8)
	copy(data, "chacha8:") // magic prefix
	// Values consumed from this seed: full chunks accounted for by
	// the counter, plus the offset into the current buffer.
	used := (s.c/ctrInc)*chunk + s.i
	byteorder.BePutUint64(data[1*8:], uint64(used))
	for i, seed := range s.seed {
		byteorder.LePutUint64(data[(2+i)*8:], seed)
	}
	return data
}
// errUnmarshalChaCha8 is the error reported by Unmarshal for
// data that is not a valid marshaled state.
type errUnmarshalChaCha8 struct{}

// Error implements the error interface.
func (*errUnmarshalChaCha8) Error() string {
	return "invalid ChaCha8 encoding"
}
// Unmarshal unmarshals the state from a byte slice
// produced by Marshal, recomputing the generated block
// so the state resumes at the serialized position.
func Unmarshal(s *State, data []byte) error {
	// Validate length and the 8-byte magic prefix.
	if len(data) != 6*8 || string(data[:8]) != "chacha8:" {
		return new(errUnmarshalChaCha8)
	}
	used := byteorder.BeUint64(data[1*8:])
	// Reject counts beyond what one seed can produce
	// (the final reseed words are never handed out).
	if used > (ctrMax/ctrInc)*chunk-reseed {
		return new(errUnmarshalChaCha8)
	}
	for i := range s.seed {
		s.seed[i] = byteorder.LeUint64(data[(2+i)*8:])
	}
	// Recompute the block containing the serialized position.
	s.c = ctrInc * (uint32(used) / chunk)
	block(&s.seed, &s.buf, s.c)
	s.i = uint32(used) % chunk
	s.n = chunk
	if s.c == ctrMax-ctrInc {
		// Last block before a reseed: hold back the reseed words.
		s.n = chunk - reseed
	}
	return nil
}

View File

@@ -0,0 +1,174 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// ChaCha8 is ChaCha with 8 rounds.
// See https://cr.yp.to/chacha/chacha-20080128.pdf.
// See chacha8_generic.go for additional details.
// ROL rotates the uint32s in register R left by N bits, using temporary T.
#define ROL(N, R, T) \
	MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R

// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
// With GOAMD64_v2 a single PSHUFB byte shuffle performs the rotate.
#ifdef GOAMD64_v2
#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
#else
#define ROL16(R, T) ROL(16, R, T)
#endif

// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
#ifdef GOAMD64_v2
#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
#else
#define ROL8(R, T) ROL(8, R, T)
#endif

// QR is the ChaCha quarter-round on A, B, C, and D. T is an available temporary.
#define QR(A, B, C, D, T) \
	PADDD B, A; PXOR A, D; ROL16(D, T); \
	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B; \
	PADDD B, A; PXOR A, D; ROL8(D, T); \
	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B

// REPLREG replicates the register R into 4 uint32s in XR.
#define REPLREG(R, XR) \
	MOVQ R, XR; \
	PSHUFD $0, XR, XR

// REPL replicates the uint32 constant val into 4 uint32s in XR. It smashes DX.
#define REPL(val, XR) \
	MOVL $val, DX; \
	REPLREG(DX, XR)

// SEED copies the off'th uint32 of the seed into the register XR,
// replicating it into all four stripes of the register.
#define SEED(off, reg, XR) \
	MOVL (4*off)(AX), reg; \
	REPLREG(reg, XR) \

// block runs 4 ChaCha8 block transformations in the four stripes of the X registers.
// See chacha8_generic.go for the interlaced layout.
// func block(seed *[8]uint32, blocks *[16][4]uint32, counter uint32)
TEXT ·block<ABIInternal>(SB), NOSPLIT, $16
	// seed in AX
	// blocks in BX
	// counter in CX

	// Load initial constants into top row.
	REPL(0x61707865, X0)
	REPL(0x3320646e, X1)
	REPL(0x79622d32, X2)
	REPL(0x6b206574, X3)

	// Load counter into bottom left cell.
	// Each stripe gets a different counter: 0, 1, 2, 3.
	// (PINSRD is not available in GOAMD64_v1,
	// so just do it in memory on all systems.
	// This is not on the critical path.)
	MOVL CX, 0(SP)
	INCL CX
	MOVL CX, 4(SP)
	INCL CX
	MOVL CX, 8(SP)
	INCL CX
	MOVL CX, 12(SP)
	MOVOU 0(SP), X12

	// Load seed words into next two rows and into DI, SI, R8..R13
	SEED(0, DI, X4)
	SEED(1, SI, X5)
	SEED(2, R8, X6)
	SEED(3, R9, X7)
	SEED(4, R10, X8)
	SEED(5, R11, X9)
	SEED(6, R12, X10)
	SEED(7, R13, X11)

	// Zeros for remaining two matrix entries.
	// We have just enough XMM registers to hold the state,
	// without one for the temporary, so we flush and restore
	// some values to and from memory to provide a temporary.
	// The initial temporary is X15, so zero its memory instead
	// of X15 itself.
	MOVL $0, DX
	MOVQ DX, X13
	MOVQ DX, X14
	MOVOU X14, (15*16)(BX)

	// 4 iterations. Each iteration is 8 quarter-rounds.
	MOVL $4, DX
loop:
	QR(X0, X4, X8, X12, X15)
	MOVOU X4, (4*16)(BX) // save X4
	QR(X1, X5, X9, X13, X15)
	MOVOU (15*16)(BX), X15 // reload X15; temp now X4
	QR(X2, X6, X10, X14, X4)
	QR(X3, X7, X11, X15, X4)

	QR(X0, X5, X10, X15, X4)
	MOVOU X15, (15*16)(BX) // save X15
	QR(X1, X6, X11, X12, X4)
	MOVOU (4*16)(BX), X4 // reload X4; temp now X15
	QR(X2, X7, X8, X13, X15)
	QR(X3, X4, X9, X14, X15)

	DECL DX
	JNZ loop

	// Store interlaced blocks back to output buffer,
	// adding original seed along the way.

	// First the top and bottom rows.
	MOVOU X0, (0*16)(BX)
	MOVOU X1, (1*16)(BX)
	MOVOU X2, (2*16)(BX)
	MOVOU X3, (3*16)(BX)
	MOVOU X12, (12*16)(BX)
	MOVOU X13, (13*16)(BX)
	MOVOU X14, (14*16)(BX)
	// X15 has already been stored.

	// Now we have X0-X3, X12-X15 available for temporaries.
	// Add seed rows back to output. We left seed in DI, SI, R8..R13 above.
	REPLREG(DI, X0)
	REPLREG(SI, X1)
	REPLREG(R8, X2)
	REPLREG(R9, X3)
	REPLREG(R10, X12)
	REPLREG(R11, X13)
	REPLREG(R12, X14)
	REPLREG(R13, X15)
	PADDD X0, X4
	PADDD X1, X5
	PADDD X2, X6
	PADDD X3, X7
	PADDD X12, X8
	PADDD X13, X9
	PADDD X14, X10
	PADDD X15, X11
	MOVOU X4, (4*16)(BX)
	MOVOU X5, (5*16)(BX)
	MOVOU X6, (6*16)(BX)
	MOVOU X7, (7*16)(BX)
	MOVOU X8, (8*16)(BX)
	MOVOU X9, (9*16)(BX)
	MOVOU X10, (10*16)(BX)
	MOVOU X11, (11*16)(BX)

	MOVL $0, AX
	MOVQ AX, X15 // must be 0 on return
	RET

// rotate left 16 indexes for PSHUFB
GLOBL ·rol16<>(SB), NOPTR|RODATA, $16
DATA ·rol16<>+0(SB)/8, $0x0504070601000302
DATA ·rol16<>+8(SB)/8, $0x0D0C0F0E09080B0A

// rotate left 8 indexes for PSHUFB
GLOBL ·rol8<>(SB), NOPTR|RODATA, $16
DATA ·rol8<>+0(SB)/8, $0x0605040702010003
DATA ·rol8<>+8(SB)/8, $0x0E0D0C0F0A09080B

View File

@@ -0,0 +1,104 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// QR is the ChaCha quarter-round on A, B, C, and D.
// V30 is used as a temporary, and V31 is assumed to
// hold the index table for rotate left 8.
// Note: every continued line must end with ';' before the
// backslash so the expanded macro parses as separate instructions
// (the second line below was missing it).
#define QR(A, B, C, D) \
	VADD A.S4, B.S4, A.S4; VEOR D.B16, A.B16, D.B16; VREV32 D.H8, D.H8; \
	VADD C.S4, D.S4, C.S4; VEOR B.B16, C.B16, V30.B16; VSHL $12, V30.S4, B.S4; VSRI $20, V30.S4, B.S4; \
	VADD A.S4, B.S4, A.S4; VEOR D.B16, A.B16, D.B16; VTBL V31.B16, [D.B16], D.B16; \
	VADD C.S4, D.S4, C.S4; VEOR B.B16, C.B16, V30.B16; VSHL $7, V30.S4, B.S4; VSRI $25, V30.S4, B.S4
// block runs 4 ChaCha8 block transformations in the four stripes of the V registers.
// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
TEXT ·block<ABIInternal>(SB), NOSPLIT, $16
	// seed in R0
	// blocks in R1
	// counter in R2

	// Load initial constants into top row.
	MOVD $·chachaConst(SB), R10
	VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]

	// Load increment and rotate 8 constants into V30, V31.
	MOVD $·chachaIncRot(SB), R11
	VLD1 (R11), [V30.S4, V31.S4]

	// Seed rows, each word replicated across its 4 stripes.
	VLD4R.P 16(R0), [V4.S4, V5.S4, V6.S4, V7.S4]
	VLD4R.P 16(R0), [V8.S4, V9.S4, V10.S4, V11.S4]

	// store counter to memory to replicate its uint32 halfs back out
	MOVW R2, 0(RSP)
	VLD1R 0(RSP), [V12.S4]

	// Add 0, 1, 2, 3 to counter stripes.
	VADD V30.S4, V12.S4, V12.S4

	// Zeros for remaining two matrix entries.
	VEOR V13.B16, V13.B16, V13.B16
	VEOR V14.B16, V14.B16, V14.B16
	VEOR V15.B16, V15.B16, V15.B16

	// Save seed state for adding back later.
	VMOV V4.B16, V20.B16
	VMOV V5.B16, V21.B16
	VMOV V6.B16, V22.B16
	VMOV V7.B16, V23.B16
	VMOV V8.B16, V24.B16
	VMOV V9.B16, V25.B16
	VMOV V10.B16, V26.B16
	VMOV V11.B16, V27.B16

	// 4 iterations. Each iteration is 8 quarter-rounds.
	MOVD $4, R0
loop:
	QR(V0, V4, V8, V12)
	QR(V1, V5, V9, V13)
	QR(V2, V6, V10, V14)
	QR(V3, V7, V11, V15)

	QR(V0, V5, V10, V15)
	QR(V1, V6, V11, V12)
	QR(V2, V7, V8, V13)
	QR(V3, V4, V9, V14)

	SUB $1, R0
	CBNZ R0, loop

	// Add seed back.
	VADD V4.S4, V20.S4, V4.S4
	VADD V5.S4, V21.S4, V5.S4
	VADD V6.S4, V22.S4, V6.S4
	VADD V7.S4, V23.S4, V7.S4
	VADD V8.S4, V24.S4, V8.S4
	VADD V9.S4, V25.S4, V9.S4
	VADD V10.S4, V26.S4, V10.S4
	VADD V11.S4, V27.S4, V11.S4

	// Store interlaced blocks back to output buffer.
	VST1.P [ V0.B16, V1.B16, V2.B16, V3.B16], 64(R1)
	VST1.P [ V4.B16, V5.B16, V6.B16, V7.B16], 64(R1)
	VST1.P [ V8.B16, V9.B16, V10.B16, V11.B16], 64(R1)
	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R1)
	RET

// ChaCha matrix constants, same as ChaCha20 ("expand 32-byte k").
GLOBL ·chachaConst(SB), NOPTR|RODATA, $32
DATA ·chachaConst+0x00(SB)/4, $0x61707865
DATA ·chachaConst+0x04(SB)/4, $0x3320646e
DATA ·chachaConst+0x08(SB)/4, $0x79622d32
DATA ·chachaConst+0x0c(SB)/4, $0x6b206574

// First 16 bytes: per-stripe counter increments 0..3 (loaded into V30).
// Last 16 bytes: byte-shuffle index table for rotate-left-8 (loaded into V31).
GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $32
DATA ·chachaIncRot+0x00(SB)/4, $0x00000000
DATA ·chachaIncRot+0x04(SB)/4, $0x00000001
DATA ·chachaIncRot+0x08(SB)/4, $0x00000002
DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003
DATA ·chachaIncRot+0x10(SB)/4, $0x02010003
DATA ·chachaIncRot+0x14(SB)/4, $0x06050407
DATA ·chachaIncRot+0x18(SB)/4, $0x0A09080B
DATA ·chachaIncRot+0x1c(SB)/4, $0x0E0D0C0F

View File

@@ -0,0 +1,235 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// ChaCha8 is ChaCha with 8 rounds.
// See https://cr.yp.to/chacha/chacha-20080128.pdf.
//
// ChaCha8 operates on a 4x4 matrix of uint32 values, initially set to:
//
// const1 const2 const3 const4
// seed seed seed seed
// seed seed seed seed
// counter64 0 0
//
// We use the same constants as ChaCha20 does, a random seed,
// and a counter. Running ChaCha8 on this input produces
// a 4x4 matrix of pseudo-random values with as much entropy
// as the seed.
//
// Given SIMD registers that can hold N uint32s, it is possible
// to run N ChaCha8 block transformations in parallel by filling
// the first register with the N copies of const1, the second
// with N copies of const2, and so on, and then running the operations.
//
// Each iteration of ChaCha8Rand operates over 32 bytes of input and
// produces 992 bytes of RNG output, plus 32 bytes of input for the next
// iteration.
//
// The 32 bytes of input are used as a ChaCha8 key, with a zero nonce, to
// produce 1024 bytes of output (16 blocks, with counters 0 to 15).
// First, for each block, the values 0x61707865, 0x3320646e, 0x79622d32,
// 0x6b206574 are subtracted from the 32-bit little-endian words at
// position 0, 1, 2, and 3 respectively, and an increasing counter
// starting at zero is subtracted from each word at position 12. Then,
// this stream is permuted such that for each sequence of four blocks,
// first we output the first four bytes of each block, then the next four
// bytes of each block, and so on. Finally, the last 32 bytes of output
// are used as the input of the next iteration, and the remaining 992
// bytes are the RNG output.
//
// See https://c2sp.org/chacha8rand for additional details.
//
// Normal ChaCha20 implementations for encryption use this same
// parallelism but then have to deinterlace the results so that
// it appears the blocks were generated separately. For the purposes
// of generating random numbers, the interlacing is fine.
// We are simply locked in to preserving the 4-way interlacing
// in any future optimizations.
package chacha8rand
import (
"internal/goarch"
"unsafe"
)
// setup sets up 4 ChaCha8 blocks in b32 with the counter and seed.
// Note that b32 is [16][4]uint32 not [4][16]uint32: the blocks are interlaced
// the same way they would be in a 4-way SIMD implementations.
func setup(seed *[4]uint64, b32 *[16][4]uint32, counter uint32) {
	// Convert to uint64 to do half as many stores to memory.
	b := (*[16][2]uint64)(unsafe.Pointer(b32))

	// Constants; same as in ChaCha20: "expand 32-byte k"
	b[0][0] = 0x61707865_61707865
	b[0][1] = 0x61707865_61707865
	b[1][0] = 0x3320646e_3320646e
	b[1][1] = 0x3320646e_3320646e
	b[2][0] = 0x79622d32_79622d32
	b[2][1] = 0x79622d32_79622d32
	b[3][0] = 0x6b206574_6b206574
	b[3][1] = 0x6b206574_6b206574

	// Seed values. Each 32-bit half of each seed word is replicated
	// across all 4 stripes of its row (two stripes per uint64 store).
	var x64 uint64
	var x uint32
	x = uint32(seed[0])
	x64 = uint64(x)<<32 | uint64(x)
	b[4][0] = x64
	b[4][1] = x64
	x = uint32(seed[0] >> 32)
	x64 = uint64(x)<<32 | uint64(x)
	b[5][0] = x64
	b[5][1] = x64
	x = uint32(seed[1])
	x64 = uint64(x)<<32 | uint64(x)
	b[6][0] = x64
	b[6][1] = x64
	x = uint32(seed[1] >> 32)
	x64 = uint64(x)<<32 | uint64(x)
	b[7][0] = x64
	b[7][1] = x64
	x = uint32(seed[2])
	x64 = uint64(x)<<32 | uint64(x)
	b[8][0] = x64
	b[8][1] = x64
	x = uint32(seed[2] >> 32)
	x64 = uint64(x)<<32 | uint64(x)
	b[9][0] = x64
	b[9][1] = x64
	x = uint32(seed[3])
	x64 = uint64(x)<<32 | uint64(x)
	b[10][0] = x64
	b[10][1] = x64
	x = uint32(seed[3] >> 32)
	x64 = uint64(x)<<32 | uint64(x)
	b[11][0] = x64
	b[11][1] = x64

	// Counters. Each stripe gets a different counter: counter+0..counter+3.
	if goarch.BigEndian {
		b[12][0] = uint64(counter+0)<<32 | uint64(counter+1)
		b[12][1] = uint64(counter+2)<<32 | uint64(counter+3)
	} else {
		b[12][0] = uint64(counter+0) | uint64(counter+1)<<32
		b[12][1] = uint64(counter+2) | uint64(counter+3)<<32
	}

	// Zeros.
	b[13][0] = 0
	b[13][1] = 0
	b[14][0] = 0
	b[14][1] = 0
	b[15][0] = 0
	b[15][1] = 0
}
// _ statically asserts that block and block_generic have identical
// signatures; it compiles to nothing and is never called.
func _() {
	// block and block_generic must have same type
	x := block
	x = block_generic
	_ = x
}
// block_generic is the non-assembly block implementation,
// for use on systems without special assembly.
// Even on such systems, it is quite fast: on GOOS=386,
// ChaCha8 using this code generates random values faster than PCG-DXSM.
func block_generic(seed *[4]uint64, buf *[32]uint64, counter uint32) {
	// View buf as the interlaced [16][4]uint32 matrix that setup fills.
	b := (*[16][4]uint32)(unsafe.Pointer(buf))

	setup(seed, b, counter)

	// Process each of the 4 interlaced stripes as an independent block.
	for i := range b[0] {
		// Load block i from b[*][i] into local variables.
		b0 := b[0][i]
		b1 := b[1][i]
		b2 := b[2][i]
		b3 := b[3][i]
		b4 := b[4][i]
		b5 := b[5][i]
		b6 := b[6][i]
		b7 := b[7][i]
		b8 := b[8][i]
		b9 := b[9][i]
		b10 := b[10][i]
		b11 := b[11][i]
		b12 := b[12][i]
		b13 := b[13][i]
		b14 := b[14][i]
		b15 := b[15][i]

		// 4 iterations of eight quarter-rounds each is 8 rounds
		for round := 0; round < 4; round++ {
			b0, b4, b8, b12 = qr(b0, b4, b8, b12)
			b1, b5, b9, b13 = qr(b1, b5, b9, b13)
			b2, b6, b10, b14 = qr(b2, b6, b10, b14)
			b3, b7, b11, b15 = qr(b3, b7, b11, b15)

			b0, b5, b10, b15 = qr(b0, b5, b10, b15)
			b1, b6, b11, b12 = qr(b1, b6, b11, b12)
			b2, b7, b8, b13 = qr(b2, b7, b8, b13)
			b3, b4, b9, b14 = qr(b3, b4, b9, b14)
		}

		// Store block i back into b[*][i].
		// Add b4..b11 back to the original key material,
		// like in ChaCha20, to avoid trivial invertibility.
		// There is no entropy in b0..b3 and b12..b15
		// so we can skip the additions and save some time.
		b[0][i] = b0
		b[1][i] = b1
		b[2][i] = b2
		b[3][i] = b3
		b[4][i] += b4
		b[5][i] += b5
		b[6][i] += b6
		b[7][i] += b7
		b[8][i] += b8
		b[9][i] += b9
		b[10][i] += b10
		b[11][i] += b11
		b[12][i] = b12
		b[13][i] = b13
		b[14][i] = b14
		b[15][i] = b15
	}

	if goarch.BigEndian {
		// On a big-endian system, reading the uint32 pairs as uint64s
		// will word-swap them compared to little-endian, so we word-swap
		// them here first to make the next swap get the right answer.
		for i, x := range buf {
			buf[i] = x>>32 | x<<32
		}
	}
}
// qr is the (inlinable) ChaCha8 quarter round.
// It mixes the four state words with add-xor-rotate steps,
// rotating by 16, 12, 8, and 7 bits in turn.
func qr(a, b, c, d uint32) (_a, _b, _c, _d uint32) {
	// First half: rotate d by 16, then b by 12.
	a += b
	x := d ^ a
	d = x<<16 | x>>16
	c += d
	y := b ^ c
	b = y<<12 | y>>20
	// Second half: rotate d by 8, then b by 7.
	a += b
	z := d ^ a
	d = z<<8 | z>>24
	c += d
	w := b ^ c
	b = w<<7 | w>>25
	return a, b, c, d
}

View File

@@ -0,0 +1,12 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 && !arm64
#include "textflag.h"
// func block(seed *[4]uint64, blocks *[32]uint64, counter uint32)
//
// NOTE(review): the previous declaration comment here read
// (counter uint64, seed *[8]uint32, blocks *[16][4]uint32), which does
// not match the Go declaration of block used by this package; corrected
// to agree with the Go-side signature.
// No vector implementation exists for this architecture, so the stub
// tail-jumps to the portable Go version, reusing the caller's frame.
TEXT ·block(SB), NOSPLIT, $0
	JMP	·block_generic(SB)

View File

@@ -0,0 +1,12 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package chacha8rand

// Block and Block_generic export the internal block implementations
// for use by the external tests (package chacha8rand_test).
var Block = block
var Block_generic = block_generic

// Seed returns s's current seed, letting tests observe reseeding.
func Seed(s *State) [4]uint64 {
	return s.seed
}

View File

@@ -0,0 +1,202 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package chacha8rand_test
import (
"bytes"
"encoding/binary"
"fmt"
. "internal/chacha8rand"
"slices"
"testing"
)
// TestOutput checks the generator's stream against the golden output
// vector, refilling the state whenever it runs dry.
func TestOutput(t *testing.T) {
	var s State
	s.Init(seed)
	for i, want := range output {
		// Refill until the state can yield the next value.
		v, ok := s.Next()
		for !ok {
			s.Refill()
			v, ok = s.Next()
		}
		if v != want {
			t.Errorf("#%d: have %#x want %#x", i, v, want)
		}
	}
}
// TestMarshal round-trips the state through Marshal/Unmarshal before
// every value and checks the stream still matches the golden output.
func TestMarshal(t *testing.T) {
	var s State
	s.Init(seed)
	for i, want := range output {
		for {
			// Serialize, wipe, and restore the state each iteration.
			data := Marshal(&s)
			s = State{}
			if err := Unmarshal(&s, data); err != nil {
				t.Fatalf("#%d: Unmarshal: %v", i, err)
			}
			v, ok := s.Next()
			if !ok {
				s.Refill()
				continue
			}
			if v != want {
				t.Fatalf("#%d: have %#x want %#x", i, v, want)
			}
			break
		}
	}
}
// TestReseed verifies that Reseed replaces the stored seed.
func TestReseed(t *testing.T) {
	var s State
	s.Init(seed)
	before := Seed(&s)
	s.Reseed()
	if after := Seed(&s); after == before {
		t.Errorf("Reseed did not change seed")
	}
}
// BenchmarkBlock measures the raw block function on a zero seed,
// reporting throughput in bytes (32 uint64s per call).
func BenchmarkBlock(b *testing.B) {
	var (
		seed   [4]uint64
		blocks [32]uint64
	)
	b.SetBytes(32 * 8)
	for i := 0; i < b.N; i++ {
		Block(&seed, &blocks, 0)
	}
}
// TestBlockGeneric checks that the (possibly assembly) block and the
// pure-Go block_generic produce identical output for the test seed,
// printing a word-by-word comparison on mismatch.
func TestBlockGeneric(t *testing.T) {
	var want, got [32]uint64
	raw := seed // byte seed
	key := [4]uint64{
		binary.LittleEndian.Uint64(raw[0*8:]),
		binary.LittleEndian.Uint64(raw[1*8:]),
		binary.LittleEndian.Uint64(raw[2*8:]),
		binary.LittleEndian.Uint64(raw[3*8:]),
	}

	Block(&key, &want, 4)
	Block_generic(&key, &got, 4)
	if slices.Equal(want[:], got[:]) {
		return
	}

	// Build a side-by-side dump marking the differing words.
	var report bytes.Buffer
	fmt.Fprintf(&report, "%-18s %-18s\n", "block", "block_generic")
	for i := range want {
		note := ""
		if want[i] != got[i] {
			note = " mismatch!"
		}
		fmt.Fprintf(&report, "%#016x %#016x%s\n", want[i], got[i], note)
	}
	t.Errorf("block and block_generic disagree:\n%s", report.String())
}
// Golden output test to make sure algorithm never changes,
// so that its use in math/rand/v2 stays stable.
// See https://c2sp.org/chacha8rand.
var seed = [32]byte([]byte("ABCDEFGHIJKLMNOPQRSTUVWXYZ123456"))
var output = []uint64{
0xb773b6063d4616a5, 0x1160af22a66abc3c, 0x8c2599d9418d287c, 0x7ee07e037edc5cd6,
0xcfaa9ee02d1c16ad, 0x0e090eef8febea79, 0x3c82d271128b5b3e, 0x9c5addc11252a34f,
0xdf79bb617d6ceea6, 0x36d553591f9d736a, 0xeef0d14e181ee01f, 0x089bfc760ae58436,
0xd9e52b59cc2ad268, 0xeb2fb4444b1b8aba, 0x4f95c8a692c46661, 0xc3c6323217cae62c,
0x91ebb4367f4e2e7e, 0x784cf2c6a0ec9bc6, 0x5c34ec5c34eabe20, 0x4f0a8f515570daa8,
0xfc35dcb4113d6bf2, 0x5b0da44c645554bc, 0x6d963da3db21d9e1, 0xeeaefc3150e500f3,
0x2d37923dda3750a5, 0x380d7a626d4bc8b0, 0xeeaf68ede3d7ee49, 0xf4356695883b717c,
0x846a9021392495a4, 0x8e8510549630a61b, 0x18dc02545dbae493, 0x0f8f9ff0a65a3d43,
0xccf065f7190ff080, 0xfd76d1aa39673330, 0x95d232936cba6433, 0x6c7456d1070cbd17,
0x462acfdaff8c6562, 0x5bafab866d34fc6a, 0x0c862f78030a2988, 0xd39a83e407c3163d,
0xc00a2b7b45f22ebf, 0x564307c62466b1a9, 0x257e0424b0c072d4, 0x6fb55e99496c28fe,
0xae9873a88f5cd4e0, 0x4657362ac60d3773, 0x1c83f91ecdf23e8e, 0x6fdc0792c15387c0,
0x36dad2a30dfd2b5c, 0xa4b593290595bdb7, 0x4de18934e4cc02c5, 0xcdc0d604f015e3a7,
0xfba0dbf69ad80321, 0x60e8bea3d139de87, 0xd18a4d851ef48756, 0x6366447c2215f34a,
0x05682e97d3d007ee, 0x4c0e8978c6d54ab2, 0xcf1e9f6a6712edc2, 0x061439414c80cfd3,
0xd1a8b6e2745c0ead, 0x31a7918d45c410e8, 0xabcc61ad90216eec, 0x4040d92d2032a71a,
0x3cd2f66ffb40cd68, 0xdcd051c07295857a, 0xeab55cbcd9ab527e, 0x18471dce781bdaac,
0xf7f08cd144dc7252, 0x5804e0b13d7f40d1, 0x5cb1a446e4b2d35b, 0xe6d4a728d2138a06,
0x05223e40ca60dad8, 0x2d61ec3206ac6a68, 0xab692356874c17b8, 0xc30954417676de1c,
0x4f1ace3732225624, 0xfba9510813988338, 0x997f200f52752e11, 0x1116aaafe86221fa,
0x07ce3b5cb2a13519, 0x2956bc72bc458314, 0x4188b7926140eb78, 0x56ca6dbfd4adea4d,
0x7fe3c22349340ce5, 0x35c08f9c37675f8a, 0x11e1c7fbef5ed521, 0x98adc8464ec1bc75,
0xd163b2c73d1203f8, 0x8c761ee043a2f3f3, 0x24b99d6accecd7b7, 0x793e31aa112f0370,
0x8e87dc2a19285139, 0x4247ae04f7096e25, 0x514f3122926fe20f, 0xdc6fb3f045d2a7e9,
0x15cb30cecdd18eba, 0xcbc7fdecf6900274, 0x3fb5c696dc8ba021, 0xd1664417c8d274e6,
0x05f7e445ea457278, 0xf920bbca1b9db657, 0x0c1950b4da22cb99, 0xf875baf1af09e292,
0xbed3d7b84250f838, 0xf198e8080fd74160, 0xc9eda51d9b7ea703, 0xf709ef55439bf8f6,
0xd20c74feebf116fc, 0x305668eb146d7546, 0x829af3ec10d89787, 0x15b8f9697b551dbc,
0xfc823c6c8e64b8c9, 0x345585e8183b40bc, 0x674b4171d6581368, 0x1234d81cd670e9f7,
0x0e505210d8a55e19, 0xe8258d69eeeca0dc, 0x05d4c452e8baf67e, 0xe8dbe30116a45599,
0x1cf08ce1b1176f00, 0xccf7d0a4b81ecb49, 0x303fea136b2c430e, 0x861d6c139c06c871,
0x5f41df72e05e0487, 0x25bd7e1e1ae26b1d, 0xbe9f4004d662a41d, 0x65bf58d483188546,
0xd1b27cff69db13cc, 0x01a6663372c1bb36, 0x578dd7577b727f4d, 0x19c78f066c083cf6,
0xdbe014d4f9c391bb, 0x97fbb2dd1d13ffb3, 0x31c91e0af9ef8d4f, 0x094dfc98402a43ba,
0x069bd61bea37b752, 0x5b72d762e8d986ca, 0x72ee31865904bc85, 0xd1f5fdc5cd36c33e,
0xba9b4980a8947cad, 0xece8f05eac49ab43, 0x65fe1184abae38e7, 0x2d7cb9dea5d31452,
0xcc71489476e467e3, 0x4c03a258a578c68c, 0x00efdf9ecb0fd8fc, 0x9924cad471e2666d,
0x87f8668318f765e9, 0xcb4dc57c1b55f5d8, 0xd373835a86604859, 0xe526568b5540e482,
0x1f39040f08586fec, 0xb764f3f00293f8e6, 0x049443a2f6bd50a8, 0x76fec88697d3941a,
0x3efb70d039bae7a2, 0xe2f4611368eca8a8, 0x7c007a96e01d2425, 0xbbcce5768e69c5bf,
0x784fb4985c42aac3, 0xf72b5091aa223874, 0x3630333fb1e62e07, 0x8e7319ebdebbb8de,
0x2a3982bca959fa00, 0xb2b98b9f964ba9b3, 0xf7e31014adb71951, 0xebd0fca3703acc82,
0xec654e2a2fe6419a, 0xb326132d55a52e2c, 0x2248c57f44502978, 0x32710c2f342daf16,
0x0517b47b5acb2bec, 0x4c7a718fca270937, 0xd69142bed0bcc541, 0xe40ebcb8ff52ce88,
0x3e44a2dbc9f828d4, 0xc74c2f4f8f873f58, 0x3dbf648eb799e45b, 0x33f22475ee0e86f8,
0x1eb4f9ee16d47f65, 0x40f8d2b8712744e3, 0xb886b4da3cb14572, 0x2086326fbdd6f64d,
0xcc3de5907dd882b9, 0xa2e8b49a5ee909df, 0xdbfb8e7823964c10, 0x70dd6089ef0df8d5,
0x30141663cdd9c99f, 0x04b805325c240365, 0x7483d80314ac12d6, 0x2b271cb91aa7f5f9,
0x97e2245362abddf0, 0x5a84f614232a9fab, 0xf71125fcda4b7fa2, 0x1ca5a61d74b27267,
0x38cc6a9b3adbcb45, 0xdde1bb85dc653e39, 0xe9d0c8fa64f89fd4, 0x02c5fb1ecd2b4188,
0xf2bd137bca5756e5, 0xadefe25d121be155, 0x56cd1c3c5d893a8e, 0x4c50d337beb65bb9,
0x918c5151675cf567, 0xaba649ffcfb56a1e, 0x20c74ab26a2247cd, 0x71166bac853c08da,
0xb07befe2e584fc5d, 0xda45ff2a588dbf32, 0xdb98b03c4d75095e, 0x60285ae1aaa65a4c,
0xf93b686a263140b8, 0xde469752ee1c180e, 0xcec232dc04129aae, 0xeb916baa1835ea04,
0xd49c21c8b64388ff, 0x72a82d9658864888, 0x003348ef7eac66a8, 0x7f6f67e655b209eb,
0x532ffb0b7a941b25, 0xd940ade6128deede, 0xdf24f2a1af89fe23, 0x95aa3b4988195ae0,
0x3da649404f94be4a, 0x692dad132c3f7e27, 0x40aee76ecaaa9eb8, 0x1294a01e09655024,
0x6df797abdba4e4f5, 0xea2fb6024c1d7032, 0x5f4e0492295489fc, 0x57972914ea22e06a,
0x9a8137d133aad473, 0xa2e6dd6ae7cdf2f3, 0x9f42644f18086647, 0x16d03301c170bd3e,
0x908c416fa546656d, 0xe081503be22e123e, 0x077cf09116c4cc72, 0xcbd25cd264b7f229,
0x3db2f468ec594031, 0x46c00e734c9badd5, 0xd0ec0ac72075d861, 0x3037cb3cf80b7630,
0x574c3d7b3a2721c6, 0xae99906a0076824b, 0xb175a5418b532e70, 0xd8b3e251ee231ddd,
0xb433eec25dca1966, 0x530f30dc5cff9a93, 0x9ff03d98b53cd335, 0xafc4225076558cdf,
0xef81d3a28284402a, 0x110bdbf51c110a28, 0x9ae1b255d027e8f6, 0x7de3e0aa24688332,
0xe483c3ecd2067ee2, 0xf829328b276137e6, 0xa413ccad57562cad, 0xe6118e8b496acb1f,
0x8288dca6da5ec01f, 0xa53777dc88c17255, 0x8a00f1e0d5716eda, 0x618e6f47b7a720a8,
0x9e3907b0c692a841, 0x978b42ca963f34f3, 0x75e4b0cd98a7d7ef, 0xde4dbd6e0b5f4752,
0x0252e4153f34493f, 0x50f0e7d803734ef9, 0x237766a38ed167ee, 0x4124414001ee39a0,
0xd08df643e535bb21, 0x34f575b5a9a80b74, 0x2c343af87297f755, 0xcd8b6d99d821f7cb,
0xe376fd7256fc48ae, 0xe1b06e7334352885, 0xfa87b26f86c169eb, 0x36c1604665a971de,
0xdba147c2239c8e80, 0x6b208e69fc7f0e24, 0x8795395b6f2b60c3, 0x05dabee9194907f4,
0xb98175142f5ed902, 0x5e1701e2021ddc81, 0x0875aba2755eed08, 0x778d83289251de95,
0x3bfbe46a039ecb31, 0xb24704fce4cbd7f9, 0x6985ffe9a7c91e3d, 0xc8efb13df249dabb,
0xb1037e64b0f4c9f6, 0x55f69fd197d6b7c3, 0x672589d71d68a90c, 0xbebdb8224f50a77e,
0x3f589f80007374a7, 0xd307f4635954182a, 0xcff5850c10d4fd90, 0xc6da02dfb6408e15,
0x93daeef1e2b1a485, 0x65d833208aeea625, 0xe2b13fa13ed3b5fa, 0x67053538130fb68e,
0xc1042f6598218fa9, 0xee5badca749b8a2e, 0x6d22a3f947dae37d, 0xb62c6d1657f4dbaf,
0x6e007de69704c20b, 0x1af2b913fc3841d8, 0xdc0e47348e2e8e22, 0x9b1ddef1cf958b22,
0x632ed6b0233066b8, 0xddd02d3311bed8f2, 0xf147cfe1834656e9, 0x399aaa49d511597a,
0x6b14886979ec0309, 0x64fc4ac36b5afb97, 0xb82f78e07f7cf081, 0x10925c9a323d0e1b,
0xf451c79ee13c63f6, 0x7c2fc180317876c7, 0x35a12bd9eecb7d22, 0x335654a539621f90,
0xcc32a3f35db581f0, 0xc60748a80b2369cb, 0x7c4dd3b08591156b, 0xac1ced4b6de22291,
0xa32cfa2df134def5, 0x627108918dea2a53, 0x0555b1608fcb4ff4, 0x143ee7ac43aaa33c,
0xdae90ce7cf4fc218, 0x4d68fc2582bcf4b5, 0x37094e1849135d71, 0xf7857e09f3d49fd8,
0x007538c503768be7, 0xedf648ba2f6be601, 0xaa347664dd72513e, 0xbe63893c6ef23b86,
0x130b85710605af97, 0xdd765c6b1ef6ab56, 0xf3249a629a97dc6b, 0x2a114f9020fab8e5,
0x5a69e027cfc6ad08, 0x3c4ccb36f1a5e050, 0x2e9e7d596834f0a5, 0x2430be6858fce789,
0xe90b862f2466e597, 0x895e2884f159a9ec, 0x26ab8fa4902fcb57, 0xa6efff5c54e1fa50,
0x333ac4e5811a8255, 0xa58d515f02498611, 0xfe5a09dcb25c6ef4, 0x03898988ab5f5818,
0x289ff6242af6c617, 0x3d9dd59fd381ea23, 0x52d7d93d8a8aae51, 0xc76a123d511f786f,
0xf68901edaf00c46c, 0x8c630871b590de80, 0x05209c308991e091, 0x1f809f99b4788177,
0x11170c2eb6c19fd8, 0x44433c779062ba58, 0xc0acb51af1874c45, 0x9f2e134284809fa1,
0xedb523bd15c619fa, 0x02d97fd53ecc23c0, 0xacaf05a34462374c, 0xddd9c6d34bffa11f,
}

Some files were not shown because too many files have changed in this diff Show More