Bump crypto package. (#492)

This fixes master build for certain CPU's. See https://github.com/golang/crypto/commit/ae8bce0
2019-01-08 12:30:18 -08:00
parent 323c02d468
commit d8736e23f5
29 changed files with 1790 additions and 86 deletions
--- a/Gopkg.lock
+++ b/Gopkg.lock
@@ -1344,10 +1344,11 @@

 [[projects]]
  branch = "master"
-  digest = "1:4ad75195b51cdd9b2a075c389e9d507394bc33b82d3ff8d9a32107e8fe9a391e"
+  digest = "1:4906a472dae724cd9a033dd55eba6e3ad0deb45e76e054697b2739786459a62f"
  name = "golang.org/x/crypto"
  packages = [
    "blake2b",
+    "internal/subtle",
    "nacl/secretbox",
    "pkcs12",
    "pkcs12/internal/rc2",
@@ -1356,7 +1357,7 @@
    "ssh/terminal",
  ]
  pruneopts = "NUT"
-  revision = "94eea52f7b742c7cbe0b03b22f0c4c8631ece122"
+  revision = "ff983b9c42bc9fbf91556e191cc8efb585c16908"

 [[projects]]
  branch = "master"
@@ -1402,9 +1403,10 @@

 [[projects]]
  branch = "master"
-  digest = "1:ee40f9506736b3b55162b22d82911b797301904848b1f1ae5db49f561ad48a79"
+  digest = "1:e20cbdb894c1abfe36062069599289d5b8b1d77be5e940eae0cb02f3f4b92377"
  name = "golang.org/x/sys"
  packages = [
+    "cpu",
    "unix",
    "windows",
  ]
--- a/vendor/golang.org/x/crypto/blake2b/blake2b.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b.go
@@ -39,7 +39,10 @@ var (
 	useSSE4 bool
 )

-var errKeySize = errors.New("blake2b: invalid key size")
+var (
+	errKeySize  = errors.New("blake2b: invalid key size")
+	errHashSize = errors.New("blake2b: invalid hash size")
+)

 var iv = [8]uint64{
 	0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
@@ -83,7 +86,20 @@ func New384(key []byte) (hash.Hash, error) { return newDigest(Size384, key) }
 // key turns the hash into a MAC. The key must between zero and 64 bytes long.
 func New256(key []byte) (hash.Hash, error) { return newDigest(Size256, key) }

+// New returns a new hash.Hash computing the BLAKE2b checksum with a custom length.
+// A non-nil key turns the hash into a MAC. The key must between zero and 64 bytes long.
+// The hash size can be a value between 1 and 64 but it is highly recommended to use
+// values equal or greater than:
+// - 32 if BLAKE2b is used as a hash function (The key is zero bytes long).
+// - 16 if BLAKE2b is used as a MAC function (The key is at least 16 bytes long).
+// When the key is nil, the returned hash.Hash implements BinaryMarshaler
+// and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash.
+func New(size int, key []byte) (hash.Hash, error) { return newDigest(size, key) }
+
 func newDigest(hashSize int, key []byte) (*digest, error) {
+	if hashSize < 1 || hashSize > Size {
+		return nil, errHashSize
+	}
 	if len(key) > Size {
 		return nil, errKeySize
 	}
@@ -136,6 +152,50 @@ type digest struct {
 	keyLen int
 }

+const (
+	magic         = "b2b"
+	marshaledSize = len(magic) + 8*8 + 2*8 + 1 + BlockSize + 1
+)
+
+func (d *digest) MarshalBinary() ([]byte, error) {
+	if d.keyLen != 0 {
+		return nil, errors.New("crypto/blake2b: cannot marshal MACs")
+	}
+	b := make([]byte, 0, marshaledSize)
+	b = append(b, magic...)
+	for i := 0; i < 8; i++ {
+		b = appendUint64(b, d.h[i])
+	}
+	b = appendUint64(b, d.c[0])
+	b = appendUint64(b, d.c[1])
+	// Maximum value for size is 64
+	b = append(b, byte(d.size))
+	b = append(b, d.block[:]...)
+	b = append(b, byte(d.offset))
+	return b, nil
+}
+
+func (d *digest) UnmarshalBinary(b []byte) error {
+	if len(b) < len(magic) || string(b[:len(magic)]) != magic {
+		return errors.New("crypto/blake2b: invalid hash state identifier")
+	}
+	if len(b) != marshaledSize {
+		return errors.New("crypto/blake2b: invalid hash state size")
+	}
+	b = b[len(magic):]
+	for i := 0; i < 8; i++ {
+		b, d.h[i] = consumeUint64(b)
+	}
+	b, d.c[0] = consumeUint64(b)
+	b, d.c[1] = consumeUint64(b)
+	d.size = int(b[0])
+	b = b[1:]
+	copy(d.block[:], b[:BlockSize])
+	b = b[BlockSize:]
+	d.offset = int(b[0])
+	return nil
+}
+
 func (d *digest) BlockSize() int { return BlockSize }

 func (d *digest) Size() int { return d.size }
@@ -205,3 +265,25 @@ func (d *digest) finalize(hash *[Size]byte) {
 		binary.LittleEndian.PutUint64(hash[8*i:], v)
 	}
 }
+
+func appendUint64(b []byte, x uint64) []byte {
+	var a [8]byte
+	binary.BigEndian.PutUint64(a[:], x)
+	return append(b, a[:]...)
+}
+
+func appendUint32(b []byte, x uint32) []byte {
+	var a [4]byte
+	binary.BigEndian.PutUint32(a[:], x)
+	return append(b, a[:]...)
+}
+
+func consumeUint64(b []byte) ([]byte, uint64) {
+	x := binary.BigEndian.Uint64(b)
+	return b[8:], x
+}
+
+func consumeUint32(b []byte) ([]byte, uint32) {
+	x := binary.BigEndian.Uint32(b)
+	return b[4:], x
+}
--- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.go
@@ -6,21 +6,14 @@

 package blake2b

+import "golang.org/x/sys/cpu"
+
 func init() {
-	useAVX2 = supportsAVX2()
-	useAVX = supportsAVX()
-	useSSE4 = supportsSSE4()
+	useAVX2 = cpu.X86.HasAVX2
+	useAVX = cpu.X86.HasAVX
+	useSSE4 = cpu.X86.HasSSE41
 }

-//go:noescape
-func supportsSSE4() bool
-
-//go:noescape
-func supportsAVX() bool
-
-//go:noescape
-func supportsAVX2() bool
-
 //go:noescape
 func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)

@@ -31,13 +24,14 @@ func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)

 func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
-	if useAVX2 {
+	switch {
+	case useAVX2:
 		hashBlocksAVX2(h, c, flag, blocks)
-	} else if useAVX {
+	case useAVX:
 		hashBlocksAVX(h, c, flag, blocks)
-	} else if useSSE4 {
+	case useSSE4:
 		hashBlocksSSE4(h, c, flag, blocks)
-	} else {
+	default:
 		hashBlocksGeneric(h, c, flag, blocks)
 	}
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
@@ -748,15 +748,3 @@ noinc:

 	MOVQ BP, SP
 	RET
-
-// func supportsAVX2() bool
-TEXT ·supportsAVX2(SB), 4, $0-1
-	MOVQ runtime·support_avx2(SB), AX
-	MOVB AX, ret+0(FP)
-	RET
-
-// func supportsAVX() bool
-TEXT ·supportsAVX(SB), 4, $0-1
-	MOVQ runtime·support_avx(SB), AX
-	MOVB AX, ret+0(FP)
-	RET
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.go
@@ -6,12 +6,11 @@

 package blake2b

-func init() {
-	useSSE4 = supportsSSE4()
-}
+import "golang.org/x/sys/cpu"

-//go:noescape
-func supportsSSE4() bool
+func init() {
+	useSSE4 = cpu.X86.HasSSE41
+}

 //go:noescape
 func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
@@ -279,12 +279,3 @@ noinc:

 	MOVQ BP, SP
 	RET
-
-// func supportsSSE4() bool
-TEXT ·supportsSSE4(SB), 4, $0-1
-	MOVL $1, AX
-	CPUID
-	SHRL $19, CX  // Bit 19 indicates SSE4 support
-	ANDL $1, CX  // CX != 0 if support SSE4
-	MOVB CX, ret+0(FP)
-	RET
--- a/vendor/golang.org/x/crypto/blake2b/blake2x.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2x.go
@@ -29,7 +29,7 @@ type XOF interface {
 }

 // OutputLengthUnknown can be used as the size argument to NewXOF to indicate
-// the the length of the output is not known in advance.
+// the length of the output is not known in advance.
 const OutputLengthUnknown = 0

 // magicUnknownOutputLength is a magic value for the output size that indicates
--- a/vendor/golang.org/x/crypto/internal/subtle/aliasing.go
+++ b/vendor/golang.org/x/crypto/internal/subtle/aliasing.go
@@ -0,0 +1,32 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+
+// Package subtle implements functions that are often useful in cryptographic
+// code but require careful thought to use correctly.
+package subtle // import "golang.org/x/crypto/internal/subtle"
+
+import "unsafe"
+
+// AnyOverlap reports whether x and y share memory at any (not necessarily
+// corresponding) index. The memory beyond the slice length is ignored.
+func AnyOverlap(x, y []byte) bool {
+	return len(x) > 0 && len(y) > 0 &&
+		uintptr(unsafe.Pointer(&x[0])) <= uintptr(unsafe.Pointer(&y[len(y)-1])) &&
+		uintptr(unsafe.Pointer(&y[0])) <= uintptr(unsafe.Pointer(&x[len(x)-1]))
+}
+
+// InexactOverlap reports whether x and y share memory at any non-corresponding
+// index. The memory beyond the slice length is ignored. Note that x and y can
+// have different lengths and still not have any inexact overlap.
+//
+// InexactOverlap can be used to implement the requirements of the crypto/cipher
+// AEAD, Block, BlockMode and Stream interfaces.
+func InexactOverlap(x, y []byte) bool {
+	if len(x) == 0 || len(y) == 0 || &x[0] == &y[0] {
+		return false
+	}
+	return AnyOverlap(x, y)
+}
--- a/vendor/golang.org/x/crypto/internal/subtle/aliasing_appengine.go
+++ b/vendor/golang.org/x/crypto/internal/subtle/aliasing_appengine.go
@@ -0,0 +1,35 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build appengine
+
+// Package subtle implements functions that are often useful in cryptographic
+// code but require careful thought to use correctly.
+package subtle // import "golang.org/x/crypto/internal/subtle"
+
+// This is the Google App Engine standard variant based on reflect
+// because the unsafe package and cgo are disallowed.
+
+import "reflect"
+
+// AnyOverlap reports whether x and y share memory at any (not necessarily
+// corresponding) index. The memory beyond the slice length is ignored.
+func AnyOverlap(x, y []byte) bool {
+	return len(x) > 0 && len(y) > 0 &&
+		reflect.ValueOf(&x[0]).Pointer() <= reflect.ValueOf(&y[len(y)-1]).Pointer() &&
+		reflect.ValueOf(&y[0]).Pointer() <= reflect.ValueOf(&x[len(x)-1]).Pointer()
+}
+
+// InexactOverlap reports whether x and y share memory at any non-corresponding
+// index. The memory beyond the slice length is ignored. Note that x and y can
+// have different lengths and still not have any inexact overlap.
+//
+// InexactOverlap can be used to implement the requirements of the crypto/cipher
+// AEAD, Block, BlockMode and Stream interfaces.
+func InexactOverlap(x, y []byte) bool {
+	if len(x) == 0 || len(y) == 0 || &x[0] == &y[0] {
+		return false
+	}
+	return AnyOverlap(x, y)
+}
--- a/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go
+++ b/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go
@@ -35,6 +35,7 @@ This package is interoperable with NaCl: https://nacl.cr.yp.to/secretbox.html.
 package secretbox // import "golang.org/x/crypto/nacl/secretbox"

 import (
+	"golang.org/x/crypto/internal/subtle"
 	"golang.org/x/crypto/poly1305"
 	"golang.org/x/crypto/salsa20/salsa"
 )
@@ -87,6 +88,9 @@ func Seal(out, message []byte, nonce *[24]byte, key *[32]byte) []byte {
 	copy(poly1305Key[:], firstBlock[:])

 	ret, out := sliceForAppend(out, len(message)+poly1305.TagSize)
+	if subtle.AnyOverlap(out, message) {
+		panic("nacl: invalid buffer overlap")
+	}

 	// We XOR up to 32 bytes of message with the keystream generated from
 	// the first block.
@@ -118,7 +122,7 @@ func Seal(out, message []byte, nonce *[24]byte, key *[32]byte) []byte {
 // Open authenticates and decrypts a box produced by Seal and appends the
 // message to out, which must not overlap box. The output will be Overhead
 // bytes smaller than box.
-func Open(out []byte, box []byte, nonce *[24]byte, key *[32]byte) ([]byte, bool) {
+func Open(out, box []byte, nonce *[24]byte, key *[32]byte) ([]byte, bool) {
 	if len(box) < Overhead {
 		return nil, false
 	}
@@ -143,6 +147,9 @@ func Open(out []byte, box []byte, nonce *[24]byte, key *[32]byte) ([]byte, bool)
 	}

 	ret, out := sliceForAppend(out, len(box)-Overhead)
+	if subtle.AnyOverlap(out, box) {
+		panic("nacl: invalid buffer overlap")
+	}

 	// We XOR up to 32 bytes of box with the keystream generated from
 	// the first block.
--- a/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
@@ -0,0 +1,14 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x,!go1.11 !arm,!amd64,!s390x gccgo appengine nacl
+
+package poly1305
+
+// Sum generates an authenticator for msg using a one-time key and puts the
+// 16-byte result into out. Authenticating two different messages with the same
+// key allows an attacker to forge messages at will.
+func Sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
+	sumGeneric(out, msg, key)
+}
--- a/vendor/golang.org/x/crypto/poly1305/sum_ref.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_ref.go
@@ -2,16 +2,14 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build !amd64,!arm gccgo appengine nacl
-
 package poly1305

 import "encoding/binary"

-// Sum generates an authenticator for msg using a one-time key and puts the
-// 16-byte result into out. Authenticating two different messages with the same
-// key allows an attacker to forge messages at will.
-func Sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
+// sumGeneric generates an authenticator for msg using a one-time key and
+// puts the 16-byte result into out. This is the generic implementation of
+// Sum and should be called if no assembly implementation is available.
+func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
 	var (
 		h0, h1, h2, h3, h4 uint32 // the hash accumulators
 		r0, r1, r2, r3, r4 uint64 // the r part of the key
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
@@ -0,0 +1,49 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x,go1.11,!gccgo,!appengine
+
+package poly1305
+
+// hasVectorFacility reports whether the machine supports
+// the vector facility (vx).
+func hasVectorFacility() bool
+
+// hasVMSLFacility reports whether the machine supports
+// Vector Multiply Sum Logical (VMSL).
+func hasVMSLFacility() bool
+
+var hasVX = hasVectorFacility()
+var hasVMSL = hasVMSLFacility()
+
+// poly1305vx is an assembly implementation of Poly1305 that uses vector
+// instructions. It must only be called if the vector facility (vx) is
+// available.
+//go:noescape
+func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+
+// poly1305vmsl is an assembly implementation of Poly1305 that uses vector
+// instructions, including VMSL. It must only be called if the vector facility (vx) is
+// available and if VMSL is supported.
+//go:noescape
+func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+
+// Sum generates an authenticator for m using a one-time key and puts the
+// 16-byte result into out. Authenticating two different messages with the same
+// key allows an attacker to forge messages at will.
+func Sum(out *[16]byte, m []byte, key *[32]byte) {
+	if hasVX {
+		var mPtr *byte
+		if len(m) > 0 {
+			mPtr = &m[0]
+		}
+		if hasVMSL && len(m) > 256 {
+			poly1305vmsl(out, mPtr, uint64(len(m)), key)
+		} else {
+			poly1305vx(out, mPtr, uint64(len(m)), key)
+		}
+	} else {
+		sumGeneric(out, m, key)
+	}
+}
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
@@ -0,0 +1,400 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x,go1.11,!gccgo,!appengine
+
+#include "textflag.h"
+
+// Implementation of Poly1305 using the vector facility (vx).
+
+// constants
+#define MOD26 V0
+#define EX0   V1
+#define EX1   V2
+#define EX2   V3
+
+// temporaries
+#define T_0 V4
+#define T_1 V5
+#define T_2 V6
+#define T_3 V7
+#define T_4 V8
+
+// key (r)
+#define R_0  V9
+#define R_1  V10
+#define R_2  V11
+#define R_3  V12
+#define R_4  V13
+#define R5_1 V14
+#define R5_2 V15
+#define R5_3 V16
+#define R5_4 V17
+#define RSAVE_0 R5
+#define RSAVE_1 R6
+#define RSAVE_2 R7
+#define RSAVE_3 R8
+#define RSAVE_4 R9
+#define R5SAVE_1 V28
+#define R5SAVE_2 V29
+#define R5SAVE_3 V30
+#define R5SAVE_4 V31
+
+// message block
+#define F_0 V18
+#define F_1 V19
+#define F_2 V20
+#define F_3 V21
+#define F_4 V22
+
+// accumulator
+#define H_0 V23
+#define H_1 V24
+#define H_2 V25
+#define H_3 V26
+#define H_4 V27
+
+GLOBL ·keyMask<>(SB), RODATA, $16
+DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
+DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
+
+GLOBL ·bswapMask<>(SB), RODATA, $16
+DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
+DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
+
+GLOBL ·constants<>(SB), RODATA, $64
+// MOD26
+DATA ·constants<>+0(SB)/8, $0x3ffffff
+DATA ·constants<>+8(SB)/8, $0x3ffffff
+// EX0
+DATA ·constants<>+16(SB)/8, $0x0006050403020100
+DATA ·constants<>+24(SB)/8, $0x1016151413121110
+// EX1
+DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
+DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
+// EX2
+DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
+DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
+
+// h = (f*g) % (2**130-5) [partial reduction]
+#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
+	VMLOF  f0, g0, h0        \
+	VMLOF  f0, g1, h1        \
+	VMLOF  f0, g2, h2        \
+	VMLOF  f0, g3, h3        \
+	VMLOF  f0, g4, h4        \
+	VMLOF  f1, g54, T_0      \
+	VMLOF  f1, g0, T_1       \
+	VMLOF  f1, g1, T_2       \
+	VMLOF  f1, g2, T_3       \
+	VMLOF  f1, g3, T_4       \
+	VMALOF f2, g53, h0, h0   \
+	VMALOF f2, g54, h1, h1   \
+	VMALOF f2, g0, h2, h2    \
+	VMALOF f2, g1, h3, h3    \
+	VMALOF f2, g2, h4, h4    \
+	VMALOF f3, g52, T_0, T_0 \
+	VMALOF f3, g53, T_1, T_1 \
+	VMALOF f3, g54, T_2, T_2 \
+	VMALOF f3, g0, T_3, T_3  \
+	VMALOF f3, g1, T_4, T_4  \
+	VMALOF f4, g51, h0, h0   \
+	VMALOF f4, g52, h1, h1   \
+	VMALOF f4, g53, h2, h2   \
+	VMALOF f4, g54, h3, h3   \
+	VMALOF f4, g0, h4, h4    \
+	VAG    T_0, h0, h0       \
+	VAG    T_1, h1, h1       \
+	VAG    T_2, h2, h2       \
+	VAG    T_3, h3, h3       \
+	VAG    T_4, h4, h4
+
+// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
+#define REDUCE(h0, h1, h2, h3, h4) \
+	VESRLG $26, h0, T_0  \
+	VESRLG $26, h3, T_1  \
+	VN     MOD26, h0, h0 \
+	VN     MOD26, h3, h3 \
+	VAG    T_0, h1, h1   \
+	VAG    T_1, h4, h4   \
+	VESRLG $26, h1, T_2  \
+	VESRLG $26, h4, T_3  \
+	VN     MOD26, h1, h1 \
+	VN     MOD26, h4, h4 \
+	VESLG  $2, T_3, T_4  \
+	VAG    T_3, T_4, T_4 \
+	VAG    T_2, h2, h2   \
+	VAG    T_4, h0, h0   \
+	VESRLG $26, h2, T_0  \
+	VESRLG $26, h0, T_1  \
+	VN     MOD26, h2, h2 \
+	VN     MOD26, h0, h0 \
+	VAG    T_0, h3, h3   \
+	VAG    T_1, h1, h1   \
+	VESRLG $26, h3, T_2  \
+	VN     MOD26, h3, h3 \
+	VAG    T_2, h4, h4
+
+// expand in0 into d[0] and in1 into d[1]
+#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
+	VGBM   $0x0707, d1       \ // d1=tmp
+	VPERM  in0, in1, EX2, d4 \
+	VPERM  in0, in1, EX0, d0 \
+	VPERM  in0, in1, EX1, d2 \
+	VN     d1, d4, d4        \
+	VESRLG $26, d0, d1       \
+	VESRLG $30, d2, d3       \
+	VESRLG $4, d2, d2        \
+	VN     MOD26, d0, d0     \
+	VN     MOD26, d1, d1     \
+	VN     MOD26, d2, d2     \
+	VN     MOD26, d3, d3
+
+// pack h4:h0 into h1:h0 (no carry)
+#define PACK(h0, h1, h2, h3, h4) \
+	VESLG $26, h1, h1  \
+	VESLG $26, h3, h3  \
+	VO    h0, h1, h0   \
+	VO    h2, h3, h2   \
+	VESLG $4, h2, h2   \
+	VLEIB $7, $48, h1  \
+	VSLB  h1, h2, h2   \
+	VO    h0, h2, h0   \
+	VLEIB $7, $104, h1 \
+	VSLB  h1, h4, h3   \
+	VO    h3, h0, h0   \
+	VLEIB $7, $24, h1  \
+	VSRLB h1, h4, h1
+
+// if h > 2**130-5 then h -= 2**130-5
+#define MOD(h0, h1, t0, t1, t2) \
+	VZERO t0          \
+	VLEIG $1, $5, t0  \
+	VACCQ h0, t0, t1  \
+	VAQ   h0, t0, t0  \
+	VONE  t2          \
+	VLEIG $1, $-4, t2 \
+	VAQ   t2, t1, t1  \
+	VACCQ h1, t1, t1  \
+	VONE  t2          \
+	VAQ   t2, t1, t1  \
+	VN    h0, t1, t2  \
+	VNC   t0, t1, t1  \
+	VO    t1, t2, h0
+
+// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]key)
+TEXT ·poly1305vx(SB), $0-32
+	// This code processes up to 2 blocks (32 bytes) per iteration
+	// using the algorithm described in:
+	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
+	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
+	LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
+
+	// load MOD26, EX0, EX1 and EX2
+	MOVD $·constants<>(SB), R5
+	VLM  (R5), MOD26, EX2
+
+	// setup r
+	VL   (R4), T_0
+	MOVD $·keyMask<>(SB), R6
+	VL   (R6), T_1
+	VN   T_0, T_1, T_0
+	EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
+
+	// setup r*5
+	VLEIG $0, $5, T_0
+	VLEIG $1, $5, T_0
+
+	// store r (for final block)
+	VMLOF T_0, R_1, R5SAVE_1
+	VMLOF T_0, R_2, R5SAVE_2
+	VMLOF T_0, R_3, R5SAVE_3
+	VMLOF T_0, R_4, R5SAVE_4
+	VLGVG $0, R_0, RSAVE_0
+	VLGVG $0, R_1, RSAVE_1
+	VLGVG $0, R_2, RSAVE_2
+	VLGVG $0, R_3, RSAVE_3
+	VLGVG $0, R_4, RSAVE_4
+
+	// skip r**2 calculation
+	CMPBLE R3, $16, skip
+
+	// calculate r**2
+	MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
+	REDUCE(H_0, H_1, H_2, H_3, H_4)
+	VLEIG $0, $5, T_0
+	VLEIG $1, $5, T_0
+	VMLOF T_0, H_1, R5_1
+	VMLOF T_0, H_2, R5_2
+	VMLOF T_0, H_3, R5_3
+	VMLOF T_0, H_4, R5_4
+	VLR   H_0, R_0
+	VLR   H_1, R_1
+	VLR   H_2, R_2
+	VLR   H_3, R_3
+	VLR   H_4, R_4
+
+	// initialize h
+	VZERO H_0
+	VZERO H_1
+	VZERO H_2
+	VZERO H_3
+	VZERO H_4
+
+loop:
+	CMPBLE R3, $32, b2
+	VLM    (R2), T_0, T_1
+	SUB    $32, R3
+	MOVD   $32(R2), R2
+	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
+	VLEIB  $4, $1, F_4
+	VLEIB  $12, $1, F_4
+
+multiply:
+	VAG    H_0, F_0, F_0
+	VAG    H_1, F_1, F_1
+	VAG    H_2, F_2, F_2
+	VAG    H_3, F_3, F_3
+	VAG    H_4, F_4, F_4
+	MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
+	REDUCE(H_0, H_1, H_2, H_3, H_4)
+	CMPBNE R3, $0, loop
+
+finish:
+	// sum vectors
+	VZERO  T_0
+	VSUMQG H_0, T_0, H_0
+	VSUMQG H_1, T_0, H_1
+	VSUMQG H_2, T_0, H_2
+	VSUMQG H_3, T_0, H_3
+	VSUMQG H_4, T_0, H_4
+
+	// h may be >= 2*(2**130-5) so we need to reduce it again
+	REDUCE(H_0, H_1, H_2, H_3, H_4)
+
+	// carry h1->h4
+	VESRLG $26, H_1, T_1
+	VN     MOD26, H_1, H_1
+	VAQ    T_1, H_2, H_2
+	VESRLG $26, H_2, T_2
+	VN     MOD26, H_2, H_2
+	VAQ    T_2, H_3, H_3
+	VESRLG $26, H_3, T_3
+	VN     MOD26, H_3, H_3
+	VAQ    T_3, H_4, H_4
+
+	// h is now < 2*(2**130-5)
+	// pack h into h1 (hi) and h0 (lo)
+	PACK(H_0, H_1, H_2, H_3, H_4)
+
+	// if h > 2**130-5 then h -= 2**130-5
+	MOD(H_0, H_1, T_0, T_1, T_2)
+
+	// h += s
+	MOVD  $·bswapMask<>(SB), R5
+	VL    (R5), T_1
+	VL    16(R4), T_0
+	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
+	VAQ   T_0, H_0, H_0
+	VPERM H_0, H_0, T_1, H_0    // reverse bytes (to little)
+	VST   H_0, (R1)
+
+	RET
+
+b2:
+	CMPBLE R3, $16, b1
+
+	// 2 blocks remaining
+	SUB    $17, R3
+	VL     (R2), T_0
+	VLL    R3, 16(R2), T_1
+	ADD    $1, R3
+	MOVBZ  $1, R0
+	CMPBEQ R3, $16, 2(PC)
+	VLVGB  R3, R0, T_1
+	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
+	CMPBNE R3, $16, 2(PC)
+	VLEIB  $12, $1, F_4
+	VLEIB  $4, $1, F_4
+
+	// setup [r²,r]
+	VLVGG $1, RSAVE_0, R_0
+	VLVGG $1, RSAVE_1, R_1
+	VLVGG $1, RSAVE_2, R_2
+	VLVGG $1, RSAVE_3, R_3
+	VLVGG $1, RSAVE_4, R_4
+	VPDI  $0, R5_1, R5SAVE_1, R5_1
+	VPDI  $0, R5_2, R5SAVE_2, R5_2
+	VPDI  $0, R5_3, R5SAVE_3, R5_3
+	VPDI  $0, R5_4, R5SAVE_4, R5_4
+
+	MOVD $0, R3
+	BR   multiply
+
+skip:
+	VZERO H_0
+	VZERO H_1
+	VZERO H_2
+	VZERO H_3
+	VZERO H_4
+
+	CMPBEQ R3, $0, finish
+
+b1:
+	// 1 block remaining
+	SUB    $1, R3
+	VLL    R3, (R2), T_0
+	ADD    $1, R3
+	MOVBZ  $1, R0
+	CMPBEQ R3, $16, 2(PC)
+	VLVGB  R3, R0, T_0
+	VZERO  T_1
+	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
+	CMPBNE R3, $16, 2(PC)
+	VLEIB  $4, $1, F_4
+	VLEIG  $1, $1, R_0
+	VZERO  R_1
+	VZERO  R_2
+	VZERO  R_3
+	VZERO  R_4
+	VZERO  R5_1
+	VZERO  R5_2
+	VZERO  R5_3
+	VZERO  R5_4
+
+	// setup [r, 1]
+	VLVGG $0, RSAVE_0, R_0
+	VLVGG $0, RSAVE_1, R_1
+	VLVGG $0, RSAVE_2, R_2
+	VLVGG $0, RSAVE_3, R_3
+	VLVGG $0, RSAVE_4, R_4
+	VPDI  $0, R5SAVE_1, R5_1, R5_1
+	VPDI  $0, R5SAVE_2, R5_2, R5_2
+	VPDI  $0, R5SAVE_3, R5_3, R5_3
+	VPDI  $0, R5SAVE_4, R5_4, R5_4
+
+	MOVD $0, R3
+	BR   multiply
+
+TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
+	MOVD  $x-24(SP), R1
+	XC    $24, 0(R1), 0(R1) // clear the storage
+	MOVD  $2, R0            // R0 is the number of double words stored -1
+	WORD  $0xB2B01000       // STFLE 0(R1)
+	XOR   R0, R0            // reset the value of R0
+	MOVBZ z-8(SP), R1
+	AND   $0x40, R1
+	BEQ   novector
+
+vectorinstalled:
+	// check if the vector instruction has been enabled
+	VLEIB  $0, $0xF, V16
+	VLGVB  $0, V16, R1
+	CMPBNE R1, $0xF, novector
+	MOVB   $1, ret+0(FP)      // have vx
+	RET
+
+novector:
+	MOVB $0, ret+0(FP) // no vx
+	RET
--- a/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
@@ -0,0 +1,931 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x,go1.11,!gccgo,!appengine
+
+#include "textflag.h"
+
+// Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.
+
+// constants
+#define EX0   V1
+#define EX1   V2
+#define EX2   V3
+
+// temporaries
+#define T_0 V4
+#define T_1 V5
+#define T_2 V6
+#define T_3 V7
+#define T_4 V8
+#define T_5 V9
+#define T_6 V10
+#define T_7 V11
+#define T_8 V12
+#define T_9 V13
+#define T_10 V14
+
+// r**2 & r**4
+#define R_0  V15
+#define R_1  V16
+#define R_2  V17
+#define R5_1 V18
+#define R5_2 V19
+// key (r)
+#define RSAVE_0 R7
+#define RSAVE_1 R8
+#define RSAVE_2 R9
+#define R5SAVE_1 R10
+#define R5SAVE_2 R11
+
+// message block
+#define M0 V20
+#define M1 V21
+#define M2 V22
+#define M3 V23
+#define M4 V24
+#define M5 V25
+
+// accumulator
+#define H0_0 V26
+#define H1_0 V27
+#define H2_0 V28
+#define H0_1 V29
+#define H1_1 V30
+#define H2_1 V31
+
+GLOBL ·keyMask<>(SB), RODATA, $16
+DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
+DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
+
+GLOBL ·bswapMask<>(SB), RODATA, $16
+DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
+DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
+
+GLOBL ·constants<>(SB), RODATA, $48
+// EX0
+DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
+DATA ·constants<>+8(SB)/8, $0x0000050403020100
+// EX1
+DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
+DATA ·constants<>+24(SB)/8, $0x00000a0908070605
+// EX2
+DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
+DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b
+
+GLOBL ·c<>(SB), RODATA, $48
+// EX0
+DATA ·c<>+0(SB)/8, $0x0000050403020100
+DATA ·c<>+8(SB)/8, $0x0000151413121110
+// EX1
+DATA ·c<>+16(SB)/8, $0x00000a0908070605
+DATA ·c<>+24(SB)/8, $0x00001a1918171615
+// EX2
+DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
+DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b
+
+GLOBL ·reduce<>(SB), RODATA, $32
+// 44 bit
+DATA ·reduce<>+0(SB)/8, $0x0
+DATA ·reduce<>+8(SB)/8, $0xfffffffffff
+// 42 bit
+DATA ·reduce<>+16(SB)/8, $0x0
+DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
+
+// h = (f*g) % (2**130-5) [partial reduction]
+// uses T_0...T_9 temporary registers
+// input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
+// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
+// output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
+#define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
+	\ // Eliminate the dependency for the last 2 VMSLs
+	VMSLG m02_0, r_2, m4_2, m4_2                       \
+	VMSLG m13_0, r_2, m5_2, m5_2                       \ // 8 VMSLs pipelined
+	VMSLG m02_0, r_0, m4_0, m4_0                       \
+	VMSLG m02_1, r5_2, V0, T_0                         \
+	VMSLG m02_0, r_1, m4_1, m4_1                       \
+	VMSLG m02_1, r_0, V0, T_1                          \
+	VMSLG m02_1, r_1, V0, T_2                          \
+	VMSLG m02_2, r5_1, V0, T_3                         \
+	VMSLG m02_2, r5_2, V0, T_4                         \
+	VMSLG m13_0, r_0, m5_0, m5_0                       \
+	VMSLG m13_1, r5_2, V0, T_5                         \
+	VMSLG m13_0, r_1, m5_1, m5_1                       \
+	VMSLG m13_1, r_0, V0, T_6                          \
+	VMSLG m13_1, r_1, V0, T_7                          \
+	VMSLG m13_2, r5_1, V0, T_8                         \
+	VMSLG m13_2, r5_2, V0, T_9                         \
+	VMSLG m02_2, r_0, m4_2, m4_2                       \
+	VMSLG m13_2, r_0, m5_2, m5_2                       \
+	VAQ   m4_0, T_0, m02_0                             \
+	VAQ   m4_1, T_1, m02_1                             \
+	VAQ   m5_0, T_5, m13_0                             \
+	VAQ   m5_1, T_6, m13_1                             \
+	VAQ   m02_0, T_3, m02_0                            \
+	VAQ   m02_1, T_4, m02_1                            \
+	VAQ   m13_0, T_8, m13_0                            \
+	VAQ   m13_1, T_9, m13_1                            \
+	VAQ   m4_2, T_2, m02_2                             \
+	VAQ   m5_2, T_7, m13_2                             \
+
+// SQUARE uses three limbs of r and r_2*5 to output square of r
+// uses T_1, T_5 and T_7 temporary registers
+// input: r_0, r_1, r_2, r5_2
+// temp: TEMP0, TEMP1, TEMP2
+// output: p0, p1, p2
+#define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
+	VMSLG r_0, r_0, p0, p0     \
+	VMSLG r_1, r5_2, V0, TEMP0 \
+	VMSLG r_2, r5_2, p1, p1    \
+	VMSLG r_0, r_1, V0, TEMP1  \
+	VMSLG r_1, r_1, p2, p2     \
+	VMSLG r_0, r_2, V0, TEMP2  \
+	VAQ   TEMP0, p0, p0        \
+	VAQ   TEMP1, p1, p1        \
+	VAQ   TEMP2, p2, p2        \
+	VAQ   TEMP0, p0, p0        \
+	VAQ   TEMP1, p1, p1        \
+	VAQ   TEMP2, p2, p2        \
+
+// carry h0->h1->h2->h0 || h3->h4->h5->h3
+// uses T_2, T_4, T_5, T_7, T_8, T_9
+//       t6,  t7,  t8,  t9, t10, t11
+// input: h0, h1, h2, h3, h4, h5
+// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
+// output: h0, h1, h2, h3, h4, h5
+#define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
+	VLM    (R12), t6, t7  \ // 44 and 42 bit clear mask
+	VLEIB  $7, $0x28, t10 \ // 5 byte shift mask
+	VREPIB $4, t8         \ // 4 bit shift mask
+	VREPIB $2, t11        \ // 2 bit shift mask
+	VSRLB  t10, h0, t0    \ // h0 byte shift
+	VSRLB  t10, h1, t1    \ // h1 byte shift
+	VSRLB  t10, h2, t2    \ // h2 byte shift
+	VSRLB  t10, h3, t3    \ // h3 byte shift
+	VSRLB  t10, h4, t4    \ // h4 byte shift
+	VSRLB  t10, h5, t5    \ // h5 byte shift
+	VSRL   t8, t0, t0     \ // h0 bit shift
+	VSRL   t8, t1, t1     \ // h2 bit shift
+	VSRL   t11, t2, t2    \ // h2 bit shift
+	VSRL   t8, t3, t3     \ // h3 bit shift
+	VSRL   t8, t4, t4     \ // h4 bit shift
+	VESLG  $2, t2, t9     \ // h2 carry x5
+	VSRL   t11, t5, t5    \ // h5 bit shift
+	VN     t6, h0, h0     \ // h0 clear carry
+	VAQ    t2, t9, t2     \ // h2 carry x5
+	VESLG  $2, t5, t9     \ // h5 carry x5
+	VN     t6, h1, h1     \ // h1 clear carry
+	VN     t7, h2, h2     \ // h2 clear carry
+	VAQ    t5, t9, t5     \ // h5 carry x5
+	VN     t6, h3, h3     \ // h3 clear carry
+	VN     t6, h4, h4     \ // h4 clear carry
+	VN     t7, h5, h5     \ // h5 clear carry
+	VAQ    t0, h1, h1     \ // h0->h1
+	VAQ    t3, h4, h4     \ // h3->h4
+	VAQ    t1, h2, h2     \ // h1->h2
+	VAQ    t4, h5, h5     \ // h4->h5
+	VAQ    t2, h0, h0     \ // h2->h0
+	VAQ    t5, h3, h3     \ // h5->h3
+	VREPG  $1, t6, t6     \ // 44 and 42 bit masks across both halves
+	VREPG  $1, t7, t7     \
+	VSLDB  $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
+	VSLDB  $8, h1, h1, h1 \
+	VSLDB  $8, h2, h2, h2 \
+	VO     h0, h3, h3     \
+	VO     h1, h4, h4     \
+	VO     h2, h5, h5     \
+	VESRLG $44, h3, t0    \ // 44 bit shift right
+	VESRLG $44, h4, t1    \
+	VESRLG $42, h5, t2    \
+	VN     t6, h3, h3     \ // clear carry bits
+	VN     t6, h4, h4     \
+	VN     t7, h5, h5     \
+	VESLG  $2, t2, t9     \ // multiply carry by 5
+	VAQ    t9, t2, t2     \
+	VAQ    t0, h4, h4     \
+	VAQ    t1, h5, h5     \
+	VAQ    t2, h3, h3     \
+
+// carry h0->h1->h2->h0
+// input: h0, h1, h2
+// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
+// output: h0, h1, h2
+#define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
+	VLEIB  $7, $0x28, t3 \ // 5 byte shift mask
+	VREPIB $4, t4        \ // 4 bit shift mask
+	VREPIB $2, t7        \ // 2 bit shift mask
+	VGBM   $0x003F, t5   \ // mask to clear carry bits
+	VSRLB  t3, h0, t0    \
+	VSRLB  t3, h1, t1    \
+	VSRLB  t3, h2, t2    \
+	VESRLG $4, t5, t5    \ // 44 bit clear mask
+	VSRL   t4, t0, t0    \
+	VSRL   t4, t1, t1    \
+	VSRL   t7, t2, t2    \
+	VESRLG $2, t5, t6    \ // 42 bit clear mask
+	VESLG  $2, t2, t8    \
+	VAQ    t8, t2, t2    \
+	VN     t5, h0, h0    \
+	VN     t5, h1, h1    \
+	VN     t6, h2, h2    \
+	VAQ    t0, h1, h1    \
+	VAQ    t1, h2, h2    \
+	VAQ    t2, h0, h0    \
+	VSRLB  t3, h0, t0    \
+	VSRLB  t3, h1, t1    \
+	VSRLB  t3, h2, t2    \
+	VSRL   t4, t0, t0    \
+	VSRL   t4, t1, t1    \
+	VSRL   t7, t2, t2    \
+	VN     t5, h0, h0    \
+	VN     t5, h1, h1    \
+	VESLG  $2, t2, t8    \
+	VN     t6, h2, h2    \
+	VAQ    t0, h1, h1    \
+	VAQ    t8, t2, t2    \
+	VAQ    t1, h2, h2    \
+	VAQ    t2, h0, h0    \
+
+// expands two message blocks into the lower halfs of the d registers
+// moves the contents of the d registers into upper halfs
+// input: in1, in2, d0, d1, d2, d3, d4, d5
+// temp: TEMP0, TEMP1, TEMP2, TEMP3
+// output: d0, d1, d2, d3, d4, d5
+#define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
+	VGBM   $0xff3f, TEMP0      \
+	VGBM   $0xff1f, TEMP1      \
+	VESLG  $4, d1, TEMP2       \
+	VESLG  $4, d4, TEMP3       \
+	VESRLG $4, TEMP0, TEMP0    \
+	VPERM  in1, d0, EX0, d0    \
+	VPERM  in2, d3, EX0, d3    \
+	VPERM  in1, d2, EX2, d2    \
+	VPERM  in2, d5, EX2, d5    \
+	VPERM  in1, TEMP2, EX1, d1 \
+	VPERM  in2, TEMP3, EX1, d4 \
+	VN     TEMP0, d0, d0       \
+	VN     TEMP0, d3, d3       \
+	VESRLG $4, d1, d1          \
+	VESRLG $4, d4, d4          \
+	VN     TEMP1, d2, d2       \
+	VN     TEMP1, d5, d5       \
+	VN     TEMP0, d1, d1       \
+	VN     TEMP0, d4, d4       \
+
+// expands one message block into the lower halfs of the d registers
+// moves the contents of the d registers into upper halfs
+// input: in, d0, d1, d2
+// temp: TEMP0, TEMP1, TEMP2
+// output: d0, d1, d2
+#define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
+	VGBM   $0xff3f, TEMP0     \
+	VESLG  $4, d1, TEMP2      \
+	VGBM   $0xff1f, TEMP1     \
+	VPERM  in, d0, EX0, d0    \
+	VESRLG $4, TEMP0, TEMP0   \
+	VPERM  in, d2, EX2, d2    \
+	VPERM  in, TEMP2, EX1, d1 \
+	VN     TEMP0, d0, d0      \
+	VN     TEMP1, d2, d2      \
+	VESRLG $4, d1, d1         \
+	VN     TEMP0, d1, d1      \
+
+// pack h2:h0 into h1:h0 (no carry)
+// input: h0, h1, h2
+// output: h0, h1, h2
+#define PACK(h0, h1, h2) \
+	VMRLG  h1, h2, h2  \ // copy h1 to upper half h2
+	VESLG  $44, h1, h1 \ // shift limb 1 44 bits, leaving 20
+	VO     h0, h1, h0  \ // combine h0 with 20 bits from limb 1
+	VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
+	VLEIG  $1, $0, h1  \ // clear h2 stuff from lower half of h1
+	VO     h0, h1, h0  \ // h0 now has 88 bits (limb 0 and 1)
+	VLEIG  $0, $0, h2  \ // clear upper half of h2
+	VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
+	VLEIB  $7, $88, h1 \ // for byte shift (11 bytes)
+	VSLB   h1, h2, h2  \ // shift h2 11 bytes to the left
+	VO     h0, h2, h0  \ // combine h0 with 20 bits from limb 1
+	VLEIG  $0, $0, h1  \ // clear upper half of h1
+
+// if h > 2**130-5 then h -= 2**130-5
+// input: h0, h1
+// temp: t0, t1, t2
+// output: h0
+#define MOD(h0, h1, t0, t1, t2) \
+	VZERO t0          \
+	VLEIG $1, $5, t0  \
+	VACCQ h0, t0, t1  \
+	VAQ   h0, t0, t0  \
+	VONE  t2          \
+	VLEIG $1, $-4, t2 \
+	VAQ   t2, t1, t1  \
+	VACCQ h1, t1, t1  \
+	VONE  t2          \
+	VAQ   t2, t1, t1  \
+	VN    h0, t1, t2  \
+	VNC   t0, t1, t1  \
+	VO    t1, t2, h0  \
+
+// func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]key)
+TEXT ·poly1305vmsl(SB), $0-32
+	// This code processes 6 + up to 4 blocks (32 bytes) per iteration
+	// using the algorithm described in:
+	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
+	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
+	// And as moddified for VMSL as described in
+	// Accelerating Poly1305 Cryptographic Message Authentication on the z14
+	// O'Farrell et al, CASCON 2017, p48-55
+	// https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht
+
+	LMG   out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
+	VZERO V0                // c
+
+	// load EX0, EX1 and EX2
+	MOVD $·constants<>(SB), R5
+	VLM  (R5), EX0, EX2        // c
+
+	// setup r
+	VL    (R4), T_0
+	MOVD  $·keyMask<>(SB), R6
+	VL    (R6), T_1
+	VN    T_0, T_1, T_0
+	VZERO T_2                 // limbs for r
+	VZERO T_3
+	VZERO T_4
+	EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)
+
+	// T_2, T_3, T_4: [0, r]
+
+	// setup r*20
+	VLEIG $0, $0, T_0
+	VLEIG $1, $20, T_0       // T_0: [0, 20]
+	VZERO T_5
+	VZERO T_6
+	VMSLG T_0, T_3, T_5, T_5
+	VMSLG T_0, T_4, T_6, T_6
+
+	// store r for final block in GR
+	VLGVG $1, T_2, RSAVE_0  // c
+	VLGVG $1, T_3, RSAVE_1  // c
+	VLGVG $1, T_4, RSAVE_2  // c
+	VLGVG $1, T_5, R5SAVE_1 // c
+	VLGVG $1, T_6, R5SAVE_2 // c
+
+	// initialize h
+	VZERO H0_0
+	VZERO H1_0
+	VZERO H2_0
+	VZERO H0_1
+	VZERO H1_1
+	VZERO H2_1
+
+	// initialize pointer for reduce constants
+	MOVD $·reduce<>(SB), R12
+
+	// calculate r**2 and 20*(r**2)
+	VZERO R_0
+	VZERO R_1
+	VZERO R_2
+	SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
+	REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
+	VZERO R5_1
+	VZERO R5_2
+	VMSLG T_0, R_1, R5_1, R5_1
+	VMSLG T_0, R_2, R5_2, R5_2
+
+	// skip r**4 calculation if 3 blocks or less
+	CMPBLE R3, $48, b4
+
+	// calculate r**4 and 20*(r**4)
+	VZERO T_8
+	VZERO T_9
+	VZERO T_10
+	SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
+	REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
+	VZERO T_2
+	VZERO T_3
+	VMSLG T_0, T_9, T_2, T_2
+	VMSLG T_0, T_10, T_3, T_3
+
+	// put r**2 to the right and r**4 to the left of R_0, R_1, R_2
+	VSLDB $8, T_8, T_8, T_8
+	VSLDB $8, T_9, T_9, T_9
+	VSLDB $8, T_10, T_10, T_10
+	VSLDB $8, T_2, T_2, T_2
+	VSLDB $8, T_3, T_3, T_3
+
+	VO T_8, R_0, R_0
+	VO T_9, R_1, R_1
+	VO T_10, R_2, R_2
+	VO T_2, R5_1, R5_1
+	VO T_3, R5_2, R5_2
+
+	CMPBLE R3, $80, load // less than or equal to 5 blocks in message
+
+	// 6(or 5+1) blocks
+	SUB    $81, R3
+	VLM    (R2), M0, M4
+	VLL    R3, 80(R2), M5
+	ADD    $1, R3
+	MOVBZ  $1, R0
+	CMPBGE R3, $16, 2(PC)
+	VLVGB  R3, R0, M5
+	MOVD   $96(R2), R2
+	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
+	EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
+	VLEIB  $2, $1, H2_0
+	VLEIB  $2, $1, H2_1
+	VLEIB  $10, $1, H2_0
+	VLEIB  $10, $1, H2_1
+
+	VZERO  M0
+	VZERO  M1
+	VZERO  M2
+	VZERO  M3
+	VZERO  T_4
+	VZERO  T_10
+	EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
+	VLR    T_4, M4
+	VLEIB  $10, $1, M2
+	CMPBLT R3, $16, 2(PC)
+	VLEIB  $10, $1, T_10
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
+	VMRHG  V0, H0_1, H0_0
+	VMRHG  V0, H1_1, H1_0
+	VMRHG  V0, H2_1, H2_0
+	VMRLG  V0, H0_1, H0_1
+	VMRLG  V0, H1_1, H1_1
+	VMRLG  V0, H2_1, H2_1
+
+	SUB    $16, R3
+	CMPBLE R3, $0, square
+
+load:
+	// load EX0, EX1 and EX2
+	MOVD $·c<>(SB), R5
+	VLM  (R5), EX0, EX2
+
+loop:
+	CMPBLE R3, $64, add // b4	// last 4 or less blocks left
+
+	// next 4 full blocks
+	VLM  (R2), M2, M5
+	SUB  $64, R3
+	MOVD $64(R2), R2
+	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)
+
+	// expacc in-lined to create [m2, m3] limbs
+	VGBM   $0x3f3f, T_0     // 44 bit clear mask
+	VGBM   $0x1f1f, T_1     // 40 bit clear mask
+	VPERM  M2, M3, EX0, T_3
+	VESRLG $4, T_0, T_0     // 44 bit clear mask ready
+	VPERM  M2, M3, EX1, T_4
+	VPERM  M2, M3, EX2, T_5
+	VN     T_0, T_3, T_3
+	VESRLG $4, T_4, T_4
+	VN     T_1, T_5, T_5
+	VN     T_0, T_4, T_4
+	VMRHG  H0_1, T_3, H0_0
+	VMRHG  H1_1, T_4, H1_0
+	VMRHG  H2_1, T_5, H2_0
+	VMRLG  H0_1, T_3, H0_1
+	VMRLG  H1_1, T_4, H1_1
+	VMRLG  H2_1, T_5, H2_1
+	VLEIB  $10, $1, H2_0
+	VLEIB  $10, $1, H2_1
+	VPERM  M4, M5, EX0, T_3
+	VPERM  M4, M5, EX1, T_4
+	VPERM  M4, M5, EX2, T_5
+	VN     T_0, T_3, T_3
+	VESRLG $4, T_4, T_4
+	VN     T_1, T_5, T_5
+	VN     T_0, T_4, T_4
+	VMRHG  V0, T_3, M0
+	VMRHG  V0, T_4, M1
+	VMRHG  V0, T_5, M2
+	VMRLG  V0, T_3, M3
+	VMRLG  V0, T_4, M4
+	VMRLG  V0, T_5, M5
+	VLEIB  $10, $1, M2
+	VLEIB  $10, $1, M5
+
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	CMPBNE R3, $0, loop
+	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
+	VMRHG  V0, H0_1, H0_0
+	VMRHG  V0, H1_1, H1_0
+	VMRHG  V0, H2_1, H2_0
+	VMRLG  V0, H0_1, H0_1
+	VMRLG  V0, H1_1, H1_1
+	VMRLG  V0, H2_1, H2_1
+
+	// load EX0, EX1, EX2
+	MOVD $·constants<>(SB), R5
+	VLM  (R5), EX0, EX2
+
+	// sum vectors
+	VAQ H0_0, H0_1, H0_0
+	VAQ H1_0, H1_1, H1_0
+	VAQ H2_0, H2_1, H2_0
+
+	// h may be >= 2*(2**130-5) so we need to reduce it again
+	// M0...M4 are used as temps here
+	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
+
+next:  // carry h1->h2
+	VLEIB  $7, $0x28, T_1
+	VREPIB $4, T_2
+	VGBM   $0x003F, T_3
+	VESRLG $4, T_3
+
+	// byte shift
+	VSRLB T_1, H1_0, T_4
+
+	// bit shift
+	VSRL T_2, T_4, T_4
+
+	// clear h1 carry bits
+	VN T_3, H1_0, H1_0
+
+	// add carry
+	VAQ T_4, H2_0, H2_0
+
+	// h is now < 2*(2**130-5)
+	// pack h into h1 (hi) and h0 (lo)
+	PACK(H0_0, H1_0, H2_0)
+
+	// if h > 2**130-5 then h -= 2**130-5
+	MOD(H0_0, H1_0, T_0, T_1, T_2)
+
+	// h += s
+	MOVD  $·bswapMask<>(SB), R5
+	VL    (R5), T_1
+	VL    16(R4), T_0
+	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
+	VAQ   T_0, H0_0, H0_0
+	VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little)
+	VST   H0_0, (R1)
+	RET
+
+add:
+	// load EX0, EX1, EX2
+	MOVD $·constants<>(SB), R5
+	VLM  (R5), EX0, EX2
+
+	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
+	VMRHG  V0, H0_1, H0_0
+	VMRHG  V0, H1_1, H1_0
+	VMRHG  V0, H2_1, H2_0
+	VMRLG  V0, H0_1, H0_1
+	VMRLG  V0, H1_1, H1_1
+	VMRLG  V0, H2_1, H2_1
+	CMPBLE R3, $64, b4
+
+b4:
+	CMPBLE R3, $48, b3 // 3 blocks or less
+
+	// 4(3+1) blocks remaining
+	SUB    $49, R3
+	VLM    (R2), M0, M2
+	VLL    R3, 48(R2), M3
+	ADD    $1, R3
+	MOVBZ  $1, R0
+	CMPBEQ R3, $16, 2(PC)
+	VLVGB  R3, R0, M3
+	MOVD   $64(R2), R2
+	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
+	VLEIB  $10, $1, H2_0
+	VLEIB  $10, $1, H2_1
+	VZERO  M0
+	VZERO  M1
+	VZERO  M4
+	VZERO  M5
+	VZERO  T_4
+	VZERO  T_10
+	EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
+	VLR    T_4, M2
+	VLEIB  $10, $1, M4
+	CMPBNE R3, $16, 2(PC)
+	VLEIB  $10, $1, T_10
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
+	VMRHG  V0, H0_1, H0_0
+	VMRHG  V0, H1_1, H1_0
+	VMRHG  V0, H2_1, H2_0
+	VMRLG  V0, H0_1, H0_1
+	VMRLG  V0, H1_1, H1_1
+	VMRLG  V0, H2_1, H2_1
+	SUB    $16, R3
+	CMPBLE R3, $0, square // this condition must always hold true!
+
+b3:
+	CMPBLE R3, $32, b2
+
+	// 3 blocks remaining
+
+	// setup [r²,r]
+	VSLDB $8, R_0, R_0, R_0
+	VSLDB $8, R_1, R_1, R_1
+	VSLDB $8, R_2, R_2, R_2
+	VSLDB $8, R5_1, R5_1, R5_1
+	VSLDB $8, R5_2, R5_2, R5_2
+
+	VLVGG $1, RSAVE_0, R_0
+	VLVGG $1, RSAVE_1, R_1
+	VLVGG $1, RSAVE_2, R_2
+	VLVGG $1, R5SAVE_1, R5_1
+	VLVGG $1, R5SAVE_2, R5_2
+
+	// setup [h0, h1]
+	VSLDB $8, H0_0, H0_0, H0_0
+	VSLDB $8, H1_0, H1_0, H1_0
+	VSLDB $8, H2_0, H2_0, H2_0
+	VO    H0_1, H0_0, H0_0
+	VO    H1_1, H1_0, H1_0
+	VO    H2_1, H2_0, H2_0
+	VZERO H0_1
+	VZERO H1_1
+	VZERO H2_1
+
+	VZERO M0
+	VZERO M1
+	VZERO M2
+	VZERO M3
+	VZERO M4
+	VZERO M5
+
+	// H*[r**2, r]
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)
+
+	SUB    $33, R3
+	VLM    (R2), M0, M1
+	VLL    R3, 32(R2), M2
+	ADD    $1, R3
+	MOVBZ  $1, R0
+	CMPBEQ R3, $16, 2(PC)
+	VLVGB  R3, R0, M2
+
+	// H += m0
+	VZERO T_1
+	VZERO T_2
+	VZERO T_3
+	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
+	VLEIB $10, $1, T_3
+	VAG   H0_0, T_1, H0_0
+	VAG   H1_0, T_2, H1_0
+	VAG   H2_0, T_3, H2_0
+
+	VZERO M0
+	VZERO M3
+	VZERO M4
+	VZERO M5
+	VZERO T_10
+
+	// (H+m0)*r
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)
+
+	// H += m1
+	VZERO V0
+	VZERO T_1
+	VZERO T_2
+	VZERO T_3
+	EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
+	VLEIB $10, $1, T_3
+	VAQ   H0_0, T_1, H0_0
+	VAQ   H1_0, T_2, H1_0
+	VAQ   H2_0, T_3, H2_0
+	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
+
+	// [H, m2] * [r**2, r]
+	EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
+	CMPBNE R3, $16, 2(PC)
+	VLEIB  $10, $1, H2_0
+	VZERO  M0
+	VZERO  M1
+	VZERO  M2
+	VZERO  M3
+	VZERO  M4
+	VZERO  M5
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
+	SUB    $16, R3
+	CMPBLE R3, $0, next   // this condition must always hold true!
+
+b2:
+	CMPBLE R3, $16, b1
+
+	// 2 blocks remaining
+
+	// setup [r²,r]
+	VSLDB $8, R_0, R_0, R_0
+	VSLDB $8, R_1, R_1, R_1
+	VSLDB $8, R_2, R_2, R_2
+	VSLDB $8, R5_1, R5_1, R5_1
+	VSLDB $8, R5_2, R5_2, R5_2
+
+	VLVGG $1, RSAVE_0, R_0
+	VLVGG $1, RSAVE_1, R_1
+	VLVGG $1, RSAVE_2, R_2
+	VLVGG $1, R5SAVE_1, R5_1
+	VLVGG $1, R5SAVE_2, R5_2
+
+	// setup [h0, h1]
+	VSLDB $8, H0_0, H0_0, H0_0
+	VSLDB $8, H1_0, H1_0, H1_0
+	VSLDB $8, H2_0, H2_0, H2_0
+	VO    H0_1, H0_0, H0_0
+	VO    H1_1, H1_0, H1_0
+	VO    H2_1, H2_0, H2_0
+	VZERO H0_1
+	VZERO H1_1
+	VZERO H2_1
+
+	VZERO M0
+	VZERO M1
+	VZERO M2
+	VZERO M3
+	VZERO M4
+	VZERO M5
+
+	// H*[r**2, r]
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
+	VMRHG V0, H0_1, H0_0
+	VMRHG V0, H1_1, H1_0
+	VMRHG V0, H2_1, H2_0
+	VMRLG V0, H0_1, H0_1
+	VMRLG V0, H1_1, H1_1
+	VMRLG V0, H2_1, H2_1
+
+	// move h to the left and 0s at the right
+	VSLDB $8, H0_0, H0_0, H0_0
+	VSLDB $8, H1_0, H1_0, H1_0
+	VSLDB $8, H2_0, H2_0, H2_0
+
+	// get message blocks and append 1 to start
+	SUB    $17, R3
+	VL     (R2), M0
+	VLL    R3, 16(R2), M1
+	ADD    $1, R3
+	MOVBZ  $1, R0
+	CMPBEQ R3, $16, 2(PC)
+	VLVGB  R3, R0, M1
+	VZERO  T_6
+	VZERO  T_7
+	VZERO  T_8
+	EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
+	EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
+	VLEIB  $2, $1, T_8
+	CMPBNE R3, $16, 2(PC)
+	VLEIB  $10, $1, T_8
+
+	// add [m0, m1] to h
+	VAG H0_0, T_6, H0_0
+	VAG H1_0, T_7, H1_0
+	VAG H2_0, T_8, H2_0
+
+	VZERO M2
+	VZERO M3
+	VZERO M4
+	VZERO M5
+	VZERO T_10
+	VZERO M0
+
+	// at this point R_0 .. R5_2 look like [r**2, r]
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
+	SUB    $16, R3, R3
+	CMPBLE R3, $0, next
+
+b1:
+	CMPBLE R3, $0, next
+
+	// 1 block remaining
+
+	// setup [r²,r]
+	VSLDB $8, R_0, R_0, R_0
+	VSLDB $8, R_1, R_1, R_1
+	VSLDB $8, R_2, R_2, R_2
+	VSLDB $8, R5_1, R5_1, R5_1
+	VSLDB $8, R5_2, R5_2, R5_2
+
+	VLVGG $1, RSAVE_0, R_0
+	VLVGG $1, RSAVE_1, R_1
+	VLVGG $1, RSAVE_2, R_2
+	VLVGG $1, R5SAVE_1, R5_1
+	VLVGG $1, R5SAVE_2, R5_2
+
+	// setup [h0, h1]
+	VSLDB $8, H0_0, H0_0, H0_0
+	VSLDB $8, H1_0, H1_0, H1_0
+	VSLDB $8, H2_0, H2_0, H2_0
+	VO    H0_1, H0_0, H0_0
+	VO    H1_1, H1_0, H1_0
+	VO    H2_1, H2_0, H2_0
+	VZERO H0_1
+	VZERO H1_1
+	VZERO H2_1
+
+	VZERO M0
+	VZERO M1
+	VZERO M2
+	VZERO M3
+	VZERO M4
+	VZERO M5
+
+	// H*[r**2, r]
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
+
+	// set up [0, m0] limbs
+	SUB    $1, R3
+	VLL    R3, (R2), M0
+	ADD    $1, R3
+	MOVBZ  $1, R0
+	CMPBEQ R3, $16, 2(PC)
+	VLVGB  R3, R0, M0
+	VZERO  T_1
+	VZERO  T_2
+	VZERO  T_3
+	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)// limbs: [0, m]
+	CMPBNE R3, $16, 2(PC)
+	VLEIB  $10, $1, T_3
+
+	// h+m0
+	VAQ H0_0, T_1, H0_0
+	VAQ H1_0, T_2, H1_0
+	VAQ H2_0, T_3, H2_0
+
+	VZERO M0
+	VZERO M1
+	VZERO M2
+	VZERO M3
+	VZERO M4
+	VZERO M5
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
+
+	BR next
+
+square:
+	// setup [r²,r]
+	VSLDB $8, R_0, R_0, R_0
+	VSLDB $8, R_1, R_1, R_1
+	VSLDB $8, R_2, R_2, R_2
+	VSLDB $8, R5_1, R5_1, R5_1
+	VSLDB $8, R5_2, R5_2, R5_2
+
+	VLVGG $1, RSAVE_0, R_0
+	VLVGG $1, RSAVE_1, R_1
+	VLVGG $1, RSAVE_2, R_2
+	VLVGG $1, R5SAVE_1, R5_1
+	VLVGG $1, R5SAVE_2, R5_2
+
+	// setup [h0, h1]
+	VSLDB $8, H0_0, H0_0, H0_0
+	VSLDB $8, H1_0, H1_0, H1_0
+	VSLDB $8, H2_0, H2_0, H2_0
+	VO    H0_1, H0_0, H0_0
+	VO    H1_1, H1_0, H1_0
+	VO    H2_1, H2_0, H2_0
+	VZERO H0_1
+	VZERO H1_1
+	VZERO H2_1
+
+	VZERO M0
+	VZERO M1
+	VZERO M2
+	VZERO M3
+	VZERO M4
+	VZERO M5
+
+	// (h0*r**2) + (h1*r)
+	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
+	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
+	BR next
+
+TEXT ·hasVMSLFacility(SB), NOSPLIT, $24-1
+	MOVD  $x-24(SP), R1
+	XC    $24, 0(R1), 0(R1) // clear the storage
+	MOVD  $2, R0            // R0 is the number of double words stored -1
+	WORD  $0xB2B01000       // STFLE 0(R1)
+	XOR   R0, R0            // reset the value of R0
+	MOVBZ z-8(SP), R1
+	AND   $0x01, R1
+	BEQ   novmsl
+
+vectorinstalled:
+	// check if the vector instruction has been enabled
+	VLEIB  $0, $0xF, V16
+	VLGVB  $0, V16, R1
+	CMPBNE R1, $0xF, novmsl
+	MOVB   $1, ret+0(FP)    // have vx
+	RET
+
+novmsl:
+	MOVB $0, ret+0(FP) // no vx
+	RET
--- a/vendor/golang.org/x/crypto/ssh/terminal/util.go
+++ b/vendor/golang.org/x/crypto/ssh/terminal/util.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build darwin dragonfly freebsd linux,!appengine netbsd openbsd
+// +build aix darwin dragonfly freebsd linux,!appengine netbsd openbsd

 // Package terminal provides support functions for dealing with terminals, as
 // commonly found on UNIX systems.
@@ -25,7 +25,7 @@ type State struct {
 	termios unix.Termios
 }

-// IsTerminal returns true if the given file descriptor is a terminal.
+// IsTerminal returns whether the given file descriptor is a terminal.
 func IsTerminal(fd int) bool {
 	_, err := unix.IoctlGetTermios(fd, ioctlReadTermios)
 	return err == nil
@@ -108,9 +108,7 @@ func ReadPassword(fd int) ([]byte, error) {
 		return nil, err
 	}

-	defer func() {
-		unix.IoctlSetTermios(fd, ioctlWriteTermios, termios)
-	}()
+	defer unix.IoctlSetTermios(fd, ioctlWriteTermios, termios)

 	return readPasswordLine(passwordReader(fd))
 }
--- a/vendor/golang.org/x/crypto/ssh/terminal/util_aix.go
+++ b/vendor/golang.org/x/crypto/ssh/terminal/util_aix.go
@@ -0,0 +1,12 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build aix
+
+package terminal
+
+import "golang.org/x/sys/unix"
+
+const ioctlReadTermios = unix.TCGETS
+const ioctlWriteTermios = unix.TCSETS
--- a/vendor/golang.org/x/crypto/ssh/terminal/util_plan9.go
+++ b/vendor/golang.org/x/crypto/ssh/terminal/util_plan9.go
@@ -21,7 +21,7 @@ import (

 type State struct{}

-// IsTerminal returns true if the given file descriptor is a terminal.
+// IsTerminal returns whether the given file descriptor is a terminal.
 func IsTerminal(fd int) bool {
 	return false
 }
--- a/vendor/golang.org/x/crypto/ssh/terminal/util_solaris.go
+++ b/vendor/golang.org/x/crypto/ssh/terminal/util_solaris.go
@@ -14,10 +14,10 @@ import (

 // State contains the state of a terminal.
 type State struct {
-	state *unix.Termios
+	termios unix.Termios
 }

-// IsTerminal returns true if the given file descriptor is a terminal.
+// IsTerminal returns whether the given file descriptor is a terminal.
 func IsTerminal(fd int) bool {
 	_, err := unix.IoctlGetTermio(fd, unix.TCGETA)
 	return err == nil
@@ -75,47 +75,43 @@ func ReadPassword(fd int) ([]byte, error) {
 // restored.
 // see http://cr.illumos.org/~webrev/andy_js/1060/
 func MakeRaw(fd int) (*State, error) {
-	oldTermiosPtr, err := unix.IoctlGetTermios(fd, unix.TCGETS)
+	termios, err := unix.IoctlGetTermios(fd, unix.TCGETS)
 	if err != nil {
 		return nil, err
 	}
-	oldTermios := *oldTermiosPtr

-	newTermios := oldTermios
-	newTermios.Iflag &^= syscall.IGNBRK | syscall.BRKINT | syscall.PARMRK | syscall.ISTRIP | syscall.INLCR | syscall.IGNCR | syscall.ICRNL | syscall.IXON
-	newTermios.Oflag &^= syscall.OPOST
-	newTermios.Lflag &^= syscall.ECHO | syscall.ECHONL | syscall.ICANON | syscall.ISIG | syscall.IEXTEN
-	newTermios.Cflag &^= syscall.CSIZE | syscall.PARENB
-	newTermios.Cflag |= syscall.CS8
-	newTermios.Cc[unix.VMIN] = 1
-	newTermios.Cc[unix.VTIME] = 0
+	oldState := State{termios: *termios}

-	if err := unix.IoctlSetTermios(fd, unix.TCSETS, &newTermios); err != nil {
+	termios.Iflag &^= unix.IGNBRK | unix.BRKINT | unix.PARMRK | unix.ISTRIP | unix.INLCR | unix.IGNCR | unix.ICRNL | unix.IXON
+	termios.Oflag &^= unix.OPOST
+	termios.Lflag &^= unix.ECHO | unix.ECHONL | unix.ICANON | unix.ISIG | unix.IEXTEN
+	termios.Cflag &^= unix.CSIZE | unix.PARENB
+	termios.Cflag |= unix.CS8
+	termios.Cc[unix.VMIN] = 1
+	termios.Cc[unix.VTIME] = 0
+
+	if err := unix.IoctlSetTermios(fd, unix.TCSETS, termios); err != nil {
 		return nil, err
 	}

-	return &State{
-		state: oldTermiosPtr,
-	}, nil
+	return &oldState, nil
 }

 // Restore restores the terminal connected to the given file descriptor to a
 // previous state.
 func Restore(fd int, oldState *State) error {
-	return unix.IoctlSetTermios(fd, unix.TCSETS, oldState.state)
+	return unix.IoctlSetTermios(fd, unix.TCSETS, &oldState.termios)
 }

 // GetState returns the current state of a terminal which may be useful to
 // restore the terminal after a signal.
 func GetState(fd int) (*State, error) {
-	oldTermiosPtr, err := unix.IoctlGetTermios(fd, unix.TCGETS)
+	termios, err := unix.IoctlGetTermios(fd, unix.TCGETS)
 	if err != nil {
 		return nil, err
 	}

-	return &State{
-		state: oldTermiosPtr,
-	}, nil
+	return &State{termios: *termios}, nil
 }

 // GetSize returns the dimensions of the given terminal.
--- a/vendor/golang.org/x/crypto/ssh/terminal/util_windows.go
+++ b/vendor/golang.org/x/crypto/ssh/terminal/util_windows.go
@@ -26,7 +26,7 @@ type State struct {
 	mode uint32
 }

-// IsTerminal returns true if the given file descriptor is a terminal.
+// IsTerminal returns whether the given file descriptor is a terminal.
 func IsTerminal(fd int) bool {
 	var st uint32
 	err := windows.GetConsoleMode(windows.Handle(fd), &st)
@@ -89,9 +89,15 @@ func ReadPassword(fd int) ([]byte, error) {
 		return nil, err
 	}

-	defer func() {
-		windows.SetConsoleMode(windows.Handle(fd), old)
-	}()
+	defer windows.SetConsoleMode(windows.Handle(fd), old)

-	return readPasswordLine(os.NewFile(uintptr(fd), "stdin"))
+	var h windows.Handle
+	p, _ := windows.GetCurrentProcess()
+	if err := windows.DuplicateHandle(p, windows.Handle(fd), p, &h, 0, false, windows.DUPLICATE_SAME_ACCESS); err != nil {
+		return nil, err
+	}
+
+	f := os.NewFile(uintptr(h), "stdin")
+	defer f.Close()
+	return readPasswordLine(f)
 }
--- a/vendor/golang.org/x/sys/cpu/cpu.go
+++ b/vendor/golang.org/x/sys/cpu/cpu.go
@@ -0,0 +1,35 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package cpu implements processor feature detection for
+// various CPU architectures.
+package cpu
+
+// X86 contains the supported CPU features of the
+// current X86/AMD64 platform. If the current platform
+// is not X86/AMD64 then all feature flags are false.
+//
+// X86 is padded to avoid false sharing. Further the HasAVX
+// and HasAVX2 are only set if the OS supports XMM and YMM
+// registers in addition to the CPUID feature bit being set.
+var X86 struct {
+	_            [cacheLineSize]byte
+	HasAES       bool // AES hardware implementation (AES NI)
+	HasADX       bool // Multi-precision add-carry instruction extensions
+	HasAVX       bool // Advanced vector extension
+	HasAVX2      bool // Advanced vector extension 2
+	HasBMI1      bool // Bit manipulation instruction set 1
+	HasBMI2      bool // Bit manipulation instruction set 2
+	HasERMS      bool // Enhanced REP for MOVSB and STOSB
+	HasFMA       bool // Fused-multiply-add instructions
+	HasOSXSAVE   bool // OS supports XSAVE/XRESTOR for saving/restoring XMM registers.
+	HasPCLMULQDQ bool // PCLMULQDQ instruction - most often used for AES-GCM
+	HasPOPCNT    bool // Hamming weight instruction POPCNT.
+	HasSSE2      bool // Streaming SIMD extension 2 (always available on amd64)
+	HasSSE3      bool // Streaming SIMD extension 3
+	HasSSSE3     bool // Supplemental streaming SIMD extension 3
+	HasSSE41     bool // Streaming SIMD extension 4 and 4.1
+	HasSSE42     bool // Streaming SIMD extension 4 and 4.2
+	_            [cacheLineSize]byte
+}
--- a/vendor/golang.org/x/sys/cpu/cpu_arm.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_arm.go
@@ -0,0 +1,7 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpu
+
+const cacheLineSize = 32
--- a/vendor/golang.org/x/sys/cpu/cpu_arm64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_arm64.go
@@ -0,0 +1,7 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpu
+
+const cacheLineSize = 64
--- a/vendor/golang.org/x/sys/cpu/cpu_mips64x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_mips64x.go
@@ -0,0 +1,9 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+package cpu
+
+const cacheLineSize = 32
--- a/vendor/golang.org/x/sys/cpu/cpu_mipsx.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_mipsx.go
@@ -0,0 +1,9 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips mipsle
+
+package cpu
+
+const cacheLineSize = 32
--- a/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go
@@ -0,0 +1,9 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+package cpu
+
+const cacheLineSize = 128
--- a/vendor/golang.org/x/sys/cpu/cpu_s390x.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_s390x.go
@@ -0,0 +1,7 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpu
+
+const cacheLineSize = 256
--- a/vendor/golang.org/x/sys/cpu/cpu_x86.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_x86.go
@@ -0,0 +1,61 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32
+
+package cpu
+
+const cacheLineSize = 64
+
+// cpuid is implemented in cpu_x86.s.
+func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
+
+// xgetbv with ecx = 0 is implemented in cpu_x86.s.
+func xgetbv() (eax, edx uint32)
+
+func init() {
+	maxID, _, _, _ := cpuid(0, 0)
+
+	if maxID < 1 {
+		return
+	}
+
+	_, _, ecx1, edx1 := cpuid(1, 0)
+	X86.HasSSE2 = isSet(26, edx1)
+
+	X86.HasSSE3 = isSet(0, ecx1)
+	X86.HasPCLMULQDQ = isSet(1, ecx1)
+	X86.HasSSSE3 = isSet(9, ecx1)
+	X86.HasFMA = isSet(12, ecx1)
+	X86.HasSSE41 = isSet(19, ecx1)
+	X86.HasSSE42 = isSet(20, ecx1)
+	X86.HasPOPCNT = isSet(23, ecx1)
+	X86.HasAES = isSet(25, ecx1)
+	X86.HasOSXSAVE = isSet(27, ecx1)
+
+	osSupportsAVX := false
+	// For XGETBV, OSXSAVE bit is required and sufficient.
+	if X86.HasOSXSAVE {
+		eax, _ := xgetbv()
+		// Check if XMM and YMM registers have OS support.
+		osSupportsAVX = isSet(1, eax) && isSet(2, eax)
+	}
+
+	X86.HasAVX = isSet(28, ecx1) && osSupportsAVX
+
+	if maxID < 7 {
+		return
+	}
+
+	_, ebx7, _, _ := cpuid(7, 0)
+	X86.HasBMI1 = isSet(3, ebx7)
+	X86.HasAVX2 = isSet(5, ebx7) && osSupportsAVX
+	X86.HasBMI2 = isSet(8, ebx7)
+	X86.HasERMS = isSet(9, ebx7)
+	X86.HasADX = isSet(19, ebx7)
+}
+
+func isSet(bitpos uint, value uint32) bool {
+	return value&(1<<bitpos) != 0
+}
--- a/vendor/golang.org/x/sys/cpu/cpu_x86.s
+++ b/vendor/golang.org/x/sys/cpu/cpu_x86.s
@@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32
+
+#include "textflag.h"
+
+// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuid(SB), NOSPLIT, $0-24
+	MOVL eaxArg+0(FP), AX
+	MOVL ecxArg+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func xgetbv() (eax, edx uint32)
+TEXT ·xgetbv(SB),NOSPLIT,$0-8
+	MOVL $0, CX
+	XGETBV
+	MOVL AX, eax+0(FP)
+	MOVL DX, edx+4(FP)
+	RET